diff --git a/.gitignore b/.gitignore index e88ccfc008c..9a281d67675 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ make_config.mk *.vcxproj.filters *.sln *.cmake +.watchmanconfig CMakeCache.txt CMakeFiles/ build/ @@ -32,6 +33,8 @@ ldb manifest_dump sst_dump blob_dump +block_cache_trace_analyzer +tools/block_cache_analyzer/*.pyc column_aware_encoding_exp util/build_version.cc build_tools/VALGRIND_LOGS/ @@ -47,6 +50,8 @@ rocksdb_undump db_test2 trace_analyzer trace_analyzer_test +block_cache_trace_analyzer +.DS_Store java/out java/target @@ -74,3 +79,4 @@ tp2/ fbcode/ fbcode buckifier/*.pyc +buckifier/__pycache__ diff --git a/.travis.yml b/.travis.yml index e759a642a0c..7af91483bca 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,4 @@ -sudo: false -dist: trusty +dist: xenial language: cpp os: - linux @@ -9,7 +8,7 @@ compiler: - gcc osx_image: xcode8.3 jdk: - - oraclejdk7 + - openjdk7 cache: - ccache - apt @@ -71,7 +70,10 @@ install: CC=gcc-8 && CXX=g++-8; fi - if [[ "${JOB_NAME}" == cmake* ]] && [ "${TRAVIS_OS_NAME}" == linux ]; then - mkdir cmake-dist && curl -sfSL https://cmake.org/files/v3.8/cmake-3.8.1-Linux-x86_64.tar.gz | tar --strip-components=1 -C cmake-dist -xz && export PATH=$PWD/cmake-dist/bin:$PATH; + mkdir cmake-dist && curl -sfSL https://github.com/Kitware/CMake/releases/download/v3.14.5/cmake-3.14.5-Linux-x86_64.tar.gz | tar --strip-components=1 -C cmake-dist -xz && export PATH=$PWD/cmake-dist/bin:$PATH; + fi + - if [[ "${JOB_NAME}" == java_test ]]; then + java -version && echo "JAVA_HOME=${JAVA_HOME}"; fi before_script: @@ -87,13 +89,13 @@ script: OPT=-DTRAVIS V=1 ROCKSDBTESTS_END=db_block_cache_test make -j4 all_but_some_tests check_some ;; 1) - OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=db_block_cache_test ROCKSDBTESTS_END=full_filter_block_test make -j4 check_some + OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=db_block_cache_test ROCKSDBTESTS_END=db_iter_test make -j4 check_some ;; 2) - OPT=-DTRAVIS V=1 make -j4 tools && OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=full_filter_block_test ROCKSDBTESTS_END=write_batch_with_index_test make -j4 check_some + OPT=-DTRAVIS V=1 make -j4 tools && OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=db_iter_test ROCKSDBTESTS_END=options_file_test make -j4 check_some ;; 3) - OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=write_batch_with_index_test ROCKSDBTESTS_END=write_prepared_transaction_test make -j4 check_some + OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=options_file_test ROCKSDBTESTS_END=write_prepared_transaction_test make -j4 check_some ;; 4) OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=write_prepared_transaction_test make -j4 check_some @@ -101,7 +103,7 @@ script: esac - case $JOB_NAME in java_test) - OPT=-DTRAVIS V=1 make clean jclean && make rocksdbjava jtest + OPT=-DTRAVIS V=1 make rocksdbjava jtest ;; lite_build) OPT='-DTRAVIS -DROCKSDB_LITE' V=1 make -j4 static_lib tools @@ -110,6 +112,7 @@ script: OPT=-DTRAVIS V=1 make -j4 static_lib && cd examples && make -j4 ;; cmake-mingw) + sudo update-alternatives --set x86_64-w64-mingw32-g++ /usr/bin/x86_64-w64-mingw32-g++-posix; mkdir build && cd build && cmake -DJNI=1 .. 
-DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb rocksdbjni ;; cmake*) @@ -119,5 +122,3 @@ script: notifications: email: - leveldb@fb.com - webhooks: - - https://buildtimetrend.herokuapp.com/travis diff --git a/.watchmanconfig b/.watchmanconfig new file mode 100644 index 00000000000..e5b450d7bbb --- /dev/null +++ b/.watchmanconfig @@ -0,0 +1,6 @@ +{ + "content_hash_warming": true, + "content_hash_max_items": 333333, + "hint_num_files_per_dir": 8, + "fsevents_latency": 0.05 +} diff --git a/CMakeLists.txt b/CMakeLists.txt index fb8067d2245..4da0b3628ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,17 +32,24 @@ # 3. cmake .. # 4. make -j -cmake_minimum_required(VERSION 2.8.12) -project(rocksdb) -enable_language(CXX) -enable_language(C) -enable_language(ASM) +cmake_minimum_required(VERSION 3.5.1) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") +include(ReadVersion) +get_rocksdb_version(rocksdb_VERSION) +project(rocksdb + VERSION ${rocksdb_VERSION} + LANGUAGES CXX C ASM) if(POLICY CMP0042) cmake_policy(SET CMP0042 NEW) endif() -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") +find_program(CCACHE_FOUND ccache) +if(CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) +endif(CCACHE_FOUND) option(WITH_JEMALLOC "build with JeMalloc" OFF) option(WITH_SNAPPY "build with SNAPPY" OFF) @@ -53,6 +60,13 @@ option(WITH_WINDOWS_UTF8_FILENAMES "use UTF8 as characterset for opening files, if (WITH_WINDOWS_UTF8_FILENAMES) add_definitions(-DROCKSDB_WINDOWS_UTF8_FILENAMES) endif() +# third-party/folly is only validated to work on Linux and Windows for now. +# So only turn it on there by default. +if(CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Windows") + option(WITH_FOLLY_DISTRIBUTED_MUTEX "build with folly::DistributedMutex" ON) +else() + option(WITH_FOLLY_DISTRIBUTED_MUTEX "build with folly::DistributedMutex" OFF) +endif() if(MSVC) # Defaults currently different for GFLAGS. 
# We will address find_package work a little later @@ -68,8 +82,7 @@ else() if(WITH_JEMALLOC) find_package(JeMalloc REQUIRED) add_definitions(-DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE) - include_directories(${JEMALLOC_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${JEMALLOC_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS JeMalloc::JeMalloc) endif() endif() @@ -87,47 +100,43 @@ else() if(WITH_SNAPPY) find_package(snappy REQUIRED) add_definitions(-DSNAPPY) - include_directories(${SNAPPY_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${SNAPPY_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS snappy::snappy) endif() if(WITH_ZLIB) find_package(ZLIB REQUIRED) add_definitions(-DZLIB) - if(ZLIB_INCLUDE_DIRS) - # CMake 3 - include_directories(${ZLIB_INCLUDE_DIRS}) - else() - # CMake 2 - include_directories(${ZLIB_INCLUDE_DIR}) - endif() - list(APPEND THIRDPARTY_LIBS ${ZLIB_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS ZLIB::ZLIB) endif() option(WITH_BZ2 "build with bzip2" OFF) if(WITH_BZ2) - find_package(bzip2 REQUIRED) + find_package(BZip2 REQUIRED) add_definitions(-DBZIP2) - include_directories(${BZIP2_INCLUDE_DIR}) + if(BZIP2_INCLUDE_DIRS) + include_directories(${BZIP2_INCLUDE_DIRS}) + else() + include_directories(${BZIP2_INCLUDE_DIR}) + endif() list(APPEND THIRDPARTY_LIBS ${BZIP2_LIBRARIES}) endif() if(WITH_LZ4) find_package(lz4 REQUIRED) add_definitions(-DLZ4) - include_directories(${LZ4_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${LZ4_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS lz4::lz4) endif() if(WITH_ZSTD) find_package(zstd REQUIRED) add_definitions(-DZSTD) include_directories(${ZSTD_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${ZSTD_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS zstd::zstd) endif() endif() -string(TIMESTAMP GIT_DATE_TIME "%Y/%m/%d %H:%M:%S" UTC) +string(TIMESTAMP TS "%Y/%m/%d %H:%M:%S" UTC) +set(GIT_DATE_TIME "${TS}" CACHE STRING "the time we first built rocksdb") find_package(Git) @@ -144,17 +153,6 @@ endif() string(REGEX REPLACE "[^0-9a-f]+" "" GIT_SHA "${GIT_SHA}") -# Read rocksdb version from version.h header file. 
-file(READ include/rocksdb/version.h version_header_file) -string(REGEX MATCH "#define ROCKSDB_MAJOR ([0-9]+)" _ ${version_header_file}) -set(ROCKSDB_VERSION_MAJOR ${CMAKE_MATCH_1}) -string(REGEX MATCH "#define ROCKSDB_MINOR ([0-9]+)" _ ${version_header_file}) -set(ROCKSDB_VERSION_MINOR ${CMAKE_MATCH_1}) -string(REGEX MATCH "#define ROCKSDB_PATCH ([0-9]+)" _ ${version_header_file}) -set(ROCKSDB_VERSION_PATCH ${CMAKE_MATCH_1}) -set(ROCKSDB_VERSION ${ROCKSDB_VERSION_MAJOR}.${ROCKSDB_VERSION_MINOR}.${ROCKSDB_VERSION_PATCH}) - - option(WITH_MD_LIBRARY "build with MD" ON) if(WIN32 AND MSVC) if(WITH_MD_LIBRARY) @@ -177,6 +175,7 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing") if(MINGW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format") + add_definitions(-D_POSIX_C_SOURCE=1) endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") @@ -201,6 +200,15 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le") endif(HAS_ALTIVEC) endif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + CHECK_C_COMPILER_FLAG("-march=armv8-a+crc+crypto" HAS_ARMV8_CRC) + if(HAS_ARMV8_CRC) + message(STATUS " HAS_ARMV8_CRC yes") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") + endif(HAS_ARMV8_CRC) +endif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + option(PORTABLE "build a portable binary" OFF) option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF) if(PORTABLE) @@ -213,7 +221,7 @@ else() if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") else() - if(NOT HAVE_POWER8) + if(NOT HAVE_POWER8 AND NOT HAS_ARMV8_CRC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") endif() endif() @@ -300,15 +308,14 @@ if(WITH_NUMA) find_package(NUMA REQUIRED) add_definitions(-DNUMA) include_directories(${NUMA_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${NUMA_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS NUMA::NUMA) endif() option(WITH_TBB "build with Threading Building Blocks (TBB)" OFF) if(WITH_TBB) find_package(TBB REQUIRED) add_definitions(-DTBB) - include_directories(${TBB_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${TBB_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS TBB::TBB) endif() # Stall notifications eat some performance from inserts @@ -317,6 +324,10 @@ if(DISABLE_STALL_NOTIF) add_definitions(-DROCKSDB_DISABLE_STALL_NOTIFICATION) endif() +option(WITH_DYNAMIC_EXTENSION "build with dynamic extension support" OFF) +if(NOT WITH_DYNAMIC_EXTENSION) + add_definitions(-DROCKSDB_NO_DYNAMIC_EXTENSION) +endif() if(DEFINED USE_RTTI) if(USE_RTTI) @@ -458,38 +469,41 @@ endif() include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR}/include) -include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third-party/gtest-1.7.0/fused-src) +include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third-party/gtest-1.8.1/fused-src) +if(WITH_FOLLY_DISTRIBUTED_MUTEX) + include_directories(${PROJECT_SOURCE_DIR}/third-party/folly) +endif() find_package(Threads REQUIRED) -add_subdirectory(third-party/gtest-1.7.0/fused-src/gtest) - # Main library source code set(SOURCES cache/clock_cache.cc cache/lru_cache.cc cache/sharded_cache.cc + db/arena_wrapped_db_iter.cc db/builder.cc db/c.cc db/column_family.cc db/compacted_db_impl.cc - 
db/compaction.cc - db/compaction_iterator.cc - db/compaction_job.cc - db/compaction_picker.cc - db/compaction_picker_fifo.cc - db/compaction_picker_universal.cc + db/compaction/compaction.cc + db/compaction/compaction_iterator.cc + db/compaction/compaction_picker.cc + db/compaction/compaction_job.cc + db/compaction/compaction_picker_fifo.cc + db/compaction/compaction_picker_level.cc + db/compaction/compaction_picker_universal.cc db/convenience.cc db/db_filesnapshot.cc - db/db_impl.cc - db/db_impl_write.cc - db/db_impl_compaction_flush.cc - db/db_impl_files.cc - db/db_impl_open.cc - db/db_impl_debug.cc - db/db_impl_experimental.cc - db/db_impl_readonly.cc - db/db_impl_secondary.cc + db/db_impl/db_impl.cc + db/db_impl/db_impl_write.cc + db/db_impl/db_impl_compaction_flush.cc + db/db_impl/db_impl_files.cc + db/db_impl/db_impl_open.cc + db/db_impl/db_impl_debug.cc + db/db_impl/db_impl_experimental.cc + db/db_impl/db_impl_readonly.cc + db/db_impl/db_impl_secondary.cc db/db_info_dumper.cc db/db_iter.cc db/dbformat.cc @@ -501,8 +515,8 @@ set(SOURCES db/flush_job.cc db/flush_scheduler.cc db/forward_iterator.cc + db/import_column_family_job.cc db/internal_stats.cc - db/in_memory_stats_history.cc db/logs_with_prep_tracker.cc db/log_reader.cc db/log_writer.cc @@ -518,6 +532,7 @@ set(SOURCES db/table_cache.cc db/table_properties_collector.cc db/transaction_log_impl.cc + db/trim_history_scheduler.cc db/version_builder.cc db/version_edit.cc db/version_set.cc @@ -531,6 +546,22 @@ set(SOURCES env/env_encryption.cc env/env_hdfs.cc env/mock_env.cc + file/delete_scheduler.cc + file/file_prefetch_buffer.cc + file/file_util.cc + file/filename.cc + file/random_access_file_reader.cc + file/read_write_util.cc + file/readahead_raf.cc + file/sequence_file_reader.cc + file/sst_file_manager_impl.cc + file/writable_file_writer.cc + logging/auto_roll_logger.cc + logging/event_logger.cc + logging/log_buffer.cc + memory/arena.cc + memory/concurrent_arena.cc + memory/jemalloc_nodump_allocator.cc memtable/alloc_tracker.cc memtable/hash_linklist_rep.cc memtable/hash_skiplist_rep.cc @@ -539,10 +570,12 @@ set(SOURCES memtable/write_buffer_manager.cc monitoring/histogram.cc monitoring/histogram_windowing.cc + monitoring/in_memory_stats_history.cc monitoring/instrumented_mutex.cc monitoring/iostats_context.cc monitoring/perf_context.cc monitoring/perf_level.cc + monitoring/persistent_stats_history.cc monitoring/statistics.cc monitoring/thread_status_impl.cc monitoring/thread_status_updater.cc @@ -555,80 +588,73 @@ set(SOURCES options/options_parser.cc options/options_sanity_check.cc port/stack_trace.cc - table/adaptive_table_factory.cc - table/block.cc - table/block_based_filter_block.cc - table/block_based_table_builder.cc - table/block_based_table_factory.cc - table/block_based_table_reader.cc - table/block_builder.cc + table/adaptive/adaptive_table_factory.cc + table/block_based/block.cc + table/block_based/block_based_filter_block.cc + table/block_based/block_based_table_builder.cc + table/block_based/block_based_table_factory.cc + table/block_based/block_based_table_reader.cc + table/block_based/block_builder.cc + table/block_based/block_prefix_index.cc + table/block_based/data_block_hash_index.cc + table/block_based/data_block_footer.cc + table/block_based/filter_block_reader_common.cc + table/block_based/filter_policy.cc + table/block_based/flush_block_policy.cc + table/block_based/full_filter_block.cc + table/block_based/index_builder.cc + table/block_based/parsed_full_filter_block.cc + 
table/block_based/partitioned_filter_block.cc + table/block_based/uncompression_dict_reader.cc table/block_fetcher.cc - table/block_prefix_index.cc - table/bloom_block.cc - table/cuckoo_table_builder.cc - table/cuckoo_table_factory.cc - table/cuckoo_table_reader.cc - table/data_block_hash_index.cc - table/data_block_footer.cc - table/flush_block_policy.cc + table/cuckoo/cuckoo_table_builder.cc + table/cuckoo/cuckoo_table_factory.cc + table/cuckoo/cuckoo_table_reader.cc table/format.cc - table/full_filter_block.cc table/get_context.cc - table/index_builder.cc table/iterator.cc table/merging_iterator.cc table/meta_blocks.cc - table/partitioned_filter_block.cc table/persistent_cache_helper.cc - table/plain_table_builder.cc - table/plain_table_factory.cc - table/plain_table_index.cc - table/plain_table_key_coding.cc - table/plain_table_reader.cc + table/plain/plain_table_bloom.cc + table/plain/plain_table_builder.cc + table/plain/plain_table_factory.cc + table/plain/plain_table_index.cc + table/plain/plain_table_key_coding.cc + table/plain/plain_table_reader.cc table/sst_file_reader.cc table/sst_file_writer.cc table/table_properties.cc table/two_level_iterator.cc + test_util/sync_point.cc + test_util/sync_point_impl.cc + test_util/testutil.cc + test_util/transaction_test_util.cc + tools/block_cache_analyzer/block_cache_trace_analyzer.cc tools/db_bench_tool.cc tools/dump/db_dump_tool.cc tools/ldb_cmd.cc tools/ldb_tool.cc tools/sst_dump_tool.cc tools/trace_analyzer_tool.cc - util/arena.cc - util/auto_roll_logger.cc - util/bloom.cc + trace_replay/trace_replay.cc + trace_replay/block_cache_tracer.cc util/coding.cc util/compaction_job_stats_impl.cc util/comparator.cc util/compression_context_cache.cc - util/concurrent_arena.cc util/concurrent_task_limiter_impl.cc util/crc32c.cc - util/delete_scheduler.cc util/dynamic_bloom.cc - util/event_logger.cc - util/file_reader_writer.cc - util/file_util.cc - util/filename.cc - util/filter_policy.cc util/hash.cc - util/jemalloc_nodump_allocator.cc - util/log_buffer.cc util/murmurhash.cc util/random.cc util/rate_limiter.cc util/slice.cc - util/sst_file_manager_impl.cc util/status.cc util/string_util.cc - util/sync_point.cc - util/sync_point_impl.cc - util/testutil.cc util/thread_local.cc util/threadpool_imp.cc - util/trace_replay.cc - util/transaction_test_util.cc util/xxhash.cc utilities/backupable/backupable_db.cc utilities/blob_db/blob_compaction_filter.cc @@ -653,9 +679,11 @@ set(SOURCES utilities/merge_operators/bytesxor.cc utilities/merge_operators/max.cc utilities/merge_operators/put.cc + utilities/merge_operators/sortlist.cc utilities/merge_operators/string_append/stringappend.cc utilities/merge_operators/string_append/stringappend2.cc utilities/merge_operators/uint64add.cc + utilities/object_registry.cc utilities/option_change_migration/option_change_migration.cc utilities/options/options_util.cc utilities/persistent_cache/block_cache_tier.cc @@ -663,6 +691,7 @@ set(SOURCES utilities/persistent_cache/block_cache_tier_metadata.cc utilities/persistent_cache/persistent_cache_tier.cc utilities/persistent_cache/volatile_tier_impl.cc + utilities/simulator_cache/cache_simulator.cc utilities/simulator_cache/sim_cache.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc utilities/trace/file_trace_reader_writer.cc @@ -696,6 +725,11 @@ if(HAVE_POWER8) util/crc32c_ppc_asm.S) endif(HAVE_POWER8) +if(HAS_ARMV8_CRC) + list(APPEND SOURCES + util/crc32c_arm64.cc) +endif(HAS_ARMV8_CRC) + if(WIN32) list(APPEND SOURCES port/win/io_win.cc @@ -722,6 
+756,15 @@ else() env/io_posix.cc) endif() +if(WITH_FOLLY_DISTRIBUTED_MUTEX) + list(APPEND SOURCES + third-party/folly/folly/detail/Futex.cpp + third-party/folly/folly/synchronization/AtomicNotification.cpp + third-party/folly/folly/synchronization/DistributedMutex.cpp + third-party/folly/folly/synchronization/ParkingLot.cpp + third-party/folly/folly/synchronization/WaitOptions.cpp) +endif() + set(ROCKSDB_STATIC_LIB rocksdb${ARTIFACT_SUFFIX}) set(ROCKSDB_SHARED_LIB rocksdb-shared${ARTIFACT_SUFFIX}) set(ROCKSDB_IMPORT_LIB ${ROCKSDB_SHARED_LIB}) @@ -734,7 +777,7 @@ if(WITH_LIBRADOS) endif() if(WIN32) - set(SYSTEM_LIBS ${SYSTEM_LIBS} Shlwapi.lib Rpcrt4.lib) + set(SYSTEM_LIBS ${SYSTEM_LIBS} shlwapi.lib rpcrt4.lib) set(LIBS ${ROCKSDB_STATIC_LIB} ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) else() set(SYSTEM_LIBS ${CMAKE_THREAD_LIBS_INIT}) @@ -745,8 +788,8 @@ else() ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) set_target_properties(${ROCKSDB_SHARED_LIB} PROPERTIES LINKER_LANGUAGE CXX - VERSION ${ROCKSDB_VERSION} - SOVERSION ${ROCKSDB_VERSION_MAJOR} + VERSION ${rocksdb_VERSION} + SOVERSION ${rocksdb_VERSION_MAJOR} CXX_STANDARD 11 OUTPUT_NAME "rocksdb") endif() @@ -801,7 +844,7 @@ if(NOT WIN32 OR ROCKSDB_INSTALL_ON_WINDOWS) write_basic_package_version_file( RocksDBConfigVersion.cmake - VERSION ${ROCKSDB_VERSION} + VERSION ${rocksdb_VERSION} COMPATIBILITY SameMajorVersion ) @@ -843,15 +886,16 @@ endif() option(WITH_TESTS "build with tests" ON) if(WITH_TESTS) + add_subdirectory(third-party/gtest-1.8.1/fused-src/gtest) set(TESTS cache/cache_test.cc cache/lru_cache_test.cc db/column_family_test.cc db/compact_files_test.cc - db/compaction_iterator_test.cc - db/compaction_job_stats_test.cc - db/compaction_job_test.cc - db/compaction_picker_test.cc + db/compaction/compaction_job_stats_test.cc + db/compaction/compaction_job_test.cc + db/compaction/compaction_iterator_test.cc + db/compaction/compaction_picker_test.cc db/comparator_db_test.cc db/corruption_test.cc db/cuckoo_table_db_test.cc @@ -871,10 +915,11 @@ if(WITH_TESTS) db/db_log_iter_test.cc db/db_memtable_test.cc db/db_merge_operator_test.cc + db/db_merge_operand_test.cc db/db_options_test.cc db/db_properties_test.cc db/db_range_del_test.cc - db/db_secondary_test.cc + db/db_impl/db_secondary_test.cc db/db_sst_test.cc db/db_statistics_test.cc db/db_table_properties_test.cc @@ -918,37 +963,41 @@ if(WITH_TESTS) env/env_basic_test.cc env/env_test.cc env/mock_env_test.cc + file/delete_scheduler_test.cc + logging/auto_roll_logger_test.cc + logging/env_logger_test.cc + logging/event_logger_test.cc + memory/arena_test.cc memtable/inlineskiplist_test.cc memtable/skiplist_test.cc memtable/write_buffer_manager_test.cc monitoring/histogram_test.cc monitoring/iostats_context_test.cc monitoring/statistics_test.cc + monitoring/stats_history_test.cc options/options_settable_test.cc options/options_test.cc - table/block_based_filter_block_test.cc - table/block_test.cc + table/block_based/block_based_filter_block_test.cc + table/block_based/block_test.cc + table/block_based/data_block_hash_index_test.cc + table/block_based/full_filter_block_test.cc + table/block_based/partitioned_filter_block_test.cc table/cleanable_test.cc - table/cuckoo_table_builder_test.cc - table/cuckoo_table_reader_test.cc - table/data_block_hash_index_test.cc - table/full_filter_block_test.cc + table/cuckoo/cuckoo_table_builder_test.cc + table/cuckoo/cuckoo_table_reader_test.cc table/merger_test.cc table/sst_file_reader_test.cc table/table_test.cc + tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc 
tools/ldb_cmd_test.cc tools/reduce_levels_test.cc tools/sst_dump_test.cc tools/trace_analyzer_test.cc - util/arena_test.cc - util/auto_roll_logger_test.cc util/autovector_test.cc util/bloom_test.cc util/coding_test.cc util/crc32c_test.cc - util/delete_scheduler_test.cc util/dynamic_bloom_test.cc - util/event_logger_test.cc util/file_reader_writer_test.cc util/filelock_test.cc util/hash_test.cc @@ -973,6 +1022,7 @@ if(WITH_TESTS) utilities/options/options_util_test.cc utilities/persistent_cache/hash_table_test.cc utilities/persistent_cache/persistent_cache_test.cc + utilities/simulator_cache/cache_simulator_test.cc utilities/simulator_cache/sim_cache_test.cc utilities/table_properties_collectors/compact_on_deletion_collector_test.cc utilities/transactions/optimistic_transaction_test.cc @@ -986,14 +1036,19 @@ if(WITH_TESTS) list(APPEND TESTS utilities/env_librados_test.cc) endif() + if(WITH_FOLLY_DISTRIBUTED_MUTEX) + list(APPEND TESTS third-party/folly/folly/synchronization/test/DistributedMutexTest.cpp) + endif() + set(BENCHMARKS cache/cache_bench.cc memtable/memtablerep_bench.cc db/range_del_aggregator_bench.cc tools/db_bench.cc table/table_reader_bench.cc + util/filter_bench.cc utilities/persistent_cache/hash_table_bench.cc) - add_library(testharness OBJECT util/testharness.cc) + add_library(testharness OBJECT test_util/testharness.cc) foreach(sourcefile ${BENCHMARKS}) get_filename_component(exename ${sourcefile} NAME_WE) add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile} @@ -1007,7 +1062,7 @@ if(WITH_TESTS) db/db_test_util.cc monitoring/thread_status_updater_debug.cc table/mock_table.cc - util/fault_injection_test_env.cc + test_util/fault_injection_test_env.cc utilities/cassandra/test_utils.cc ) # test utilities are only build in debug @@ -1015,6 +1070,7 @@ if(WITH_TESTS) add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) set(TESTUTILLIB testutillib${ARTIFACT_SUFFIX}) add_library(${TESTUTILLIB} STATIC ${TESTUTIL_SOURCE}) + target_link_libraries(${TESTUTILLIB} ${LIBS}) if(MSVC) set_target_properties(${TESTUTILLIB} PROPERTIES COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/testutillib${ARTIFACT_SUFFIX}.pdb") endif() diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 0a45f9bd5f0..d1abc700d28 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,3 +1,77 @@ # Code of Conduct -Facebook has adopted a Code of Conduct that we expect project participants to adhere to. Please [read the full text](https://code.facebook.com/codeofconduct) so that you can understand what actions will and will not be tolerated. +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. 
+ +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at . All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq + diff --git a/HISTORY.md b/HISTORY.md index 66dd73965ec..3f1e7c9ae81 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,13 +1,184 @@ # Rocksdb Change Log -## Unreleased +## 6.6.0 (11/25/2019) +### Bug Fixes +* Fix data corruption caused by output of intra-L0 compaction on an ingested file not being placed in the correct order in L0. +* Fix a data race between Version::GetColumnFamilyMetaData() and Compaction::MarkFilesBeingCompacted() for access to being_compacted (#6056). The current fix acquires the db mutex during Version::GetColumnFamilyMetaData(), which may cause regression.
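For readers unfamiliar with the API involved in the data-race fix above, here is a minimal sketch of how `GetColumnFamilyMetaData()` is typically consumed; the already-open `db` handle and the printed fields are illustrative assumptions, not part of the change itself.

```cpp
#include <iostream>

#include "rocksdb/db.h"
#include "rocksdb/metadata.h"

// Assumes `db` is an already-opened rocksdb::DB*. This is the call whose
// synchronization with compaction bookkeeping (being_compacted) is fixed above.
void DumpColumnFamilyFiles(rocksdb::DB* db) {
  rocksdb::ColumnFamilyMetaData cf_meta;
  db->GetColumnFamilyMetaData(&cf_meta);  // default column family
  for (const auto& level : cf_meta.levels) {
    for (const auto& file : level.files) {
      std::cout << "L" << level.level << " " << file.name
                << " being_compacted=" << file.being_compacted << "\n";
    }
  }
}
```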
+* Fix a bug in DBIter where the is_blob_ state isn't updated when iterating backward using seek. +* Fix a bug when format_version=3, partitioned filters, and prefix search are used in conjunction. The bug could result in ::Seek(prefix) returning NotFound for an existing prefix. +* Revert the feature "Merging iterator to avoid child iterator reseek for some cases (#5286)" since it might cause strange results when reseek happens with a different iterator upper bound. +* Fix a bug causing a crash during ingest external file when a background compaction causes a severe error (file not found). +* Fix a bug when partitioned filters and prefix search are used in conjunction, ::SeekForPrev could return invalid for an existing prefix. ::SeekForPrev might be called by the user, or internally on ::Prev, or within ::Seek if the return value involves Delete or a Merge operand. +* Fix OnFlushCompleted fired before the flush result is persisted in MANIFEST when there's a concurrent flush job. The bug has existed since OnFlushCompleted was introduced in RocksDB 3.8. +* Fixed an sst_dump crash on some plain table SST files. +* Fixed a memory leak in some error cases of opening plain table SST files. +* Fix a bug when a crash happens while calling WriteLevel0TableForRecovery for multiple column families, leading to a column family's log number greater than the first corrupted log number when the DB is being opened in PointInTime recovery mode during the next recovery attempt (#5856). + +### New Features +* Universal compaction now supports options.periodic_compaction_seconds. A full compaction will be triggered if any file is over the threshold. +* `GetLiveFilesMetaData` and `GetColumnFamilyMetaData` now expose the file number of SST files as well as the oldest blob file referenced by each SST. +* A batched MultiGet API (DB::MultiGet()) that supports retrieving keys from multiple column families. +* Full and partitioned filters in the block-based table use an improved Bloom filter implementation, enabled with format_version 5 (or above) because previous releases cannot read this filter. This replacement is faster and more accurate, especially for high bits per key or millions of keys in a single (full) filter. For example, the new Bloom filter has the same false positive rate at 9.55 bits per key as the old one at 10 bits per key, and a lower false positive rate at 16 bits per key than the old one at 100 bits per key. +* Added AVX2 instructions to USE_SSE builds to accelerate the new Bloom filter and XXH3-based hash function on compatible x86_64 platforms (Haswell and later, ~2014). +* Support options.ttl or options.periodic_compaction_seconds with options.max_open_files = -1. A file's oldest ancestor time and file creation time will be written to the manifest. If available, this information will be used instead of creation_time and file_creation_time in table properties. +* Setting options.ttl for universal compaction now has the same meaning as setting periodic_compaction_seconds. +* SstFileMetaData also returns file creation time and oldest ancestor time. +* The `sst_dump` command line tool `recompress` command now displays how many blocks were compressed and how many were not, in particular how many were not compressed because the compression ratio was not met (12.5% threshold for GoodCompressionRatio), as seen in the `number.block.not_compressed` counter stat since version 6.0.0. +* The block cache usage now takes into account the overhead of metadata for each entry. This results in more accurate management of memory.
A side-effect of this feature is that fewer items fit into a block cache of the same size, which would result in higher cache miss rates. This can be remedied by increasing the block cache size or passing kDontChargeCacheMetadata to its constructor to restore the old behavior. +* When using BlobDB, a mapping is maintained and persisted in the MANIFEST between each SST file and the oldest non-TTL blob file it references. +* `db_bench` now supports and by default issues non-TTL Puts to BlobDB. TTL Puts can be enabled by specifying a non-zero value for the `blob_db_max_ttl_range` command line parameter explicitly. +* `sst_dump` now supports printing BlobDB blob indexes in a human-readable format. This can be enabled by specifying the `decode_blob_index` flag on the command line. +* A number of new information elements are now exposed through the EventListener interface. For flushes, the file numbers of the new SST file and the oldest blob file referenced by the SST are propagated. For compactions, the level, file number, and the oldest blob file referenced are passed to the client for each compaction input and output file. + +### Public API Change +* RocksDB release 4.1 or older will not be able to open a DB generated by the new release. 4.2 was released on Feb 23, 2016. +* TTL Compactions in Level compaction style now initiate successive cascading compactions on a key range so that it reaches the bottom level quickly on TTL expiry. The `creation_time` table property for compaction output files is now set to the minimum of the creation times of all compaction inputs. +* With FIFO compaction style, options.periodic_compaction_seconds will have the same meaning as options.ttl. Whichever is stricter will be used. With the default options.periodic_compaction_seconds value and options.ttl's default of 0, RocksDB will give a default of 30 days. +* Added an API GetCreationTimeOfOldestFile(uint64_t* creation_time) to get the file_creation_time of the oldest SST file in the DB. +* FilterPolicy now exposes an additional API to make it possible to choose filter configurations based on context, such as table level and compaction style. See `LevelAndStyleCustomFilterPolicy` in db_bloom_filter_test.cc. While most existing custom implementations of FilterPolicy should continue to work as before, those wrapping the return of NewBloomFilterPolicy will require overriding the new function `GetBuilderWithContext()`, because calling `GetFilterBitsBuilder()` on the FilterPolicy returned by NewBloomFilterPolicy is no longer supported. +* An unlikely usage of FilterPolicy is no longer supported. Calling GetFilterBitsBuilder() on the FilterPolicy returned by NewBloomFilterPolicy will now cause an assertion violation in debug builds, because RocksDB has internally migrated to a more elaborate interface that is expected to evolve further. Custom implementations of FilterPolicy should work as before, except those wrapping the return of NewBloomFilterPolicy, which will require a new override of a protected function in FilterPolicy. +* NewBloomFilterPolicy now takes bits_per_key as a double instead of an int. This permits finer control over the memory vs. accuracy trade-off in the new Bloom filter implementation and should not change source code compatibility. +* The option BackupableDBOptions::max_valid_backups_to_open is now only used when opening BackupEngineReadOnly. When opening a read/write BackupEngine, anything but the default value logs a warning and is treated as the default.
This change ensures that backup deletion has proper accounting of shared files to ensure they are deleted when no longer referenced by a backup. +* Deprecate `snap_refresh_nanos` option. +* Added DisableManualCompaction/EnableManualCompaction to stop and resume manual compaction. +* Add TryCatchUpWithPrimary() to StackableDB in non-LITE mode. +* Add a new Env::LoadEnv() overloaded function to return a shared_ptr to Env. +* Flush sets the file name to "(nil)" for OnTableFileCreationCompleted() if the flush does not produce any L0 file. This can happen if the file is empty and thus deleted by RocksDB. + +### Default Option Changes +* Changed the default value of periodic_compaction_seconds to `UINT64_MAX - 1` which allows RocksDB to auto-tune periodic compaction scheduling. When using the default value, periodic compactions are now auto-enabled if a compaction filter is used. A value of `0` will turn off the feature completely. +* Changed the default value of ttl to `UINT64_MAX - 1` which allows RocksDB to auto-tune the ttl value. When using the default value, TTL will be auto-enabled to 30 days, when the feature is supported. To revert to the old behavior, you can explicitly set it to 0. + +### Performance Improvements +* For 64-bit hashing, RocksDB is standardizing on a slightly modified preview version of XXH3. This function is now used for many non-persisted hashes, along with fastrange64() in place of the modulus operator, and some benchmarks show a slight improvement. +* Level iterator to invalidate the iterator more often in prefix seek when the level is filtered out by prefix bloom. + +## 6.5.2 (11/15/2019) +### Bug Fixes +* Fix an assertion failure in MultiGet() when BlockBasedTableOptions::no_block_cache is true and there is no compressed block cache. +* Fix a buffer overrun problem in BlockBasedTable::MultiGet() when compression is enabled and no compressed block cache is configured. +* If a call to BackupEngine::PurgeOldBackups or BackupEngine::DeleteBackup suffered a crash, power failure, or I/O error, files could be left over from old backups that could only be purged with a call to GarbageCollect. Any call to PurgeOldBackups, DeleteBackup, or GarbageCollect should now suffice to purge such files (see the sketch below). + +## 6.5.1 (10/16/2019) +### Bug Fixes +* Revert the feature "Merging iterator to avoid child iterator reseek for some cases (#5286)" since it might cause strange results when reseek happens with a different iterator upper bound. +* Fix a bug in BlockBasedTableIterator that might return incorrect results when reseek happens with a different iterator upper bound. +* Fix a bug when partitioned filters and prefix search are used in conjunction, ::SeekForPrev could return invalid for an existing prefix. ::SeekForPrev might be called by the user, or internally on ::Prev, or within ::Seek if the return value involves Delete or a Merge operand. + +## 6.5.0 (9/13/2019) +### Bug Fixes +* Fixed a number of data races in BlobDB. +* Fix a bug where the compaction snapshot refresh feature is not disabled as advertised when `snap_refresh_nanos` is set to 0. +* Fix bloom filter lookups by the MultiGet batching API when BlockBasedTableOptions::whole_key_filtering is false, by checking that a key is in the prefix_extractor domain and extracting the prefix before looking up. +* Fix a bug in file ingestion caused by incorrect file number allocation when the number of column families involved in the ingestion exceeds 2.
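As referenced from the 6.5.2 backup bullet above, a minimal BackupEngine sketch showing the calls involved; the backup directory, the already-open `db` pointer, and the assert-based error handling are illustrative assumptions only.

```cpp
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/utilities/backupable_db.h"

// Assumes `db` is an open rocksdb::DB*. After the fix above, an ordinary
// PurgeOldBackups()/DeleteBackup() also cleans up files left over from a
// previously interrupted deletion, without requiring GarbageCollect().
void BackupAndTrim(rocksdb::DB* db) {
  rocksdb::BackupEngine* backup_engine = nullptr;
  rocksdb::Status s = rocksdb::BackupEngine::Open(
      rocksdb::Env::Default(),
      rocksdb::BackupableDBOptions("/tmp/rocksdb_backup"),  // placeholder path
      &backup_engine);
  assert(s.ok());
  s = backup_engine->CreateNewBackup(db, /*flush_before_backup=*/true);
  assert(s.ok());
  s = backup_engine->PurgeOldBackups(/*num_backups_to_keep=*/3);
  assert(s.ok());
  delete backup_engine;
}
```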
+ +### New Features +* Introduced DBOptions::max_write_batch_group_size_bytes to configure a maximum limit on the number of bytes that are written in a single batch of WAL or memtable writes. It is followed when the leader write size is larger than 1/8 of this limit. +* VerifyChecksum() by default will issue readahead. Allow ReadOptions to be passed in to those functions to override the readahead size. For checksum verification before external SST file ingestion, a new option IngestExternalFileOptions.verify_checksums_readahead_size is added for this readahead setting. +* When a user enables options.force_consistency_check in RocksDB, instead of crashing the process, we now pass the error back to the user without killing the process. +* Add an option `memtable_insert_hint_per_batch` to WriteOptions. If it is true, each WriteBatch will maintain its own insert hints for each memtable in concurrent writes. See include/rocksdb/options.h for more details. + +### Public API Change +* Added the max_write_buffer_size_to_maintain option to better control the memory usage of immutable memtables. +* Added a lightweight API GetCurrentWalFile() to get the last live WAL file name and size. Meant to be used as a helper for backup/restore tooling in a larger ecosystem such as MySQL with a MyRocks storage engine. +* The MemTable Bloom filter, when enabled, now always uses cache locality. Options::bloom_locality now only affects the PlainTable SST format. + +### Performance Improvements +* Improve the speed of the MemTable Bloom filter, reducing the write overhead of enabling it by 1/3 to 1/2, with similar benefit to read performance. + +## 6.4.0 (7/30/2019) +### Default Option Change +* LRUCacheOptions.high_pri_pool_ratio is set to 0.5 (previously 0.0) by default, which means that by default midpoint insertion is enabled. The same change is made for the default value of the high_pri_pool_ratio argument in NewLRUCache(). When the block cache is not explicitly created, the small block cache created by BlockBasedTable will still have this option set to 0.0. +* Change BlockBasedTableOptions.cache_index_and_filter_blocks_with_high_priority's default value from false to true. + +### Public API Change +* Filter and compression dictionary blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached. In addition, filter and compression dictionary blocks (as well as filter partitions) no longer get evicted from the cache when a table is closed. +* Due to the above refactoring, block cache eviction statistics for filter and compression dictionary blocks are temporarily broken. We plan to reintroduce them in a later phase. +* The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. +* Errors related to the retrieval of the compression dictionary are now propagated to the user. +* db_bench adds a "benchmark" stats_history, which prints out the whole stats history. +* Overload GetAllKeyVersions() to support non-default column families. +* Added new APIs ExportColumnFamily() and CreateColumnFamilyWithImport() to support export and import of a Column Family. https://github.com/facebook/rocksdb/issues/3469 +* ldb sometimes uses a string-append merge operator if no merge operator is passed in. This is to allow users to print keys from a DB with a merge operator. +* Replaces the old Registrar with ObjectRegistry to allow users to create custom objects from strings; also adds LoadEnv() to Env (see the sketch below).
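A rough sketch of the ObjectRegistry/LoadEnv() usage mentioned in the bullet above; the URI name is an assumption (it must match whatever a custom Env was registered under), and the fallback to the default Env is illustrative.

```cpp
#include <string>

#include "rocksdb/env.h"

// Resolve an Env from a textual identifier. The identifier is looked up via the
// ObjectRegistry, so some code must have registered an Env factory under that
// name; on failure this sketch simply falls back to the default Env.
rocksdb::Env* LoadEnvOrDefault(const std::string& env_uri) {
  rocksdb::Env* env = nullptr;
  rocksdb::Status s = rocksdb::Env::LoadEnv(env_uri, &env);
  return (s.ok() && env != nullptr) ? env : rocksdb::Env::Default();
}
```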
+* Added a new overload of GetApproximateSizes which takes a SizeApproximationOptions object and returns a Status. The older overloads are redirecting their calls to this new method and no longer assert if the include_flags doesn't have either of the INCLUDE_MEMTABLES or INCLUDE_FILES bits set. It's recommended to use the new method only, as it is more type safe and returns a meaningful status in case of errors. +* LDBCommandRunner::RunCommand() to return the status code as an integer, rather than call exit() using the code. + +### New Features +* Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact. +* Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks. +* Added DBOptions::log_readahead_size which specifies the number of bytes to prefetch when reading the log. This is mostly useful for reading a remotely located log, as it can save the number of round-trips. If 0 (default), then the prefetching is disabled. +* Added a new option in SizeApproximationOptions used with DB::GetApproximateSizes. When approximating the total size of the files used to store a key range, allow approximation with an error margin of up to total_files_size * files_size_error_margin. This allows taking some shortcuts in file size approximation, resulting in better performance, while guaranteeing the resulting error is within a reasonable margin. +* Support loading custom objects in unit tests. In the affected unit tests, RocksDB will create custom Env objects based on the environment variable TEST_ENV_URI. Users need to make sure custom object types are properly registered. For example, a static library should expose a `RegisterCustomObjects` function. By linking the unit test binary with the static library, the unit test can execute this function. + +### Performance Improvements +* Reduce iterator key comparison for the upper/lower bound check. +* Improve performance of row_cache: make reads with newer snapshots than data in an SST file share the same cache key, except in some transaction cases. +* The compression dictionary is no longer copied to a new object upon retrieval. + +### Bug Fixes +* Fix ingested file and directory not being fsynced. +* Return TryAgain status in place of Corruption when the new tail is not visible to TransactionLogIterator. +* Fixed a regression where the fill_cache read option also affected index blocks. +* Fixed an issue where using cache_index_and_filter_blocks==false affected partitions of partitioned indexes/filters as well. + +## 6.3.2 (8/15/2019) +### Public API Change +* The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. + +### Bug Fixes +* Fixed a regression where the fill_cache read option also affected index blocks. +* Fixed an issue where using cache_index_and_filter_blocks==false affected partitions of partitioned indexes as well. + +## 6.3.1 (7/24/2019) +### Bug Fixes +* Fix an auto-rolling bug introduced in 6.3.0, which causes a segfault if log file creation fails. + +## 6.3.0 (6/18/2019) +### Public API Change +* Now DB::Close() will return an Aborted() error when there is an unreleased snapshot. Users can retry after all snapshots are released. +* Index blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached.
In addition, index blocks no longer get evicted from the cache when a table is closed, can now use the compressed block cache (if any), and can be shared among multiple table readers. +* Partitions of partitioned indexes no longer affect the read amplification statistics. +* Due to the above refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase. +* options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take a significant amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. +* Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. +* Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. +* Add C bindings for the secondary instance, i.e. DBImplSecondary. +* Rate limited deletion of WALs is only enabled if DBOptions::wal_dir is not set, or explicitly set to the db_name passed to DB::Open and DBOptions::db_paths is empty, or same as db_paths[0].path + +### New Features +* Add an option `snap_refresh_nanos` (default 0) to periodically refresh the snapshot list in compaction jobs. Assign 0 to disable the feature. +* Add an option `unordered_write` which trades snapshot guarantees for higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with no compromise on guarantees. +* Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. +* Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and the hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true (see the sketch below). Otherwise, ingestion reports an error. +* Add command `list_file_range_deletes` in ldb, which prints out tombstones in SST files. + +### Performance Improvements +* Reduce binary search when an iterator reseeks into the same data block. +* DBIter::Next() can skip user key checking if the previous entry's seqnum is 0. +* Merging iterator to avoid child iterator reseek for some cases +* Log Writer will flush after finishing the whole record, rather than a fragment. +* Lower MultiGet batching API latency by reading data blocks from disk in parallel + +### General Improvements +* Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress. +* Improve ColumnFamilyOptions validation when creating a new column family. + +### Bug Fixes +* Fix a bug in WAL replay of the secondary instance by skipping write batches with older sequence numbers than the current last sequence number. +* Fix flush's/compaction's merge processing logic which allowed `Put`s covered by range tombstones to reappear. Note `Put`s may exist even if the user only ever called `Merge()` due to an internal conversion during compaction to the bottommost level. +* Fix/improve memtable earliest sequence assignment and WAL replay so that WAL entries of unflushed column families will not be skipped after replaying the MANIFEST and increasing the db sequence due to another flushed/compacted column family. +* Fix a bug caused by the secondary not skipping the beginning of a new MANIFEST.
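The sketch referenced from the `failed_move_fall_back_to_copy` bullet above; the SST path, the open `db` pointer, and the surrounding helper function are illustrative assumptions, not part of the diff.

```cpp
#include <string>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Assumes `db` is an open rocksdb::DB* and `sst_path` points at a file written
// by rocksdb::SstFileWriter. With move_files=true the file is hard-linked into
// the DB directory; if hard linking fails (e.g. across filesystems), ingestion
// falls back to a copy because failed_move_fall_back_to_copy is true (the
// default since 6.3.0) rather than reporting an error.
rocksdb::Status IngestOne(rocksdb::DB* db, const std::string& sst_path) {
  rocksdb::IngestExternalFileOptions ifo;
  ifo.move_files = true;
  ifo.failed_move_fall_back_to_copy = true;
  return db->IngestExternalFile({sst_path}, ifo);
}
```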
+* On DB open, delete WAL trash files left behind in wal_dir + +## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. * Improve range scan performance by avoiding per-key upper bound check in BlockBasedTableIterator. * Introduce Periodic Compaction for Level style compaction. Files are re-compacted periodically and put in the same level. * Block-based table index now contains exact highest key in the file, rather than an upper bound. This may improve Get() and iterator Seek() performance in some situations, especially when direct IO is enabled and block cache is disabled. A setting BlockBasedTableOptions::index_shortening is introduced to control this behavior. Set it to kShortenSeparatorsAndSuccessor to get the old behavior. * When reading from option file/string/map, customized envs can be filled according to object registry. -* Add an option `snap_refresh_nanos` (default to 0.5s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. * Improve range scan performance when using explicit user readahead by not creating new table readers for every iterator. +* Add index type BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey. It significantly reduces read amplification in some setups, especially for iterator seeks. It's not fully implemented yet: IO errors are not handled right. ### Public API Change * Change the behavior of OptimizeForPointLookup(): move away from hash-based block-based-table index, and use whole key memtable filtering. @@ -19,6 +190,7 @@ * Fix a race condition between WritePrepared::Get and ::Put with duplicate keys. * Fix crash when memtable prefix bloom is enabled and read/write a key out of domain of prefix extractor. * Close a WAL file before another thread deletes it. +* Fix an assertion failure `IsFlushPending() == true` caused by one bg thread releasing the db mutex in ~ColumnFamilyData and another thread clearing `flush_requested_` flag. ## 6.1.1 (4/9/2019) ### New Features diff --git a/Makefile b/Makefile index eee0f9fba02..3b0ddffee51 100644 --- a/Makefile +++ b/Makefile @@ -82,17 +82,23 @@ ifeq ($(MAKECMDGOALS),rocksdbjavastatic) endif ifeq ($(MAKECMDGOALS),rocksdbjavastaticrelease) - DEBUG_LEVEL=0 + ifneq ($(DEBUG_LEVEL),2) + DEBUG_LEVEL=0 + endif endif ifeq ($(MAKECMDGOALS),rocksdbjavastaticreleasedocker) - DEBUG_LEVEL=0 + ifneq ($(DEBUG_LEVEL),2) + DEBUG_LEVEL=0 + endif endif ifeq ($(MAKECMDGOALS),rocksdbjavastaticpublish) DEBUG_LEVEL=0 endif +$(info $$DEBUG_LEVEL is ${DEBUG_LEVEL}) + # Lite build flag. LITE ?= 0 ifeq ($(LITE), 0) @@ -137,6 +143,12 @@ CFLAGS += -DHAVE_POWER8 HAVE_POWER8=1 endif +ifeq (,$(shell $(CXX) -fsyntax-only -march=armv8-a+crc+crypto -xc /dev/null 2>&1)) +CXXFLAGS += -march=armv8-a+crc+crypto +CFLAGS += -march=armv8-a+crc+crypto +ARMCRC_SOURCE=1 +endif + # if we're compiling for release, compile without debug code (-DNDEBUG) ifeq ($(DEBUG_LEVEL),0) OPT += -DNDEBUG @@ -244,8 +256,8 @@ endif ifdef COMPILE_WITH_TSAN DISABLE_JEMALLOC=1 EXEC_LDFLAGS += -fsanitize=thread - PLATFORM_CCFLAGS += -fsanitize=thread -fPIC - PLATFORM_CXXFLAGS += -fsanitize=thread -fPIC + PLATFORM_CCFLAGS += -fsanitize=thread -fPIC -DFOLLY_SANITIZE_THREAD + PLATFORM_CXXFLAGS += -fsanitize=thread -fPIC -DFOLLY_SANITIZE_THREAD # Turn off -pg when enabling TSAN testing, because that induces # a link failure. 
TODO: find the root cause PROFILING_FLAGS = @@ -292,9 +304,13 @@ ifndef DISABLE_JEMALLOC PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) endif +ifndef USE_FOLLY_DISTRIBUTED_MUTEX + USE_FOLLY_DISTRIBUTED_MUTEX=0 +endif + export GTEST_THROW_ON_FAILURE=1 export GTEST_HAS_EXCEPTIONS=1 -GTEST_DIR = ./third-party/gtest-1.7.0/fused-src +GTEST_DIR = ./third-party/gtest-1.8.1/fused-src # AIX: pre-defined system headers are surrounded by an extern "C" block ifeq ($(PLATFORM), OS_AIX) PLATFORM_CCFLAGS += -I$(GTEST_DIR) @@ -304,6 +320,23 @@ else PLATFORM_CXXFLAGS += -isystem $(GTEST_DIR) endif +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) + FOLLY_DIR = ./third-party/folly + # AIX: pre-defined system headers are surrounded by an extern "C" block + ifeq ($(PLATFORM), OS_AIX) + PLATFORM_CCFLAGS += -I$(FOLLY_DIR) + PLATFORM_CXXFLAGS += -I$(FOLLY_DIR) + else + PLATFORM_CCFLAGS += -isystem $(FOLLY_DIR) + PLATFORM_CXXFLAGS += -isystem $(FOLLY_DIR) + endif +endif + +ifdef TEST_CACHE_LINE_SIZE + PLATFORM_CCFLAGS += -DTEST_CACHE_LINE_SIZE=$(TEST_CACHE_LINE_SIZE) + PLATFORM_CXXFLAGS += -DTEST_CACHE_LINE_SIZE=$(TEST_CACHE_LINE_SIZE) +endif + # This (the first rule) must depend on "all". default: all @@ -390,10 +423,13 @@ endif LIBOBJECTS += $(TOOL_LIB_SOURCES:.cc=.o) MOCKOBJECTS = $(MOCK_LIB_SOURCES:.cc=.o) +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) + FOLLYOBJECTS = $(FOLLY_SOURCES:.cpp=.o) +endif GTEST = $(GTEST_DIR)/gtest/gtest-all.o -TESTUTIL = ./util/testutil.o -TESTHARNESS = ./util/testharness.o $(TESTUTIL) $(MOCKOBJECTS) $(GTEST) +TESTUTIL = ./test_util/testutil.o +TESTHARNESS = ./test_util/testharness.o $(TESTUTIL) $(MOCKOBJECTS) $(GTEST) VALGRIND_ERROR = 2 VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) @@ -403,6 +439,8 @@ BENCHTOOLOBJECTS = $(BENCH_LIB_SOURCES:.cc=.o) $(LIBOBJECTS) $(TESTUTIL) ANALYZETOOLOBJECTS = $(ANALYZER_LIB_SOURCES:.cc=.o) +STRESSTOOLOBJECTS = $(STRESS_LIB_SOURCES:.cc=.o) $(LIBOBJECTS) $(TESTUTIL) + EXPOBJECTS = $(LIBOBJECTS) $(TESTUTIL) TESTS = \ @@ -420,6 +458,7 @@ TESTS = \ inlineskiplist_test \ env_basic_test \ env_test \ + env_logger_test \ hash_test \ thread_local_test \ rate_limiter_test \ @@ -429,10 +468,10 @@ TESTS = \ db_block_cache_test \ db_test \ db_blob_index_test \ - db_bloom_filter_test \ db_iter_test \ db_iter_stress_test \ db_log_iter_test \ + db_bloom_filter_test \ db_compaction_filter_test \ db_compaction_test \ db_dynamic_level_test \ @@ -441,6 +480,7 @@ TESTS = \ db_iterator_test \ db_memtable_test \ db_merge_operator_test \ + db_merge_operand_test \ db_options_test \ db_range_del_test \ db_secondary_test \ @@ -487,6 +527,7 @@ TESTS = \ plain_table_db_test \ comparator_db_test \ external_sst_file_test \ + import_column_family_test \ prefix_test \ skiplist_test \ write_buffer_manager_test \ @@ -497,6 +538,7 @@ TESTS = \ cassandra_serialize_test \ ttl_test \ backupable_db_test \ + cache_simulator_test \ sim_cache_test \ version_edit_test \ version_set_test \ @@ -536,6 +578,7 @@ TESTS = \ ldb_cmd_test \ persistent_cache_test \ statistics_test \ + stats_history_test \ lru_cache_test \ object_registry_test \ repair_test \ @@ -549,9 +592,16 @@ TESTS = \ range_del_aggregator_test \ sst_file_reader_test \ db_secondary_test \ + block_cache_tracer_test \ + block_cache_trace_analyzer_test \ + +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) + TESTS += folly_synchronization_distributed_mutex_test +endif PARALLEL_TEST = \ backupable_db_test \ + db_bloom_filter_test \ db_compaction_filter_test \ db_compaction_test \ db_merge_operator_test \ @@ -560,7 +610,9 @@ PARALLEL_TEST = \ 
db_universal_compaction_test \ db_wal_test \ external_sst_file_test \ + import_column_family_test \ fault_injection_test \ + file_reader_writer_test \ inlineskiplist_test \ manual_compaction_test \ persistent_cache_test \ @@ -593,12 +645,13 @@ TOOLS = \ rocksdb_undump \ blob_dump \ trace_analyzer \ + block_cache_trace_analyzer \ TEST_LIBS = \ librocksdb_env_basic_test.a # TODO: add back forward_iterator_bench, after making it build in all environemnts. -BENCHMARKS = db_bench table_reader_bench cache_bench memtablerep_bench persistent_cache_bench range_del_aggregator_bench +BENCHMARKS = db_bench table_reader_bench cache_bench memtablerep_bench filter_bench persistent_cache_bench range_del_aggregator_bench # if user didn't config LIBNAME, set the default ifeq ($(LIBNAME),) @@ -611,6 +664,7 @@ endif endif LIBRARY = ${LIBNAME}.a TOOLS_LIBRARY = ${LIBNAME}_tools.a +STRESS_LIBRARY = ${LIBNAME}_stress.a ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) @@ -700,6 +754,8 @@ static_lib: $(LIBRARY) shared_lib: $(SHARED) +stress_lib: $(STRESS_LIBRARY) + tools: $(TOOLS) tools_lib: $(TOOLS_LIBRARY) @@ -904,7 +960,7 @@ blackbox_crash_test: db_stress python -u tools/db_crashtest.py blackbox $(CRASH_TEST_EXT_ARGS) blackbox_crash_test_with_atomic_flush: db_stress - python -u tools/db_crashtest.py --enable_atomic_flush blackbox $(CRASH_TEST_EXT_ARGS) + python -u tools/db_crashtest.py --cf_consistency blackbox $(CRASH_TEST_EXT_ARGS) ifeq ($(CRASH_TEST_KILL_ODD),) CRASH_TEST_KILL_ODD=888887 @@ -917,7 +973,7 @@ whitebox_crash_test: db_stress $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) whitebox_crash_test_with_atomic_flush: db_stress - python -u tools/db_crashtest.py --enable_atomic_flush whitebox --random_kill_odd \ + python -u tools/db_crashtest.py --cf_consistency whitebox --random_kill_odd \ $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) asan_check: @@ -1047,14 +1103,23 @@ unity_test: db/db_test.o db/db_test_util.o $(TESTHARNESS) $(TOOLLIBOBJECTS) unit rocksdb.h rocksdb.cc: build_tools/amalgamate.py Makefile $(LIB_SOURCES) unity.cc build_tools/amalgamate.py -I. -i./include unity.cc -x include/rocksdb/c.h -H rocksdb.h -o rocksdb.cc -clean: - rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(LIBRARY) $(SHARED) +clean: clean-ext-libraries-all clean-rocks + +clean-not-downloaded: clean-ext-libraries-bin clean-rocks + +clean-rocks: + rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(LIBRARY) $(SHARED) rm -rf $(CLEAN_FILES) ios-x86 ios-arm scan_build_report $(FIND) . -name "*.[oda]" -exec rm -f {} \; $(FIND) . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; - rm -rf bzip2* snappy* zlib* lz4* zstd* cd java; $(MAKE) clean +clean-ext-libraries-all: + rm -rf bzip2* snappy* zlib* lz4* zstd* + +clean-ext-libraries-bin: + find . -maxdepth 1 -type d \( -name bzip2\* -or -name snappy\* -or -name zlib\* -or -name lz4\* -or -name zstd\* \) -prune -exec rm -rf {} \; + tags: ctags -R . cscope -b `$(FIND) . -name '*.cc'` `$(FIND) . -name '*.h'` `$(FIND) . 
-name '*.c'` @@ -1084,6 +1149,10 @@ $(TOOLS_LIBRARY): $(BENCH_LIB_SOURCES:.cc=.o) $(TOOL_LIB_SOURCES:.cc=.o) $(LIB_S $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ +$(STRESS_LIBRARY): $(LIB_SOURCES:.cc=.o) $(TESTUTIL) $(ANALYZER_LIB_SOURCES:.cc=.o) $(STRESS_LIB_SOURCES:.cc=.o) + $(AM_V_AR)rm -f $@ + $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ + librocksdb_env_basic_test.a: env/env_basic_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ @@ -1094,6 +1163,14 @@ db_bench: tools/db_bench.o $(BENCHTOOLOBJECTS) trace_analyzer: tools/trace_analyzer.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) $(AM_LINK) +block_cache_trace_analyzer: tools/block_cache_analyzer/block_cache_trace_analyzer_tool.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) + $(AM_LINK) + +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) +folly_synchronization_distributed_mutex_test: $(LIBOBJECTS) $(TESTHARNESS) $(FOLLYOBJECTS) third-party/folly/folly/synchronization/test/DistributedMutexTest.o + $(AM_LINK) +endif + cache_bench: cache/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) @@ -1103,7 +1180,10 @@ persistent_cache_bench: utilities/persistent_cache/persistent_cache_bench.o $(LI memtablerep_bench: memtable/memtablerep_bench.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) -db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) +filter_bench: util/filter_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(AM_LINK) + +db_stress: tools/db_stress.o $(STRESSTOOLOBJECTS) $(AM_LINK) write_stress: tools/write_stress.o $(LIBOBJECTS) $(TESTUTIL) @@ -1115,7 +1195,7 @@ db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL) db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) -arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) +arena_test: memory/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1172,7 +1252,7 @@ histogram_test: monitoring/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) thread_local_test: util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) +corruption_test: db/corruption_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1229,6 +1309,9 @@ db_memtable_test: db/db_memtable_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHA db_merge_operator_test: db/db_merge_operator_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +db_merge_operand_test: db/db_merge_operand_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + db_options_test: db/db_options_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1253,6 +1336,9 @@ external_sst_file_basic_test: db/external_sst_file_basic_test.o db/db_test_util. 
external_sst_file_test: db/external_sst_file_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +import_column_family_test: db/import_column_family_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + db_tailing_iter_test: db/db_tailing_iter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1301,6 +1387,9 @@ backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TE checkpoint_test: utilities/checkpoint/checkpoint_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +cache_simulator_test: utilities/simulator_cache/cache_simulator_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + sim_cache_test: utilities/simulator_cache/sim_cache_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1327,13 +1416,13 @@ write_batch_with_index_test: utilities/write_batch_with_index/write_batch_with_i flush_job_test: db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -compaction_iterator_test: db/compaction_iterator_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_iterator_test: db/compaction/compaction_iterator_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -compaction_job_test: db/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_job_test: db/compaction/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -compaction_job_stats_test: db/compaction_job_stats_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_job_stats_test: db/compaction/compaction_job_stats_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) compact_on_deletion_collector_test: utilities/table_properties_collectors/compact_on_deletion_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1357,7 +1446,7 @@ fault_injection_test: db/fault_injection_test.o $(LIBOBJECTS) $(TESTHARNESS) rate_limiter_test: util/rate_limiter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -delete_scheduler_test: util/delete_scheduler_test.o $(LIBOBJECTS) $(TESTHARNESS) +delete_scheduler_test: file/delete_scheduler_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1366,13 +1455,13 @@ filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) file_reader_writer_test: util/file_reader_writer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -block_based_filter_block_test: table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +block_based_filter_block_test: table/block_based/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -full_filter_block_test: table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +full_filter_block_test: table/block_based/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -partitioned_filter_block_test: table/partitioned_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +partitioned_filter_block_test: table/block_based/partitioned_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1384,10 +1473,10 @@ cleanable_test: table/cleanable_test.o $(LIBOBJECTS) $(TESTHARNESS) table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) +block_test: table/block_based/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -data_block_hash_index_test: table/data_block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) +data_block_hash_index_test: table/block_based/data_block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) inlineskiplist_test: memtable/inlineskiplist_test.o $(LIBOBJECTS) 
$(TESTHARNESS) @@ -1405,7 +1494,7 @@ version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -compaction_picker_test: db/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_picker_test: db/compaction/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) version_builder_test: db/version_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1441,10 +1530,10 @@ util_merge_operators_test: utilities/util_merge_operators_test.o $(LIBOBJECTS) $ options_file_test: db/options_file_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) +deletefile_test: db/deletefile_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -obsolete_files_test: db/obsolete_files_test.o $(LIBOBJECTS) $(TESTHARNESS) +obsolete_files_test: db/obsolete_files_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) rocksdb_dump: tools/dump/rocksdb_dump.o $(LIBOBJECTS) @@ -1453,10 +1542,10 @@ rocksdb_dump: tools/dump/rocksdb_dump.o $(LIBOBJECTS) rocksdb_undump: tools/dump/rocksdb_undump.o $(LIBOBJECTS) $(AM_LINK) -cuckoo_table_builder_test: table/cuckoo_table_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) +cuckoo_table_builder_test: table/cuckoo/cuckoo_table_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -cuckoo_table_reader_test: table/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) +cuckoo_table_reader_test: table/cuckoo/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) cuckoo_table_db_test: db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1486,7 +1575,7 @@ db_bench_tool_test: tools/db_bench_tool_test.o $(BENCHTOOLOBJECTS) $(TESTHARNESS trace_analyzer_test: tools/trace_analyzer_test.o $(LIBOBJECTS) $(ANALYZETOOLOBJECTS) $(TESTHARNESS) $(AM_LINK) -event_logger_test: util/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) +event_logger_test: logging/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) timer_queue_test: util/timer_queue_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1507,7 +1596,10 @@ manual_compaction_test: db/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) +auto_roll_logger_test: logging/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +env_logger_test: logging/env_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) memtable_list_test: db/memtable_list_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1552,6 +1644,9 @@ persistent_cache_test: utilities/persistent_cache/persistent_cache_test.o db/db statistics_test: monitoring/statistics_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +stats_history_test: monitoring/stats_history_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + lru_cache_test: cache/lru_cache_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1573,7 +1668,13 @@ range_tombstone_fragmenter_test: db/range_tombstone_fragmenter_test.o db/db_test sst_file_reader_test: table/sst_file_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -db_secondary_test: db/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_secondary_test: db/db_impl/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +block_cache_tracer_test: trace_replay/block_cache_tracer_test.o trace_replay/block_cache_tracer.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + 
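A minimal usage sketch for the build knobs introduced in this Makefile change (the -j level, the cache-line value of 128, and the assumption of a Linux host with RocksDB's usual build prerequisites are illustrative only, not part of the patch):

    # Build and run the folly DistributedMutex test that is gated on the new flag.
    USE_FOLLY_DISTRIBUTED_MUTEX=1 make -j8 folly_synchronization_distributed_mutex_test
    ./folly_synchronization_distributed_mutex_test
    # Compile a test with an overridden cache line size via the new TEST_CACHE_LINE_SIZE hook.
    TEST_CACHE_LINE_SIZE=128 make -j8 db_basic_test
    # Link the newly added filter_bench benchmark target.
    make -j8 filter_bench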
+block_cache_trace_analyzer_test: tools/block_cache_analyzer/block_cache_trace_analyzer_test.o tools/block_cache_analyzer/block_cache_trace_analyzer.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) #------------------------------------------------- @@ -1621,7 +1722,7 @@ JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux ifeq ($(PLATFORM), OS_SOLARIS) ARCH := $(shell isainfo -b) else ifeq ($(PLATFORM), OS_OPENBSD) - ifneq (,$(filter $(MACHINE), amd64 arm64 sparc64)) + ifneq (,$(filter amd64 ppc64 ppc64le arm64 aarch64 sparc64, $(MACHINE))) ARCH := 64 else ARCH := 32 @@ -1630,12 +1731,23 @@ else ARCH := $(shell getconf LONG_BIT) endif -ifeq (,$(findstring ppc,$(MACHINE))) - ROCKSDBJNILIB = librocksdbjni-linux$(ARCH).so +ifeq ($(shell ldd /usr/bin/env 2>/dev/null | grep -q musl; echo $$?),0) + JNI_LIBC = musl +# GNU LibC (or glibc) is so pervasive we can assume it is the default +# else +# JNI_LIBC = glibc +endif + +ifneq ($(origin JNI_LIBC), undefined) + JNI_LIBC_POSTFIX = -$(JNI_LIBC) +endif + +ifneq (,$(filter ppc% arm64 aarch64 sparc64, $(MACHINE))) + ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE)$(JNI_LIBC_POSTFIX).so else - ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE).so + ROCKSDBJNILIB = librocksdbjni-linux$(ARCH)$(JNI_LIBC_POSTFIX).so endif -ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar +ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH)$(JNI_LIBC_POSTFIX).jar ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar @@ -1646,15 +1758,15 @@ ZLIB_SHA256 ?= c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1 ZLIB_DOWNLOAD_BASE ?= http://zlib.net BZIP2_VER ?= 1.0.6 BZIP2_SHA256 ?= a2848f34fcd5d6cf47def00461fcb528a0484d8edef8208d6d2e2909dc61d9cd -BZIP2_DOWNLOAD_BASE ?= https://web.archive.org/web/20180624184835/http://www.bzip.org +BZIP2_DOWNLOAD_BASE ?= https://downloads.sourceforge.net/project/bzip2 SNAPPY_VER ?= 1.1.7 SNAPPY_SHA256 ?= 3dfa02e873ff51a11ee02b9ca391807f0c8ea0529a4924afa645fbf97163f9d4 SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/archive -LZ4_VER ?= 1.8.3 -LZ4_SHA256 ?= 33af5936ac06536805f9745e0b6d61da606a1f8b4cc5c04dd3cbaca3b9b4fc43 +LZ4_VER ?= 1.9.2 +LZ4_SHA256 ?= 658ba6191fa44c92280d4aa2c271b0f4fbc0e34d249578dd05e50e76d0e5efcc LZ4_DOWNLOAD_BASE ?= https://github.com/lz4/lz4/archive -ZSTD_VER ?= 1.3.7 -ZSTD_SHA256 ?= 5dd1e90eb16c25425880c8a91327f63de22891ffed082fcc17e5ae84fce0d5fb +ZSTD_VER ?= 1.4.4 +ZSTD_SHA256 ?= a364f5162c7d1a455cc915e8e3cf5f4bd8b75d09bc0f53965b0c9ca1383c52c8 ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive CURL_SSL_OPTS ?= --tlsv1 @@ -1708,7 +1820,7 @@ endif libbz2.a: -rm -rf bzip2-$(BZIP2_VER) ifeq (,$(wildcard ./bzip2-$(BZIP2_VER).tar.gz)) - curl --output bzip2-$(BZIP2_VER).tar.gz -L ${BZIP2_DOWNLOAD_BASE}/$(BZIP2_VER)/bzip2-$(BZIP2_VER).tar.gz + curl --output bzip2-$(BZIP2_VER).tar.gz -L ${CURL_SSL_OPTS} ${BZIP2_DOWNLOAD_BASE}/bzip2-$(BZIP2_VER).tar.gz endif BZIP2_SHA256_ACTUAL=`$(SHA256_CMD) bzip2-$(BZIP2_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(BZIP2_SHA256)" != "$$BZIP2_SHA256_ACTUAL" ]; then \ @@ -1809,39 +1921,47 @@ rocksdbjavastatic: $(java_static_all_libobjects) cd java/src/main/java;jar -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org rocksdbjavastaticrelease: 
rocksdbjavastatic - cd java/crossbuild && vagrant destroy -f && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 + cd java/crossbuild && (vagrant destroy -f || true) && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 && vagrant up linux64-musl && vagrant halt linux64-musl cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class -rocksdbjavastaticreleasedocker: rocksdbjavastatic rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 +rocksdbjavastaticreleasedocker: rocksdbjavastatic rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 rocksdbjavastaticdockerx86musl rocksdbjavastaticdockerx86_64musl cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class rocksdbjavastaticdockerx86: mkdir -p java/target - DOCKER_LINUX_X86_CONTAINER=`docker ps -aqf name=rocksdb_linux_x86-be`; \ - if [ -z "$$DOCKER_LINUX_X86_CONTAINER" ]; then \ - docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_x86-be evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ - fi - docker start -a rocksdb_linux_x86-be + docker run --rm --name rocksdb_linux_x86-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticdockerx86_64: mkdir -p java/target - DOCKER_LINUX_X64_CONTAINER=`docker ps -aqf name=rocksdb_linux_x64-be`; \ - if [ -z "$$DOCKER_LINUX_X64_CONTAINER" ]; then \ - docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_x64-be evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ - fi - docker start -a rocksdb_linux_x64-be + docker run --rm --name rocksdb_linux_x64-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticdockerppc64le: mkdir -p java/target - DOCKER_LINUX_PPC64LE_CONTAINER=`docker ps -aqf name=rocksdb_linux_ppc64le-be`; \ - if [ -z "$$DOCKER_LINUX_PPC64LE_CONTAINER" ]; then \ - docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_ppc64le-be evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ - fi - docker start -a rocksdb_linux_ppc64le-be + docker run --rm --name rocksdb_linux_ppc64le-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + +rocksdbjavastaticdockerarm64v8: + mkdir -p 
java/target + docker run --rm --name rocksdb_linux_arm64v8-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos7_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + +rocksdbjavastaticdockerx86musl: + mkdir -p java/target + docker run --rm --name rocksdb_linux_x86-musl-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + +rocksdbjavastaticdockerx86_64musl: + mkdir -p java/target + docker run --rm --name rocksdb_linux_x64-musl-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + +rocksdbjavastaticdockerppc64lemusl: + mkdir -p java/target + docker run --rm --name rocksdb_linux_ppc64le-musl-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + +rocksdbjavastaticdockerarm64v8musl: + mkdir -p java/target + docker run --rm --name rocksdb_linux_arm64v8-musl-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticpublish: rocksdbjavastaticrelease rocksdbjavastaticpublishcentral @@ -1852,6 +1972,8 @@ rocksdbjavastaticpublishcentral: mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar -Dclassifier=sources mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64.jar -Dclassifier=linux64 mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux32.jar -Dclassifier=linux32 + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64-musl.jar -Dclassifier=linux64-musl + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux32-musl.jar -Dclassifier=linux32-musl mvn gpg:sign-and-deploy-file 
-Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -Dclassifier=osx mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-win64.jar -Dclassifier=win64 mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar @@ -1959,7 +2081,7 @@ endif # Source files dependencies detection # --------------------------------------------------------------------------- -all_sources = $(LIB_SOURCES) $(MAIN_SOURCES) $(MOCK_LIB_SOURCES) $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(TEST_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) +all_sources = $(LIB_SOURCES) $(MAIN_SOURCES) $(MOCK_LIB_SOURCES) $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(TEST_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES) DEPFILES = $(all_sources:.cc=.cc.d) # Add proper dependency support so changing a .h file forces a .cc file to diff --git a/README.md b/README.md index f1bc0c05f3c..9ef21ee57df 100644 --- a/README.md +++ b/README.md @@ -9,11 +9,11 @@ It is built on earlier work on [LevelDB](https://github.com/google/leveldb) by S and Jeff Dean (jeff@google.com) This code is a library that forms the core building block for a fast -key value server, especially suited for storing data on flash drives. +key-value server, especially suited for storing data on flash drives. It has a Log-Structured-Merge-Database (LSM) design with flexible tradeoffs between Write-Amplification-Factor (WAF), Read-Amplification-Factor (RAF) and Space-Amplification-Factor (SAF). It has multi-threaded compactions, -making it specially suitable for storing multiple terabytes of data in a +making it especially suitable for storing multiple terabytes of data in a single database. Start with example usage here: https://github.com/facebook/rocksdb/tree/master/examples diff --git a/TARGETS b/TARGETS index 073c977e5ad..ab1f24cd76c 100644 --- a/TARGETS +++ b/TARGETS @@ -1,3 +1,8 @@ +# This file @generated by `python buckifier/buckify_rocksdb.py` +# --> DO NOT EDIT MANUALLY <-- +# This file is a Facebook-specific integration for buck builds, so can +# only be validated by Facebook employees. 
+# load("@fbcode_macros//build_defs:auto_headers.bzl", "AutoHeaders") load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") load(":defs.bzl", "test_binary") @@ -6,30 +11,11 @@ REPO_PATH = package_name() + "/" ROCKSDB_COMPILER_FLAGS = [ "-fno-builtin-memcmp", - "-DROCKSDB_PLATFORM_POSIX", - "-DROCKSDB_LIB_IO_POSIX", - "-DROCKSDB_FALLOCATE_PRESENT", - "-DROCKSDB_MALLOC_USABLE_SIZE", - "-DROCKSDB_RANGESYNC_PRESENT", - "-DROCKSDB_SCHED_GETCPU_PRESENT", - "-DROCKSDB_SUPPORT_THREAD_LOCAL", - "-DOS_LINUX", - # Flags to enable libs we include - "-DSNAPPY", - "-DZLIB", - "-DBZIP2", - "-DLZ4", - "-DZSTD", - "-DZSTD_STATIC_LINKING_ONLY", - "-DGFLAGS=gflags", - "-DNUMA", - "-DTBB", # Needed to compile in fbcode "-Wno-expansion-to-defined", # Added missing flags from output of build_detect_platform - "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", - "-DROCKSDB_BACKTRACE", "-Wnarrowing", + "-DROCKSDB_NO_DYNAMIC_EXTENSION", ] ROCKSDB_EXTERNAL_DEPS = [ @@ -40,11 +26,54 @@ ROCKSDB_EXTERNAL_DEPS = [ ("lz4", None, "lz4"), ("zstd", None), ("tbb", None), - ("numa", None, "numa"), ("googletest", None, "gtest"), ] +ROCKSDB_OS_DEPS = [ + ( + "linux", + ["third-party//numa:numa"], + ), +] + +ROCKSDB_OS_PREPROCESSOR_FLAGS = [ + ( + "linux", + [ + "-DOS_LINUX", + "-DROCKSDB_FALLOCATE_PRESENT", + "-DROCKSDB_MALLOC_USABLE_SIZE", + "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", + "-DROCKSDB_RANGESYNC_PRESENT", + "-DROCKSDB_SCHED_GETCPU_PRESENT", + "-DHAVE_SSE42", + "-DNUMA", + ], + ), + ( + "macos", + ["-DOS_MACOSX"], + ), +] + ROCKSDB_PREPROCESSOR_FLAGS = [ + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DROCKSDB_SUPPORT_THREAD_LOCAL", + + # Flags to enable libs we include + "-DSNAPPY", + "-DZLIB", + "-DBZIP2", + "-DLZ4", + "-DZSTD", + "-DZSTD_STATIC_LINKING_ONLY", + "-DGFLAGS=gflags", + "-DTBB", + + # Added missing flags from output of build_detect_platform + "-DROCKSDB_BACKTRACE", + # Directories with files for #include "-I" + REPO_PATH + "include/", "-I" + REPO_PATH, @@ -52,7 +81,6 @@ ROCKSDB_PREPROCESSOR_FLAGS = [ ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { "x86_64": [ - "-DHAVE_SSE42", "-DHAVE_PCLMUL", ], } @@ -69,9 +97,15 @@ sanitizer = read_config("fbcode", "sanitizer") # Do not enable jemalloc if sanitizer presents. RocksDB will further detect # whether the binary is linked with jemalloc at runtime. 
-ROCKSDB_COMPILER_FLAGS += (["-DROCKSDB_JEMALLOC"] if sanitizer == "" else []) +ROCKSDB_OS_PREPROCESSOR_FLAGS += ([( + "linux", + ["-DROCKSDB_JEMALLOC"], +)] if sanitizer == "" else []) -ROCKSDB_EXTERNAL_DEPS += ([("jemalloc", None, "headers")] if sanitizer == "" else []) +ROCKSDB_OS_DEPS += ([( + "linux", + ["third-party//jemalloc:headers"], +)] if sanitizer == "" else []) cpp_library( name = "rocksdb_lib", @@ -79,27 +113,29 @@ cpp_library( "cache/clock_cache.cc", "cache/lru_cache.cc", "cache/sharded_cache.cc", + "db/arena_wrapped_db_iter.cc", "db/builder.cc", "db/c.cc", "db/column_family.cc", "db/compacted_db_impl.cc", - "db/compaction.cc", - "db/compaction_iterator.cc", - "db/compaction_job.cc", - "db/compaction_picker.cc", - "db/compaction_picker_fifo.cc", - "db/compaction_picker_universal.cc", + "db/compaction/compaction.cc", + "db/compaction/compaction_iterator.cc", + "db/compaction/compaction_job.cc", + "db/compaction/compaction_picker.cc", + "db/compaction/compaction_picker_fifo.cc", + "db/compaction/compaction_picker_level.cc", + "db/compaction/compaction_picker_universal.cc", "db/convenience.cc", "db/db_filesnapshot.cc", - "db/db_impl.cc", - "db/db_impl_compaction_flush.cc", - "db/db_impl_debug.cc", - "db/db_impl_experimental.cc", - "db/db_impl_files.cc", - "db/db_impl_open.cc", - "db/db_impl_readonly.cc", - "db/db_impl_secondary.cc", - "db/db_impl_write.cc", + "db/db_impl/db_impl.cc", + "db/db_impl/db_impl_compaction_flush.cc", + "db/db_impl/db_impl_debug.cc", + "db/db_impl/db_impl_experimental.cc", + "db/db_impl/db_impl_files.cc", + "db/db_impl/db_impl_open.cc", + "db/db_impl/db_impl_readonly.cc", + "db/db_impl/db_impl_secondary.cc", + "db/db_impl/db_impl_write.cc", "db/db_info_dumper.cc", "db/db_iter.cc", "db/dbformat.cc", @@ -111,7 +147,7 @@ cpp_library( "db/flush_job.cc", "db/flush_scheduler.cc", "db/forward_iterator.cc", - "db/in_memory_stats_history.cc", + "db/import_column_family_job.cc", "db/internal_stats.cc", "db/log_reader.cc", "db/log_writer.cc", @@ -128,6 +164,7 @@ cpp_library( "db/table_cache.cc", "db/table_properties_collector.cc", "db/transaction_log_impl.cc", + "db/trim_history_scheduler.cc", "db/version_builder.cc", "db/version_edit.cc", "db/version_set.cc", @@ -143,6 +180,22 @@ cpp_library( "env/env_posix.cc", "env/io_posix.cc", "env/mock_env.cc", + "file/delete_scheduler.cc", + "file/file_prefetch_buffer.cc", + "file/file_util.cc", + "file/filename.cc", + "file/random_access_file_reader.cc", + "file/read_write_util.cc", + "file/readahead_raf.cc", + "file/sequence_file_reader.cc", + "file/sst_file_manager_impl.cc", + "file/writable_file_writer.cc", + "logging/auto_roll_logger.cc", + "logging/event_logger.cc", + "logging/log_buffer.cc", + "memory/arena.cc", + "memory/concurrent_arena.cc", + "memory/jemalloc_nodump_allocator.cc", "memtable/alloc_tracker.cc", "memtable/hash_linklist_rep.cc", "memtable/hash_skiplist_rep.cc", @@ -151,10 +204,12 @@ cpp_library( "memtable/write_buffer_manager.cc", "monitoring/histogram.cc", "monitoring/histogram_windowing.cc", + "monitoring/in_memory_stats_history.cc", "monitoring/instrumented_mutex.cc", "monitoring/iostats_context.cc", "monitoring/perf_context.cc", "monitoring/perf_level.cc", + "monitoring/persistent_stats_history.cc", "monitoring/statistics.cc", "monitoring/thread_status_impl.cc", "monitoring/thread_status_updater.cc", @@ -169,78 +224,70 @@ cpp_library( "options/options_sanity_check.cc", "port/port_posix.cc", "port/stack_trace.cc", - "table/adaptive_table_factory.cc", - "table/block.cc", - 
"table/block_based_filter_block.cc", - "table/block_based_table_builder.cc", - "table/block_based_table_factory.cc", - "table/block_based_table_reader.cc", - "table/block_builder.cc", + "table/adaptive/adaptive_table_factory.cc", + "table/block_based/block.cc", + "table/block_based/block_based_filter_block.cc", + "table/block_based/block_based_table_builder.cc", + "table/block_based/block_based_table_factory.cc", + "table/block_based/block_based_table_reader.cc", + "table/block_based/block_builder.cc", + "table/block_based/block_prefix_index.cc", + "table/block_based/data_block_footer.cc", + "table/block_based/data_block_hash_index.cc", + "table/block_based/filter_block_reader_common.cc", + "table/block_based/filter_policy.cc", + "table/block_based/flush_block_policy.cc", + "table/block_based/full_filter_block.cc", + "table/block_based/index_builder.cc", + "table/block_based/parsed_full_filter_block.cc", + "table/block_based/partitioned_filter_block.cc", + "table/block_based/uncompression_dict_reader.cc", "table/block_fetcher.cc", - "table/block_prefix_index.cc", - "table/bloom_block.cc", - "table/cuckoo_table_builder.cc", - "table/cuckoo_table_factory.cc", - "table/cuckoo_table_reader.cc", - "table/data_block_footer.cc", - "table/data_block_hash_index.cc", - "table/flush_block_policy.cc", + "table/cuckoo/cuckoo_table_builder.cc", + "table/cuckoo/cuckoo_table_factory.cc", + "table/cuckoo/cuckoo_table_reader.cc", "table/format.cc", - "table/full_filter_block.cc", "table/get_context.cc", - "table/index_builder.cc", "table/iterator.cc", "table/merging_iterator.cc", "table/meta_blocks.cc", - "table/partitioned_filter_block.cc", "table/persistent_cache_helper.cc", - "table/plain_table_builder.cc", - "table/plain_table_factory.cc", - "table/plain_table_index.cc", - "table/plain_table_key_coding.cc", - "table/plain_table_reader.cc", + "table/plain/plain_table_bloom.cc", + "table/plain/plain_table_builder.cc", + "table/plain/plain_table_factory.cc", + "table/plain/plain_table_index.cc", + "table/plain/plain_table_key_coding.cc", + "table/plain/plain_table_reader.cc", "table/sst_file_reader.cc", "table/sst_file_writer.cc", "table/table_properties.cc", "table/two_level_iterator.cc", + "test_util/sync_point.cc", + "test_util/sync_point_impl.cc", + "test_util/transaction_test_util.cc", "tools/dump/db_dump_tool.cc", "tools/ldb_cmd.cc", "tools/ldb_tool.cc", "tools/sst_dump_tool.cc", - "util/arena.cc", - "util/auto_roll_logger.cc", - "util/bloom.cc", + "trace_replay/block_cache_tracer.cc", + "trace_replay/trace_replay.cc", "util/build_version.cc", "util/coding.cc", "util/compaction_job_stats_impl.cc", "util/comparator.cc", "util/compression_context_cache.cc", - "util/concurrent_arena.cc", "util/concurrent_task_limiter_impl.cc", "util/crc32c.cc", - "util/delete_scheduler.cc", "util/dynamic_bloom.cc", - "util/event_logger.cc", - "util/file_reader_writer.cc", - "util/file_util.cc", - "util/filename.cc", - "util/filter_policy.cc", "util/hash.cc", - "util/jemalloc_nodump_allocator.cc", - "util/log_buffer.cc", "util/murmurhash.cc", "util/random.cc", "util/rate_limiter.cc", "util/slice.cc", - "util/sst_file_manager_impl.cc", "util/status.cc", "util/string_util.cc", - "util/sync_point.cc", - "util/sync_point_impl.cc", "util/thread_local.cc", "util/threadpool_imp.cc", - "util/trace_replay.cc", - "util/transaction_test_util.cc", "util/xxhash.cc", "utilities/backupable/backupable_db.cc", "utilities/blob_db/blob_compaction_filter.cc", @@ -266,9 +313,11 @@ cpp_library( "utilities/merge_operators/bytesxor.cc", 
"utilities/merge_operators/max.cc", "utilities/merge_operators/put.cc", + "utilities/merge_operators/sortlist.cc", "utilities/merge_operators/string_append/stringappend.cc", "utilities/merge_operators/string_append/stringappend2.cc", "utilities/merge_operators/uint64add.cc", + "utilities/object_registry.cc", "utilities/option_change_migration/option_change_migration.cc", "utilities/options/options_util.cc", "utilities/persistent_cache/block_cache_tier.cc", @@ -276,6 +325,7 @@ cpp_library( "utilities/persistent_cache/block_cache_tier_metadata.cc", "utilities/persistent_cache/persistent_cache_tier.cc", "utilities/persistent_cache/volatile_tier_impl.cc", + "utilities/simulator_cache/cache_simulator.cc", "utilities/simulator_cache/sim_cache.cc", "utilities/table_properties_collectors/compact_on_deletion_collector.cc", "utilities/trace/file_trace_reader_writer.cc", @@ -299,6 +349,8 @@ cpp_library( auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -309,15 +361,18 @@ cpp_library( srcs = [ "db/db_test_util.cc", "table/mock_table.cc", + "test_util/fault_injection_test_env.cc", + "test_util/testharness.cc", + "test_util/testutil.cc", + "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", "tools/trace_analyzer_tool.cc", - "util/fault_injection_test_env.cc", - "util/testharness.cc", - "util/testutil.cc", "utilities/cassandra/test_utils.cc", ], auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [":rocksdb_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -326,13 +381,34 @@ cpp_library( cpp_library( name = "rocksdb_tools_lib", srcs = [ + "test_util/testutil.cc", + "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", "tools/db_bench_tool.cc", "tools/trace_analyzer_tool.cc", - "util/testutil.cc", ], auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + deps = [":rocksdb_lib"], + external_deps = ROCKSDB_EXTERNAL_DEPS, +) + +cpp_library( + name = "rocksdb_stress_lib", + srcs = [ + "test_util/testutil.cc", + "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", + "tools/db_stress_tool.cc", + "tools/trace_analyzer_tool.cc", + ], + auto_headers = AutoHeaders.RECURSIVE_GLOB, + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [":rocksdb_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -344,722 +420,1057 @@ cpp_library( auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [":rocksdb_test_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS, ) -# 
[test_name, test_src, test_type] +# [test_name, test_src, test_type, extra_deps, extra_compiler_flags] ROCKS_TESTS = [ [ "arena_test", - "util/arena_test.cc", + "memory/arena_test.cc", "serial", + [], + [], ], [ "auto_roll_logger_test", - "util/auto_roll_logger_test.cc", + "logging/auto_roll_logger_test.cc", "serial", + [], + [], ], [ "autovector_test", "util/autovector_test.cc", "serial", + [], + [], ], [ "backupable_db_test", "utilities/backupable/backupable_db_test.cc", "parallel", + [], + [], ], [ "blob_db_test", "utilities/blob_db/blob_db_test.cc", "serial", + [], + [], ], [ "block_based_filter_block_test", - "table/block_based_filter_block_test.cc", + "table/block_based/block_based_filter_block_test.cc", "serial", + [], + [], + ], + [ + "block_cache_trace_analyzer_test", + "tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc", + "serial", + [], + [], + ], + [ + "block_cache_tracer_test", + "trace_replay/block_cache_tracer_test.cc", + "serial", + [], + [], ], [ "block_test", - "table/block_test.cc", + "table/block_based/block_test.cc", "serial", + [], + [], ], [ "bloom_test", "util/bloom_test.cc", "serial", + [], + [], ], [ "c_test", "db/c_test.c", "serial", + [], + [], + ], + [ + "cache_simulator_test", + "utilities/simulator_cache/cache_simulator_test.cc", + "serial", + [], + [], ], [ "cache_test", "cache/cache_test.cc", "serial", + [], + [], ], [ "cassandra_format_test", "utilities/cassandra/cassandra_format_test.cc", "serial", + [], + [], ], [ "cassandra_functional_test", "utilities/cassandra/cassandra_functional_test.cc", "serial", + [], + [], ], [ "cassandra_row_merge_test", "utilities/cassandra/cassandra_row_merge_test.cc", "serial", + [], + [], ], [ "cassandra_serialize_test", "utilities/cassandra/cassandra_serialize_test.cc", "serial", + [], + [], ], [ "checkpoint_test", "utilities/checkpoint/checkpoint_test.cc", "serial", + [], + [], ], [ "cleanable_test", "table/cleanable_test.cc", "serial", + [], + [], ], [ "coding_test", "util/coding_test.cc", "serial", + [], + [], ], [ "column_family_test", "db/column_family_test.cc", "serial", + [], + [], ], [ "compact_files_test", "db/compact_files_test.cc", "serial", + [], + [], ], [ "compact_on_deletion_collector_test", "utilities/table_properties_collectors/compact_on_deletion_collector_test.cc", "serial", + [], + [], ], [ "compaction_iterator_test", - "db/compaction_iterator_test.cc", + "db/compaction/compaction_iterator_test.cc", "serial", + [], + [], ], [ "compaction_job_stats_test", - "db/compaction_job_stats_test.cc", + "db/compaction/compaction_job_stats_test.cc", "serial", + [], + [], ], [ "compaction_job_test", - "db/compaction_job_test.cc", + "db/compaction/compaction_job_test.cc", "serial", + [], + [], ], [ "compaction_picker_test", - "db/compaction_picker_test.cc", + "db/compaction/compaction_picker_test.cc", "serial", + [], + [], ], [ "comparator_db_test", "db/comparator_db_test.cc", "serial", + [], + [], ], [ "corruption_test", "db/corruption_test.cc", "serial", + [], + [], ], [ "crc32c_test", "util/crc32c_test.cc", "serial", + [], + [], ], [ "cuckoo_table_builder_test", - "table/cuckoo_table_builder_test.cc", + "table/cuckoo/cuckoo_table_builder_test.cc", "serial", + [], + [], ], [ "cuckoo_table_db_test", "db/cuckoo_table_db_test.cc", "serial", + [], + [], ], [ "cuckoo_table_reader_test", - "table/cuckoo_table_reader_test.cc", + "table/cuckoo/cuckoo_table_reader_test.cc", "serial", + [], + [], ], [ "data_block_hash_index_test", - "table/data_block_hash_index_test.cc", + 
"table/block_based/data_block_hash_index_test.cc", "serial", + [], + [], ], [ "db_basic_test", "db/db_basic_test.cc", "serial", + [], + [], ], [ "db_blob_index_test", "db/db_blob_index_test.cc", "serial", + [], + [], ], [ "db_block_cache_test", "db/db_block_cache_test.cc", "serial", + [], + [], ], [ "db_bloom_filter_test", "db/db_bloom_filter_test.cc", - "serial", + "parallel", + [], + [], ], [ "db_compaction_filter_test", "db/db_compaction_filter_test.cc", "parallel", + [], + [], ], [ "db_compaction_test", "db/db_compaction_test.cc", "parallel", + [], + [], ], [ "db_dynamic_level_test", "db/db_dynamic_level_test.cc", "serial", + [], + [], ], [ "db_encryption_test", "db/db_encryption_test.cc", "serial", + [], + [], ], [ "db_flush_test", "db/db_flush_test.cc", "serial", + [], + [], ], [ "db_inplace_update_test", "db/db_inplace_update_test.cc", "serial", + [], + [], ], [ "db_io_failure_test", "db/db_io_failure_test.cc", "serial", + [], + [], ], [ "db_iter_stress_test", "db/db_iter_stress_test.cc", "serial", + [], + [], ], [ "db_iter_test", "db/db_iter_test.cc", "serial", + [], + [], ], [ "db_iterator_test", "db/db_iterator_test.cc", "serial", + [], + [], ], [ "db_log_iter_test", "db/db_log_iter_test.cc", "serial", + [], + [], ], [ "db_memtable_test", "db/db_memtable_test.cc", "serial", + [], + [], + ], + [ + "db_merge_operand_test", + "db/db_merge_operand_test.cc", + "serial", + [], + [], ], [ "db_merge_operator_test", "db/db_merge_operator_test.cc", "parallel", + [], + [], ], [ "db_options_test", "db/db_options_test.cc", "serial", + [], + [], ], [ "db_properties_test", "db/db_properties_test.cc", "serial", + [], + [], ], [ "db_range_del_test", "db/db_range_del_test.cc", "serial", + [], + [], ], [ "db_secondary_test", - "db/db_secondary_test.cc", + "db/db_impl/db_secondary_test.cc", "serial", + [], + [], ], [ "db_sst_test", "db/db_sst_test.cc", "parallel", + [], + [], ], [ "db_statistics_test", "db/db_statistics_test.cc", "serial", + [], + [], ], [ "db_table_properties_test", "db/db_table_properties_test.cc", "serial", + [], + [], ], [ "db_tailing_iter_test", "db/db_tailing_iter_test.cc", "serial", + [], + [], ], [ "db_test", "db/db_test.cc", "parallel", + [], + [], ], [ "db_test2", "db/db_test2.cc", "serial", + [], + [], ], [ "db_universal_compaction_test", "db/db_universal_compaction_test.cc", "parallel", + [], + [], ], [ "db_wal_test", "db/db_wal_test.cc", "parallel", + [], + [], ], [ "db_write_test", "db/db_write_test.cc", "serial", + [], + [], ], [ "dbformat_test", "db/dbformat_test.cc", "serial", + [], + [], ], [ "delete_scheduler_test", - "util/delete_scheduler_test.cc", + "file/delete_scheduler_test.cc", "serial", + [], + [], ], [ "deletefile_test", "db/deletefile_test.cc", "serial", + [], + [], ], [ "dynamic_bloom_test", "util/dynamic_bloom_test.cc", "serial", + [], + [], ], [ "env_basic_test", "env/env_basic_test.cc", "serial", + [], + [], + ], + [ + "env_logger_test", + "logging/env_logger_test.cc", + "serial", + [], + [], ], [ "env_test", "env/env_test.cc", "serial", + [], + [], ], [ "env_timed_test", "utilities/env_timed_test.cc", "serial", + [], + [], ], [ "error_handler_test", "db/error_handler_test.cc", "serial", + [], + [], ], [ "event_logger_test", - "util/event_logger_test.cc", + "logging/event_logger_test.cc", "serial", + [], + [], ], [ "external_sst_file_basic_test", "db/external_sst_file_basic_test.cc", "serial", + [], + [], ], [ "external_sst_file_test", "db/external_sst_file_test.cc", "parallel", + [], + [], ], [ "fault_injection_test", "db/fault_injection_test.cc", 
"parallel", + [], + [], ], [ "file_indexer_test", "db/file_indexer_test.cc", "serial", + [], + [], ], [ "file_reader_writer_test", "util/file_reader_writer_test.cc", - "serial", + "parallel", + [], + [], ], [ "filelock_test", "util/filelock_test.cc", "serial", + [], + [], ], [ "filename_test", "db/filename_test.cc", "serial", + [], + [], ], [ "flush_job_test", "db/flush_job_test.cc", "serial", + [], + [], ], [ "full_filter_block_test", - "table/full_filter_block_test.cc", + "table/block_based/full_filter_block_test.cc", "serial", + [], + [], ], [ "hash_table_test", "utilities/persistent_cache/hash_table_test.cc", "serial", + [], + [], ], [ "hash_test", "util/hash_test.cc", "serial", + [], + [], ], [ "heap_test", "util/heap_test.cc", "serial", + [], + [], ], [ "histogram_test", "monitoring/histogram_test.cc", "serial", + [], + [], + ], + [ + "import_column_family_test", + "db/import_column_family_test.cc", + "parallel", + [], + [], ], [ "inlineskiplist_test", "memtable/inlineskiplist_test.cc", "parallel", + [], + [], ], [ "iostats_context_test", "monitoring/iostats_context_test.cc", "serial", + [], + [], ], [ "ldb_cmd_test", "tools/ldb_cmd_test.cc", "serial", + [], + [], ], [ "listener_test", "db/listener_test.cc", "serial", + [], + [], ], [ "log_test", "db/log_test.cc", "serial", + [], + [], ], [ "lru_cache_test", "cache/lru_cache_test.cc", "serial", + [], + [], ], [ "manual_compaction_test", "db/manual_compaction_test.cc", "parallel", + [], + [], ], [ "memory_test", "utilities/memory/memory_test.cc", "serial", + [], + [], ], [ "memtable_list_test", "db/memtable_list_test.cc", "serial", + [], + [], ], [ "merge_helper_test", "db/merge_helper_test.cc", "serial", + [], + [], ], [ "merge_test", "db/merge_test.cc", "serial", + [], + [], ], [ "merger_test", "table/merger_test.cc", "serial", + [], + [], ], [ "mock_env_test", "env/mock_env_test.cc", "serial", + [], + [], ], [ "object_registry_test", "utilities/object_registry_test.cc", "serial", + [], + [], ], [ "obsolete_files_test", "db/obsolete_files_test.cc", "serial", + [], + [], ], [ "optimistic_transaction_test", "utilities/transactions/optimistic_transaction_test.cc", "serial", + [], + [], ], [ "option_change_migration_test", "utilities/option_change_migration/option_change_migration_test.cc", "serial", + [], + [], ], [ "options_file_test", "db/options_file_test.cc", "serial", + [], + [], ], [ "options_settable_test", "options/options_settable_test.cc", "serial", + [], + [], ], [ "options_test", "options/options_test.cc", "serial", + [], + [], ], [ "options_util_test", "utilities/options/options_util_test.cc", "serial", + [], + [], ], [ "partitioned_filter_block_test", - "table/partitioned_filter_block_test.cc", + "table/block_based/partitioned_filter_block_test.cc", "serial", + [], + [], ], [ "perf_context_test", "db/perf_context_test.cc", "serial", + [], + [], ], [ "persistent_cache_test", "utilities/persistent_cache/persistent_cache_test.cc", "parallel", + [], + [], ], [ "plain_table_db_test", "db/plain_table_db_test.cc", "serial", + [], + [], ], [ "prefix_test", "db/prefix_test.cc", "serial", + [], + [], ], [ "range_del_aggregator_test", "db/range_del_aggregator_test.cc", "serial", + [], + [], ], [ "range_tombstone_fragmenter_test", "db/range_tombstone_fragmenter_test.cc", "serial", + [], + [], ], [ "rate_limiter_test", "util/rate_limiter_test.cc", "serial", + [], + [], ], [ "reduce_levels_test", "tools/reduce_levels_test.cc", "serial", + [], + [], ], [ "repair_test", "db/repair_test.cc", "serial", + [], + [], ], [ 
"repeatable_thread_test", "util/repeatable_thread_test.cc", "serial", + [], + [], ], [ "sim_cache_test", "utilities/simulator_cache/sim_cache_test.cc", "serial", + [], + [], ], [ "skiplist_test", "memtable/skiplist_test.cc", "serial", + [], + [], ], [ "slice_transform_test", "util/slice_transform_test.cc", "serial", + [], + [], ], [ "sst_dump_test", "tools/sst_dump_test.cc", "serial", + [], + [], ], [ "sst_file_reader_test", "table/sst_file_reader_test.cc", "serial", + [], + [], ], [ "statistics_test", "monitoring/statistics_test.cc", "serial", + [], + [], + ], + [ + "stats_history_test", + "monitoring/stats_history_test.cc", + "serial", + [], + [], ], [ "stringappend_test", "utilities/merge_operators/string_append/stringappend_test.cc", "serial", + [], + [], ], [ "table_properties_collector_test", "db/table_properties_collector_test.cc", "serial", + [], + [], ], [ "table_test", "table/table_test.cc", "parallel", + [], + [], ], [ "thread_list_test", "util/thread_list_test.cc", "serial", + [], + [], ], [ "thread_local_test", "util/thread_local_test.cc", "serial", + [], + [], ], [ "timer_queue_test", "util/timer_queue_test.cc", "serial", + [], + [], ], [ "trace_analyzer_test", "tools/trace_analyzer_test.cc", "serial", + [], + [], ], [ "transaction_test", "utilities/transactions/transaction_test.cc", "parallel", + [], + [], ], [ "ttl_test", "utilities/ttl/ttl_test.cc", "serial", + [], + [], ], [ "util_merge_operators_test", "utilities/util_merge_operators_test.cc", "serial", + [], + [], ], [ "version_builder_test", "db/version_builder_test.cc", "serial", + [], + [], ], [ "version_edit_test", "db/version_edit_test.cc", "serial", + [], + [], ], [ "version_set_test", "db/version_set_test.cc", "serial", + [], + [], ], [ "wal_manager_test", "db/wal_manager_test.cc", "serial", + [], + [], ], [ "write_batch_test", "db/write_batch_test.cc", "serial", + [], + [], ], [ "write_batch_with_index_test", "utilities/write_batch_with_index/write_batch_with_index_test.cc", "serial", + [], + [], ], [ "write_buffer_manager_test", "memtable/write_buffer_manager_test.cc", "serial", + [], + [], ], [ "write_callback_test", "db/write_callback_test.cc", "serial", + [], + [], ], [ "write_controller_test", "db/write_controller_test.cc", "serial", + [], + [], ], [ "write_prepared_transaction_test", "utilities/transactions/write_prepared_transaction_test.cc", "parallel", + [], + [], ], [ "write_unprepared_transaction_test", "utilities/transactions/write_unprepared_transaction_test.cc", "parallel", + [], + [], ], ] @@ -1068,14 +1479,18 @@ ROCKS_TESTS = [ # will not be included. [ test_binary( + extra_compiler_flags = extra_compiler_flags, + extra_deps = extra_deps, parallelism = parallelism, rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS, rocksdb_external_deps = ROCKSDB_EXTERNAL_DEPS, + rocksdb_os_deps = ROCKSDB_OS_DEPS, + rocksdb_os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, rocksdb_preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, test_cc = test_cc, test_name = test_name, ) - for test_name, test_cc, parallelism in ROCKS_TESTS + for test_name, test_cc, parallelism, extra_deps, extra_compiler_flags in ROCKS_TESTS if not is_opt_mode ] diff --git a/USERS.md b/USERS.md index a95903f0662..0fc030339d8 100644 --- a/USERS.md +++ b/USERS.md @@ -50,6 +50,10 @@ Check out their blog post: http://blog.cloudera.com/blog/2015/08/inside-santande ## Airbnb Airbnb is using RocksDB as a storage engine for their personalized search service. 
You can learn more about it here: https://www.youtube.com/watch?v=ASQ6XMtogMs +## Alluxio +[Alluxio](https://www.alluxio.io) uses RocksDB to serve and scale file system metadata to beyond 1 Billion files. The detailed design and implementation is described in this engineering blog: +https://www.alluxio.io/blog/scalable-metadata-service-in-alluxio-storing-billions-of-files/ + ## Pinterest Pinterest's Object Retrieval System uses RocksDB for storage: https://www.youtube.com/watch?v=MtFEVEs_2Vo @@ -91,4 +95,10 @@ LzLabs is using RocksDB as a storage engine in their multi-database distributed [ProfaneDB](https://profanedb.gitlab.io/) is a database for Protocol Buffers, and uses RocksDB for storage. It is accessible via gRPC, and the schema is defined using directly `.proto` files. ## IOTA Foundation - [IOTA Foundation](https://www.iota.org/) is using RocksDB in the [IOTA Reference Implementation (IRI)](https://github.com/iotaledger/iri) to store the local state of the Tangle. The Tangle is the first open-source distributed ledger powering the future of the Internet of Things. \ No newline at end of file + [IOTA Foundation](https://www.iota.org/) is using RocksDB in the [IOTA Reference Implementation (IRI)](https://github.com/iotaledger/iri) to store the local state of the Tangle. The Tangle is the first open-source distributed ledger powering the future of the Internet of Things. + +## Avrio Project + [Avrio Project](http://avrio-project.github.io/avrio.network/) is using RocksDB in [Avrio ](https://github.com/avrio-project/avrio) to store blocks, account balances, and other blockchain-related data. Avrio is a multiblockchain decentralized cryptocurrency empowering monetary transactions. + +## Crux +[Crux](https://github.com/juxt/crux) is a document database that uses RocksDB for local [EAV](https://en.wikipedia.org/wiki/Entity%E2%80%93attribute%E2%80%93value_model) index storage to enable point-in-time bitemporal Datalog queries. The "unbundled" architecture uses Kafka to provide horizontal scalability. diff --git a/appveyor.yml b/appveyor.yml index 9dae40af8f7..77901c40724 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,15 +1,67 @@ version: 1.0.{build} + image: Visual Studio 2017 + +environment: + JAVA_HOME: C:\Program Files\Java\jdk1.8.0 + THIRDPARTY_HOME: $(APPVEYOR_BUILD_FOLDER)\thirdparty + SNAPPY_HOME: $(THIRDPARTY_HOME)\snappy-1.1.7 + SNAPPY_INCLUDE: $(SNAPPY_HOME);$(SNAPPY_HOME)\build + SNAPPY_LIB_DEBUG: $(SNAPPY_HOME)\build\Debug\snappy.lib + SNAPPY_LIB_RELEASE: $(SNAPPY_HOME)\build\Release\snappy.lib + LZ4_HOME: $(THIRDPARTY_HOME)\lz4-1.8.3 + LZ4_INCLUDE: $(LZ4_HOME)\lib + LZ4_LIB_DEBUG: $(LZ4_HOME)\visual\VS2010\bin\x64_Debug\liblz4_static.lib + LZ4_LIB_RELEASE: $(LZ4_HOME)\visual\VS2010\bin\x64_Release\liblz4_static.lib + ZSTD_HOME: $(THIRDPARTY_HOME)\zstd-1.4.0 + ZSTD_INCLUDE: $(ZSTD_HOME)\lib;$(ZSTD_HOME)\lib\dictBuilder + ZSTD_LIB_DEBUG: $(ZSTD_HOME)\build\VS2010\bin\x64_Debug\libzstd_static.lib + ZSTD_LIB_RELEASE: $(ZSTD_HOME)\build\VS2010\bin\x64_Release\libzstd_static.lib + +install: + - md %THIRDPARTY_HOME% + - echo "Building Snappy dependency..." + - cd %THIRDPARTY_HOME% + - curl -fsSL -o snappy-1.1.7.zip https://github.com/google/snappy/archive/1.1.7.zip + - unzip snappy-1.1.7.zip + - cd snappy-1.1.7 + - mkdir build + - cd build + - cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..
+ - msbuild Snappy.sln /p:Configuration=Debug /p:Platform=x64 + - msbuild Snappy.sln /p:Configuration=Release /p:Platform=x64 + - echo "Building LZ4 dependency..." + - cd %THIRDPARTY_HOME% + - curl -fsSL -o lz4-1.8.3.zip https://github.com/lz4/lz4/archive/v1.8.3.zip + - unzip lz4-1.8.3.zip + - cd lz4-1.8.3\visual\VS2010 + - ps: $CMD="C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\IDE\devenv.com"; & $CMD lz4.sln /upgrade + - msbuild lz4.sln /p:Configuration=Debug /p:Platform=x64 + - msbuild lz4.sln /p:Configuration=Release /p:Platform=x64 + - echo "Building ZStd dependency..." + - cd %THIRDPARTY_HOME% + - curl -fsSL -o zstd-1.4.0.zip https://github.com/facebook/zstd/archive/v1.4.0.zip + - unzip zstd-1.4.0.zip + - cd zstd-1.4.0\build\VS2010 + - ps: $CMD="C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\IDE\devenv.com"; & $CMD zstd.sln /upgrade + - msbuild zstd.sln /p:Configuration=Debug /p:Platform=x64 + - msbuild zstd.sln /p:Configuration=Release /p:Platform=x64 + before_build: -- md %APPVEYOR_BUILD_FOLDER%\build -- cd %APPVEYOR_BUILD_FOLDER%\build -- cmake -G "Visual Studio 15 Win64" -DOPTDBG=1 -DWITH_XPRESS=1 -DPORTABLE=1 -DJNI=1 .. -- cd .. + - md %APPVEYOR_BUILD_FOLDER%\build + - cd %APPVEYOR_BUILD_FOLDER%\build + - cmake -G "Visual Studio 15 Win64" -DOPTDBG=1 -DPORTABLE=1 -DSNAPPY=1 -DLZ4=1 -DZSTD=1 -DXPRESS=1 -DJNI=1 .. + - cd .. build: project: build\rocksdb.sln parallel: true verbosity: normal + test: + test_script: -- ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test2,db_test,env_basic_test,env_test -Concurrency 8 + - ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test2,db_test,env_basic_test,env_test,db_merge_operand_test -Concurrency 8 + +on_failure: + - cmd: 7z a build-failed.zip %APPVEYOR_BUILD_FOLDER%\build\ && appveyor PushArtifact build-failed.zip diff --git a/buckifier/buckify_rocksdb.py b/buckifier/buckify_rocksdb.py index a5d71b65d4e..d2bba5940cc 100644 --- a/buckifier/buckify_rocksdb.py +++ b/buckifier/buckify_rocksdb.py @@ -3,13 +3,36 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals +try: + from builtins import str +except ImportError: + from __builtin__ import str from targets_builder import TARGETSBuilder +import json import os import fnmatch import sys from util import ColorString +# This script generates the TARGETS file for Buck. +# Buck is a build tool specifying dependencies among different build targets. +# Users can pass extra dependencies as a JSON object via the command line, and this +# script can include these dependencies in the generated TARGETS file. +# Usage: +# $python buckifier/buckify_rocksdb.py +# (This generates a TARGETS file without user-specified dependencies for unit +# tests.) +# $python buckifier/buckify_rocksdb.py \ +# '{"fake": { \ +# "extra_deps": [":test_dep", "//fakes/module:mock1"], \ +# "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"], \ +# } \ +# }' +# (Generated TARGETS file has test_dep and mock1 as dependencies for RocksDB +# unit tests, and will use the extra_compiler_flags to compile the unit test +# source.)
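A hedged companion to the usage comment above (the "lite" alias, its single flag, and the resulting target name are hypothetical examples, not taken from this patch):

    # Regenerate TARGETS with one extra variant of every unit test; with the
    # generate_targets() changes below, each test also gains an alias-suffixed
    # target such as db_basic_test_lite.
    python buckifier/buckify_rocksdb.py '{"lite": {"extra_deps": [], "extra_compiler_flags": ["-DROCKSDB_LITE"]}}'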
+ # tests to export as libraries for inclusion in other projects _EXPORTED_TEST_LIBS = ["env_basic_test"] @@ -86,8 +109,32 @@ def get_tests(repo_path): return tests +# Parse extra dependencies passed by user from command line +def get_dependencies(): + deps_map = { + '': { + 'extra_deps': [], + 'extra_compiler_flags': [] + } + } + if len(sys.argv) < 2: + return deps_map + + def encode_dict(data): + rv = {} + for k, v in data.items(): + if isinstance(v, dict): + v = encode_dict(v) + rv[k] = v + return rv + extra_deps = json.loads(sys.argv[1], object_hook=encode_dict) + for target_alias, deps in extra_deps.items(): + deps_map[target_alias] = deps + return deps_map + + # Prepare TARGETS file for buck -def generate_targets(repo_path): +def generate_targets(repo_path, deps_map): print(ColorString.info("Generating TARGETS")) # parsed src.mk file src_mk = parse_src_mk(repo_path) @@ -118,27 +165,43 @@ def generate_targets(repo_path): "rocksdb_tools_lib", src_mk.get("BENCH_LIB_SOURCES", []) + src_mk.get("ANALYZER_LIB_SOURCES", []) + - ["util/testutil.cc"], + ["test_util/testutil.cc"], + [":rocksdb_lib"]) + # rocksdb_stress_lib + TARGETS.add_library( + "rocksdb_stress_lib", + src_mk.get("ANALYZER_LIB_SOURCES", []) + + src_mk.get('STRESS_LIB_SOURCES', []) + + ["test_util/testutil.cc"], [":rocksdb_lib"]) + print("Extra dependencies:\n{0}".format(str(deps_map))) # test for every test we found in the Makefile - for test in sorted(tests): - match_src = [src for src in cc_files if ("/%s.c" % test) in src] - if len(match_src) == 0: - print(ColorString.warning("Cannot find .cc file for %s" % test)) - continue - elif len(match_src) > 1: - print(ColorString.warning("Found more than one .cc for %s" % test)) - print(match_src) - continue - - assert(len(match_src) == 1) - is_parallel = tests[test] - TARGETS.register_test(test, match_src[0], is_parallel) - - if test in _EXPORTED_TEST_LIBS: - test_library = "%s_lib" % test - TARGETS.add_library(test_library, match_src, [":rocksdb_test_lib"]) + for target_alias, deps in deps_map.items(): + for test in sorted(tests): + match_src = [src for src in cc_files if ("/%s.c" % test) in src] + if len(match_src) == 0: + print(ColorString.warning("Cannot find .cc file for %s" % test)) + continue + elif len(match_src) > 1: + print(ColorString.warning("Found more than one .cc for %s" % test)) + print(match_src) + continue + + assert(len(match_src) == 1) + is_parallel = tests[test] + test_target_name = \ + test if not target_alias else test + "_" + target_alias + TARGETS.register_test( + test_target_name, + match_src[0], + is_parallel, + deps['extra_deps'], + deps['extra_compiler_flags']) + + if test in _EXPORTED_TEST_LIBS: + test_library = "%s_lib" % test_target_name + TARGETS.add_library(test_library, match_src, [":rocksdb_test_lib"]) TARGETS.flush_tests() print(ColorString.info("Generated TARGETS Summary:")) @@ -163,8 +226,9 @@ def exit_with_error(msg): def main(): + deps_map = get_dependencies() # Generate TARGETS file for buck - ok = generate_targets(get_rocksdb_path()) + ok = generate_targets(get_rocksdb_path(), deps_map) if not ok: exit_with_error("Failed to generate TARGETS files") diff --git a/buckifier/targets_builder.py b/buckifier/targets_builder.py index 493cd8a8a8a..ba90bc612da 100644 --- a/buckifier/targets_builder.py +++ b/buckifier/targets_builder.py @@ -3,6 +3,12 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals +try: + from builtins import object + from builtins import str +except ImportError: + 
from __builtin__ import object + from __builtin__ import str import targets_cfg def pretty_list(lst, indent=8): @@ -18,7 +24,7 @@ def pretty_list(lst, indent=8): return res -class TARGETSBuilder: +class TARGETSBuilder(object): def __init__(self, path): self.path = path self.targets_file = open(path, 'w') @@ -51,14 +57,21 @@ def add_binary(self, name, srcs, deps=None): pretty_list(deps))) self.total_bin = self.total_bin + 1 - def register_test(self, test_name, src, is_parallel): + def register_test(self, + test_name, + src, + is_parallel, + extra_deps, + extra_compiler_flags): exec_mode = "serial" if is_parallel: exec_mode = "parallel" self.tests_cfg += targets_cfg.test_cfg_template % ( test_name, str(src), - str(exec_mode)) + str(exec_mode), + extra_deps, + extra_compiler_flags) self.total_test = self.total_test + 1 diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py index 730b5ebf9da..0ecd6fdda76 100644 --- a/buckifier/targets_cfg.py +++ b/buckifier/targets_cfg.py @@ -3,7 +3,13 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals -rocksdb_target_header = """load("@fbcode_macros//build_defs:auto_headers.bzl", "AutoHeaders") + +rocksdb_target_header = """# This file \100generated by `python buckifier/buckify_rocksdb.py` +# --> DO NOT EDIT MANUALLY <-- +# This file is a Facebook-specific integration for buck builds, so can +# only be validated by Facebook employees. +# +load("@fbcode_macros//build_defs:auto_headers.bzl", "AutoHeaders") load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") load(":defs.bzl", "test_binary") @@ -11,30 +17,11 @@ ROCKSDB_COMPILER_FLAGS = [ "-fno-builtin-memcmp", - "-DROCKSDB_PLATFORM_POSIX", - "-DROCKSDB_LIB_IO_POSIX", - "-DROCKSDB_FALLOCATE_PRESENT", - "-DROCKSDB_MALLOC_USABLE_SIZE", - "-DROCKSDB_RANGESYNC_PRESENT", - "-DROCKSDB_SCHED_GETCPU_PRESENT", - "-DROCKSDB_SUPPORT_THREAD_LOCAL", - "-DOS_LINUX", - # Flags to enable libs we include - "-DSNAPPY", - "-DZLIB", - "-DBZIP2", - "-DLZ4", - "-DZSTD", - "-DZSTD_STATIC_LINKING_ONLY", - "-DGFLAGS=gflags", - "-DNUMA", - "-DTBB", # Needed to compile in fbcode "-Wno-expansion-to-defined", # Added missing flags from output of build_detect_platform - "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", - "-DROCKSDB_BACKTRACE", "-Wnarrowing", + "-DROCKSDB_NO_DYNAMIC_EXTENSION", ] ROCKSDB_EXTERNAL_DEPS = [ @@ -45,11 +32,54 @@ ("lz4", None, "lz4"), ("zstd", None), ("tbb", None), - ("numa", None, "numa"), ("googletest", None, "gtest"), ] +ROCKSDB_OS_DEPS = [ + ( + "linux", + ["third-party//numa:numa"], + ), +] + +ROCKSDB_OS_PREPROCESSOR_FLAGS = [ + ( + "linux", + [ + "-DOS_LINUX", + "-DROCKSDB_FALLOCATE_PRESENT", + "-DROCKSDB_MALLOC_USABLE_SIZE", + "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", + "-DROCKSDB_RANGESYNC_PRESENT", + "-DROCKSDB_SCHED_GETCPU_PRESENT", + "-DHAVE_SSE42", + "-DNUMA", + ], + ), + ( + "macos", + ["-DOS_MACOSX"], + ), +] + ROCKSDB_PREPROCESSOR_FLAGS = [ + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DROCKSDB_SUPPORT_THREAD_LOCAL", + + # Flags to enable libs we include + "-DSNAPPY", + "-DZLIB", + "-DBZIP2", + "-DLZ4", + "-DZSTD", + "-DZSTD_STATIC_LINKING_ONLY", + "-DGFLAGS=gflags", + "-DTBB", + + # Added missing flags from output of build_detect_platform + "-DROCKSDB_BACKTRACE", + # Directories with files for #include "-I" + REPO_PATH + "include/", "-I" + REPO_PATH, @@ -57,7 +87,6 @@ ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { "x86_64": [ - "-DHAVE_SSE42", "-DHAVE_PCLMUL", ], } @@ -74,9 +103,15 @@ # Do not enable jemalloc if sanitizer 
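The new ROCKSDB_OS_DEPS and ROCKSDB_OS_PREPROCESSOR_FLAGS lists in targets_cfg.py pair an OS name with the dependencies or defines that apply only on that platform, which is the shape Buck's os_deps / os_preprocessor_flags attributes expect. The selector below is only a toy stand-in to illustrate the data shape, not how Buck actually evaluates it:

```python
# Illustration of the (os_name, values) pairs used by
# ROCKSDB_OS_PREPROCESSOR_FLAGS; flags_for() is a toy selector, not Buck code.
ROCKSDB_OS_PREPROCESSOR_FLAGS = [
    ("linux", ["-DOS_LINUX", "-DROCKSDB_FALLOCATE_PRESENT", "-DNUMA"]),
    ("macos", ["-DOS_MACOSX"]),
]

def flags_for(os_name):
    flags = []
    for name, values in ROCKSDB_OS_PREPROCESSOR_FLAGS:
        if name == os_name:
            flags.extend(values)
    return flags

assert "-DOS_MACOSX" in flags_for("macos")
assert "-DOS_LINUX" not in flags_for("macos")
```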
presents. RocksDB will further detect # whether the binary is linked with jemalloc at runtime. -ROCKSDB_COMPILER_FLAGS += (["-DROCKSDB_JEMALLOC"] if sanitizer == "" else []) - -ROCKSDB_EXTERNAL_DEPS += ([("jemalloc", None, "headers")] if sanitizer == "" else []) +ROCKSDB_OS_PREPROCESSOR_FLAGS += ([( + "linux", + ["-DROCKSDB_JEMALLOC"], +)] if sanitizer == "" else []) + +ROCKSDB_OS_DEPS += ([( + "linux", + ["third-party//jemalloc:headers"], +)] if sanitizer == "" else []) """ @@ -87,6 +122,8 @@ {headers_attr_prefix}headers = {headers}, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [{deps}], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -109,11 +146,13 @@ "%s", "%s", "%s", + %s, + %s, ], """ unittests_template = """ -# [test_name, test_src, test_type] +# [test_name, test_src, test_type, extra_deps, extra_compiler_flags] ROCKS_TESTS = [ %s] @@ -122,15 +161,19 @@ # will not be included. [ test_binary( + extra_compiler_flags = extra_compiler_flags, + extra_deps = extra_deps, parallelism = parallelism, rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS, rocksdb_external_deps = ROCKSDB_EXTERNAL_DEPS, + rocksdb_os_deps = ROCKSDB_OS_DEPS, + rocksdb_os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, rocksdb_preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, test_cc = test_cc, test_name = test_name, ) - for test_name, test_cc, parallelism in ROCKS_TESTS + for test_name, test_cc, parallelism, extra_deps, extra_compiler_flags in ROCKS_TESTS if not is_opt_mode ] """ diff --git a/buckifier/util.py b/buckifier/util.py index 2eda69f1075..f04929a277c 100644 --- a/buckifier/util.py +++ b/buckifier/util.py @@ -6,11 +6,16 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals +try: + from builtins import object +except ImportError: + from __builtin__ import object import subprocess +import sys import os import time -class ColorString: +class ColorString(object): """ Generate colorful strings on terminal """ HEADER = '\033[95m' BLUE = '\033[94m' @@ -21,7 +26,13 @@ class ColorString: @staticmethod def _make_color_str(text, color): - return "".join([color, text.encode('utf-8'), ColorString.ENDC]) + # In Python2, default encoding for unicode string is ASCII + if sys.version_info.major <= 2: + return "".join( + [color, text.encode('utf-8'), ColorString.ENDC]) + # From Python3, default encoding for unicode string is UTF-8 + return "".join( + [color, text, ColorString.ENDC]) @staticmethod def ok(text): diff --git a/build_tools/RocksDBCommonHelper.php b/build_tools/RocksDBCommonHelper.php deleted file mode 100644 index e7bfb520347..00000000000 --- a/build_tools/RocksDBCommonHelper.php +++ /dev/null @@ -1,377 +0,0 @@ - 0); - assert(is_numeric($diffID)); - assert(strlen($url) > 0); - - $cmd_args = array( - 'diff_id' => (int)$diffID, - 'name' => sprintf( - 'click here for sandcastle tests for D%d', - (int)$diffID - ), - 'link' => $url - ); - $cmd = 'echo ' . escapeshellarg(json_encode($cmd_args)) - . 
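The ColorString change in buckifier/util.py branches on the interpreter version because str.encode('utf-8') yields bytes on Python 3, which cannot be joined with the str escape sequences. A minimal sketch of the same version check, using escape values like those in util.py:

```python
# Minimal sketch of the version-dependent join used by ColorString:
# Python 2 encodes the text to UTF-8 bytes, Python 3 uses the str as-is
# (encoding it there would produce bytes and break "".join()).
import sys

GREEN = '\033[92m'
ENDC = '\033[0m'

def green(text):
    if sys.version_info.major <= 2:
        return "".join([GREEN, text.encode('utf-8'), ENDC])
    return "".join([GREEN, text, ENDC])

print(green("ok"))
```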
' | arc call-conduit differential.updateunitresults'; - - shell_exec($cmd); -} - -function buildUpdateTestStatusCmd($diffID, $test, $status) { - assert(strlen($diffID) > 0); - assert(is_numeric($diffID)); - assert(strlen($test) > 0); - assert(strlen($status) > 0); - - $cmd_args = array( - 'diff_id' => (int)$diffID, - 'name' => $test, - 'result' => $status - ); - - $cmd = 'echo ' . escapeshellarg(json_encode($cmd_args)) - . ' | arc call-conduit differential.updateunitresults'; - - return $cmd; -} - -function updateTestStatus($diffID, $test) { - assert(strlen($diffID) > 0); - assert(is_numeric($diffID)); - assert(strlen($test) > 0); - - shell_exec(buildUpdateTestStatusCmd($diffID, $test, "waiting")); -} - -function getSteps($applyDiff, $diffID, $username, $test) { - assert(strlen($username) > 0); - assert(strlen($test) > 0); - - if ($applyDiff) { - assert(strlen($diffID) > 0); - assert(is_numeric($diffID)); - - $arcrc_content = (PHP_OS == "Darwin" ? - exec("cat ~/.arcrc | gzip -f | base64") : - exec("cat ~/.arcrc | gzip -f | base64 -w0")); - assert(strlen($arcrc_content) > 0); - - // Sandcastle machines don't have arc setup. We copy the user certificate - // and authenticate using that in Sandcastle. - $setup = array( - "name" => "Setup arcrc", - "shell" => "echo " . escapeshellarg($arcrc_content) . " | base64 --decode" - . " | gzip -d > ~/.arcrc", - "user" => "root" - ); - - // arc demands certain permission on its config. - // also fix the sticky bit issue in sandcastle - $fix_permission = array( - "name" => "Fix environment", - "shell" => "chmod 600 ~/.arcrc && chmod +t /dev/shm", - "user" => "root" - ); - - // Construct the steps in the order of execution. - $steps[] = $setup; - $steps[] = $fix_permission; - } - - // fbcode is a sub-repo. We cannot patch until we add it to ignore otherwise - // Git thinks it is an uncommitted change. - $fix_git_ignore = array( - "name" => "Fix git ignore", - "shell" => "echo fbcode >> .git/info/exclude", - "user" => "root" - ); - - // This fixes "FATAL: ThreadSanitizer can not mmap the shadow memory" - // Source: - // https://github.com/google/sanitizers/wiki/ThreadSanitizerCppManual#FAQ - $fix_kernel_issue = array( - "name" => "Fix kernel issue with tsan", - "shell" => "echo 2 >/proc/sys/kernel/randomize_va_space", - "user" => "root" - ); - - $steps[] = $fix_git_ignore; - $steps[] = $fix_kernel_issue; - - // This will be the command used to execute particular type of tests. - $cmd = ""; - - if ($applyDiff) { - // Patch the code (keep your fingures crossed). - $patch = array( - "name" => "Patch " . $diffID, - "shell" => "arc --arcrc-file ~/.arcrc " - . "patch --nocommit --diff " . escapeshellarg($diffID), - "user" => "root" - ); - - $steps[] = $patch; - - updateTestStatus($diffID, $test); - $cmd = buildUpdateTestStatusCmd($diffID, $test, "running") . "; "; - } - - // Run the actual command. - $cmd = $cmd . "J=$(nproc) ./build_tools/precommit_checker.py " . - escapeshellarg($test) . "; exit_code=$?; "; - - if ($applyDiff) { - $cmd = $cmd . "([[ \$exit_code -eq 0 ]] &&" - . buildUpdateTestStatusCmd($diffID, $test, "pass") . ")" - . "||" . buildUpdateTestStatusCmd($diffID, $test, "fail") - . "; "; - } - - // shell command to sort the tests based on exit code and print - // the output of the log files. 
- $cat_sorted_logs = " - while read code log_file; - do echo \"################ cat \$log_file [exit_code : \$code] ################\"; - cat \$log_file; - done < <(tail -n +2 LOG | sort -k7,7n -k4,4gr | awk '{print \$7,\$NF}')"; - - // Shell command to cat all log files - $cat_all_logs = "for f in `ls t/!(run-*)`; do echo \$f;cat \$f; done"; - - // If LOG file exist use it to cat log files sorted by exit code, otherwise - // cat everything - $logs_cmd = "if [ -f LOG ]; then {$cat_sorted_logs}; else {$cat_all_logs}; fi"; - - $cmd = $cmd . " cat /tmp/precommit-check.log" - . "; shopt -s extglob; {$logs_cmd}" - . "; shopt -u extglob; [[ \$exit_code -eq 0 ]]"; - assert(strlen($cmd) > 0); - - $run_test = array( - "name" => "Run " . $test, - "shell" => $cmd, - "user" => "root", - "parser" => "python build_tools/error_filter.py " . escapeshellarg($test), - ); - - $steps[] = $run_test; - - if ($applyDiff) { - // Clean up the user arc config we are using. - $cleanup = array( - "name" => "Arc cleanup", - "shell" => "rm -f ~/.arcrc", - "user" => "root" - ); - - $steps[] = $cleanup; - } - - assert(count($steps) > 0); - return $steps; -} - -function getSandcastleConfig() { - $sandcastle_config = array(); - - $cwd = getcwd(); - $cwd_token_file = "{$cwd}/.sandcastle"; - // This is a case when we're executed from a continuous run. Fetch the values - // from the environment. - if (getenv(ENV_POST_RECEIVE_HOOK)) { - $sandcastle_config[0] = getenv(ENV_HTTPS_APP_VALUE); - $sandcastle_config[1] = getenv(ENV_HTTPS_TOKEN_VALUE); - } else { - // This is a typical `[p]arc diff` case. Fetch the values from the specific - // configuration files. - for ($i = 0; $i < 50; $i++) { - if (file_exists(PRIMARY_TOKEN_FILE) || - file_exists($cwd_token_file)) { - break; - } - // If we failed to fetch the tokens, sleep for 0.2 second and try again - usleep(200000); - } - assert(file_exists(PRIMARY_TOKEN_FILE) || - file_exists($cwd_token_file)); - - // Try the primary location first, followed by a secondary. - if (file_exists(PRIMARY_TOKEN_FILE)) { - $cmd = 'cat ' . PRIMARY_TOKEN_FILE; - } else { - $cmd = 'cat ' . escapeshellarg($cwd_token_file); - } - - assert(strlen($cmd) > 0); - $sandcastle_config = explode(':', rtrim(shell_exec($cmd))); - } - - // In this case be very explicit about the implications. - if (count($sandcastle_config) != 2) { - echo "Sandcastle configuration files don't contain valid information " . - "or the necessary environment variables aren't defined. Unable " . - "to validate the code changes."; - exit(1); - } - - assert(strlen($sandcastle_config[0]) > 0); - assert(strlen($sandcastle_config[1]) > 0); - assert(count($sandcastle_config) > 0); - - return $sandcastle_config; -} - -// This function can be called either from `[p]arc diff` command or during -// the Git post-receive hook. - function startTestsInSandcastle($applyDiff, $workflow, $diffID) { - // Default options don't terminate on failure, but that's what we want. In - // the current case we use assertions intentionally as "terminate on failure - // invariants". - assert_options(ASSERT_BAIL, true); - - // In case of a diff we'll send notificatios to the author. Else it'll go to - // the entire team because failures indicate that build quality has regressed. - $username = $applyDiff ? exec("whoami") : CONT_RUN_ALIAS; - assert(strlen($username) > 0); - - if ($applyDiff) { - assert($workflow); - assert(strlen($diffID) > 0); - assert(is_numeric($diffID)); - } - - // List of tests we want to run in Sandcastle. 
- $tests = array("unit", "unit_non_shm", "unit_481", "clang_unit", "tsan", - "asan", "lite_test", "valgrind", "release", "release_481", - "clang_release", "clang_analyze", "code_cov", - "java_build", "no_compression", "unity", "ubsan"); - - $send_email_template = array( - 'type' => 'email', - 'triggers' => array('fail'), - 'emails' => array($username . '@fb.com'), - ); - - // Construct a job definition for each test and add it to the master plan. - foreach ($tests as $test) { - $stepName = "RocksDB diff " . $diffID . " test " . $test; - - if (!$applyDiff) { - $stepName = "RocksDB continuous integration test " . $test; - } - - $arg[] = array( - "name" => $stepName, - "report" => array($send_email_template), - "steps" => getSteps($applyDiff, $diffID, $username, $test) - ); - } - - // We cannot submit the parallel execution master plan to Sandcastle and - // need supply the job plan as a determinator. So we construct a small job - // that will spit out the master job plan which Sandcastle will parse and - // execute. Why compress the job definitions? Otherwise we run over the max - // string size. - $cmd = "echo " . base64_encode(json_encode($arg)) - . (PHP_OS == "Darwin" ? - " | gzip -f | base64" : - " | gzip -f | base64 -w0"); - assert(strlen($cmd) > 0); - - $arg_encoded = shell_exec($cmd); - assert(strlen($arg_encoded) > 0); - - $runName = "Run diff " . $diffID . "for user " . $username; - - if (!$applyDiff) { - $runName = "RocksDB continuous integration build and test run"; - } - - $command = array( - "name" => $runName, - "steps" => array() - ); - - $command["steps"][] = array( - "name" => "Generate determinator", - "shell" => "echo " . $arg_encoded . " | base64 --decode | gzip -d" - . " | base64 --decode", - "determinator" => true, - "user" => "root" - ); - - // Submit to Sandcastle. - $url = 'https://interngraph.intern.facebook.com/sandcastle/create'; - - $job = array( - 'command' => 'SandcastleUniversalCommand', - 'args' => $command, - 'capabilities' => array( - 'vcs' => 'rocksdb-int-git', - 'type' => 'lego', - ), - 'hash' => 'origin/master', - 'user' => $username, - 'alias' => 'rocksdb-precommit', - 'tags' => array('rocksdb'), - 'description' => 'Rocksdb precommit job', - ); - - // Fetch the configuration necessary to submit a successful HTTPS request. - $sandcastle_config = getSandcastleConfig(); - - $app = $sandcastle_config[0]; - $token = $sandcastle_config[1]; - - $cmd = 'curl -s -k ' - . ' -F app=' . escapeshellarg($app) - . ' -F token=' . escapeshellarg($token) - . ' -F job=' . escapeshellarg(json_encode($job)) - .' ' . escapeshellarg($url); - - $output = shell_exec($cmd); - assert(strlen($output) > 0); - - // Extract Sandcastle URL from the response. - preg_match('/url": "(.+)"/', $output, $sandcastle_url); - - assert(count($sandcastle_url) > 0, "Unable to submit Sandcastle request."); - assert(strlen($sandcastle_url[1]) > 0, "Unable to extract Sandcastle URL."); - - if ($applyDiff) { - echo "\nSandcastle URL: " . $sandcastle_url[1] . "\n"; - // Ask Phabricator to display it on the diff UI. - postURL($diffID, $sandcastle_url[1]); - } else { - echo "Continuous integration started Sandcastle tests. You can look at "; - echo "the progress at:\n" . $sandcastle_url[1] . "\n"; - } -} - -// Continuous run cript will set the environment variable and based on that -// we'll trigger the execution of tests in Sandcastle. In that case we don't -// need to apply any diffs and there's no associated workflow either. 
-if (getenv(ENV_POST_RECEIVE_HOOK)) { - startTestsInSandcastle( - false /* $applyDiff */, - NULL /* $workflow */, - NULL /* $diffID */); -} diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 057f77ec531..45fdbe258ba 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -56,10 +56,10 @@ if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then if [ -n "$ROCKSDB_FBCODE_BUILD_WITH_481" ]; then # we need this to build with MySQL. Don't use for other purposes. source "$PWD/build_tools/fbcode_config4.8.1.sh" - elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM007" ]; then - source "$PWD/build_tools/fbcode_config_platform007.sh" - else + elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_5xx" ]; then source "$PWD/build_tools/fbcode_config.sh" + else + source "$PWD/build_tools/fbcode_config_platform007.sh" fi fi @@ -150,6 +150,9 @@ case "$TARGET_OS" in PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic" fi PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" + if test -z "$USE_FOLLY_DISTRIBUTED_MUTEX"; then + USE_FOLLY_DISTRIBUTED_MUTEX=1 + fi # PORT_FILES=port/linux/linux_specific.cc ;; SunOS) @@ -397,6 +400,7 @@ EOF #include int main() { size_t res = malloc_usable_size(0); + (void)res; return 0; } EOF @@ -411,6 +415,7 @@ EOF #include int main() { int x = PTHREAD_MUTEX_ADAPTIVE_NP; + (void)x; return 0; } EOF @@ -422,7 +427,7 @@ EOF if ! test $ROCKSDB_DISABLE_BACKTRACE; then # Test whether backtrace is available $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <> + #include int main() { void* frames[1]; backtrace_symbols(frames, backtrace(frames, 1)); @@ -480,6 +485,7 @@ EOF #include int main() { int cpuid = sched_getcpu(); + (void)cpuid; } EOF if [ "$?" = 0 ]; then @@ -515,7 +521,7 @@ fi if test "$USE_HDFS"; then if test -z "$JAVA_HOME"; then - echo "JAVA_HOME has to be set for HDFS usage." + echo "JAVA_HOME has to be set for HDFS usage." >&2 exit 1 fi HDFS_CCFLAGS="$HDFS_CCFLAGS -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -DUSE_HDFS -I$HADOOP_HOME/include" @@ -527,42 +533,64 @@ if test "$USE_HDFS"; then JAVA_LDFLAGS="$JAVA_LDFLAGS $HDFS_LDFLAGS" fi -if test -z "$PORTABLE"; then +if test "0$PORTABLE" -eq 0; then if test -n "`echo $TARGET_ARCHITECTURE | grep ^ppc64`"; then # Tune for this POWER processor, treating '+' models as base models POWER=`LD_SHOW_AUXV=1 /bin/true | grep AT_PLATFORM | grep -E -o power[0-9]+` COMMON_FLAGS="$COMMON_FLAGS -mcpu=$POWER -mtune=$POWER " elif test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then COMMON_FLAGS="$COMMON_FLAGS -march=z10 " - elif test -n "`echo $TARGET_ARCHITECTURE | grep ^arm`"; then + elif test -n "`echo $TARGET_ARCHITECTURE | grep -e^arm -e^aarch64`"; then # TODO: Handle this with approprite options. 
COMMON_FLAGS="$COMMON_FLAGS" + elif test -n "`echo $TARGET_ARCHITECTURE | grep ^aarch64`"; then + COMMON_FLAGS="$COMMON_FLAGS" elif [ "$TARGET_OS" == "IOS" ]; then COMMON_FLAGS="$COMMON_FLAGS" - elif [ "$TARGET_OS" != "AIX" ] && [ "$TARGET_OS" != "SunOS" ]; then + elif [ "$TARGET_OS" == "AIX" ] || [ "$TARGET_OS" == "SunOS" ]; then + # TODO: Not sure why we don't use -march=native on these OSes + if test "$USE_SSE"; then + TRY_SSE_ETC="1" + fi + else COMMON_FLAGS="$COMMON_FLAGS -march=native " - elif test "$USE_SSE"; then - COMMON_FLAGS="$COMMON_FLAGS -msse4.2 -mpclmul" fi -elif test "$USE_SSE"; then - COMMON_FLAGS="$COMMON_FLAGS -msse4.2 -mpclmul" +else + # PORTABLE=1 + if test "$USE_SSE"; then + TRY_SSE_ETC="1" + fi fi -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() { volatile uint32_t x = _mm_crc32_u32(0, 0); + (void)x; } EOF if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DHAVE_SSE42" + COMMON_FLAGS="$COMMON_FLAGS $TRY_SSE42 -DHAVE_SSE42" elif test "$USE_SSE"; then - echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling" - exit 1 + echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling" >&2 fi -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() { @@ -570,13 +598,41 @@ $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <&2 +fi + +$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_AVX2 -x c++ - -o /dev/null 2>/dev/null < + #include + int main() { + const auto a = _mm256_setr_epi32(0, 1, 2, 3, 4, 7, 6, 5); + const auto b = _mm256_permutevar8x32_epi32(a, a); + (void)b; + } +EOF +if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS $TRY_AVX2 -DHAVE_AVX2" +elif test "$USE_SSE"; then + echo "warning: USE_SSE specified but compiler could not use AVX2 intrinsics, disabling" >&2 +fi + +$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() { + uint64_t a = 0xffffFFFFffffFFFF; + __uint128_t b = __uint128_t(a) * a; + a = static_cast(b >> 64); + (void)a; + } +EOF +if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DHAVE_UINT128_EXTENSION" fi # iOS doesn't support thread-local storage, but this check would erroneously @@ -589,6 +645,7 @@ if [ "$PLATFORM" != IOS ]; then #endif int main() { static __thread int tls; + (void)tls; } EOF if [ "$?" = 0 ]; then @@ -596,6 +653,19 @@ EOF fi fi +if [ "$FBCODE_BUILD" != "true" -a "$PLATFORM" = OS_LINUX ]; then + $CXX $COMMON_FLAGS $PLATFORM_SHARED_CFLAGS -x c++ -c - -o test_dl.o 2>/dev/null </dev/null + if [ "$?" = 0 ]; then + EXEC_LDFLAGS+="-ldl" + rm -f test_dl.o + fi + fi +fi + PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" @@ -640,3 +710,6 @@ if test -n "$WITH_JEMALLOC_FLAG"; then echo "WITH_JEMALLOC_FLAG=$WITH_JEMALLOC_FLAG" >> "$OUTPUT" fi echo "LUA_PATH=$LUA_PATH" >> "$OUTPUT" +if test -n "$USE_FOLLY_DISTRIBUTED_MUTEX"; then + echo "USE_FOLLY_DISTRIBUTED_MUTEX=$USE_FOLLY_DISTRIBUTED_MUTEX" >> "$OUTPUT" +fi diff --git a/build_tools/cont_integration.sh b/build_tools/cont_integration.sh deleted file mode 100755 index 66d25522785..00000000000 --- a/build_tools/cont_integration.sh +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env bash -# -# Copyright (c) 2016, Facebook. All rights reserved. -# -# Overall wrapper script for RocksDB continuous builds. The implementation is a -# trivial pulling scheme. 
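build_detect_platform keeps probing CPU and toolchain features by piping a tiny test program into the compiler and appending a -DHAVE_* define only when compilation succeeds; the new AVX2 and __uint128_t probes follow the same pattern as the existing SSE4.2 one. A rough Python rendering of that probe loop, assuming a g++-style compiler on PATH (the flag names mirror the script, the helper itself is illustrative):

```python
# Rough model of the compile-and-probe pattern in build_detect_platform:
# feed a tiny program to the compiler on stdin and add the define only
# if it compiles. Compiler name and flags are assumptions for this sketch.
import subprocess

def probe(compiler, extra_flags, program):
    cmd = [compiler] + extra_flags + ["-x", "c++", "-", "-o", "/dev/null"]
    result = subprocess.run(cmd, input=program.encode(),
                            stderr=subprocess.DEVNULL)
    return result.returncode == 0

common_flags = []
avx2_program = """
#include <cstdint>
#include <immintrin.h>
int main() {
  const auto a = _mm256_setr_epi32(0, 1, 2, 3, 4, 7, 6, 5);
  const auto b = _mm256_permutevar8x32_epi32(a, a);
  (void)b;
}
"""
if probe("g++", ["-mavx2"], avx2_program):
    common_flags += ["-mavx2", "-DHAVE_AVX2"]
print(common_flags)
```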
We loop infinitely, check if any new changes have been -# committed, if yes then trigger a Sandcastle run, and finally go to sleep again -# for a certain interval. -# - -SRC_GIT_REPO=/data/git/rocksdb-public -error=0 - -function log { - DATE=`date +%Y-%m-%d:%H:%M:%S` - # shellcheck disable=SC2068 - echo $DATE $@ -} - -function log_err { - # shellcheck disable=SC2145 - log "ERROR: $@ Error code: $error." -} - -function update_repo_status { - # Update the parent first. - pushd $SRC_GIT_REPO - - # This is a fatal error. Something in the environment isn't right and we will - # terminate the execution. - error=$? - if [ ! $error -eq 0 ]; then - log_err "Where is $SRC_GIT_REPO?" - exit $error - fi - - HTTPS_PROXY=fwdproxy:8080 git fetch -f - - error=$? - if [ ! $error -eq 0 ]; then - log_err "git fetch -f failed." - popd - return $error - fi - - git update-ref refs/heads/master refs/remotes/origin/master - - error=$? - if [ ! $error -eq 0 ]; then - log_err "git update-ref failed." - popd - return $error - fi - - popd - - # We're back in an instance-specific directory. Get the latest changes. - git pull --rebase - - error=$? - if [ ! $error -eq 0 ]; then - log_err "git pull --rebase failed." - return $error - fi -} - -# -# Execution starts here. -# - -# Path to the determinator from the root of the RocksDB repo. -CONTRUN_DETERMINATOR=./build_tools/RocksDBCommonHelper.php - -# Value of the previous commit. -PREV_COMMIT= - -log "Starting to monitor for new RocksDB changes ..." -log "Running under `pwd` as `whoami`." - -# Paranoia. Make sure that we're using the right branch. -git checkout master - -error=$? -if [ ! $error -eq 0 ]; then - log_err "This is not good. Can't checkout master. Bye-bye!" - exit 1 -fi - -# We'll run forever and let the execution environment terminate us if we'll -# exceed whatever timeout is set for the job. -while true; -do - # Get the latest changes committed. - update_repo_status - - error=$? - if [ $error -eq 0 ]; then - LAST_COMMIT=`git log -1 | head -1 | grep commit | awk '{ print $2; }'` - - log "Last commit is '$LAST_COMMIT', previous commit is '$PREV_COMMIT'." - - if [ "$PREV_COMMIT" == "$LAST_COMMIT" ]; then - log "There were no changes since the last time I checked. Going to sleep." - else - if [ ! -z "$LAST_COMMIT" ]; then - log "New code has been committed or previous commit not known. " \ - "Will trigger the tests." - - PREV_COMMIT=$LAST_COMMIT - log "Updated previous commit to '$PREV_COMMIT'." - - # - # This is where we'll trigger the Sandcastle run. The values for - # HTTPS_APP_VALUE and HTTPS_APP_VALUE will be set in the container we're - # running in. - # - POST_RECEIVE_HOOK=1 php $CONTRUN_DETERMINATOR - - error=$? - if [ $error -eq 0 ]; then - log "Sandcastle run successfully triggered." - else - log_err "Failed to trigger Sandcastle run." - fi - else - log_err "Previous commit not updated. Don't know what the last one is." - fi - fi - else - log_err "Getting latest changes failed. Will skip running tests for now." - fi - - # Always sleep, even if errors happens while trying to determine the latest - # commit. This will prevent us terminating in case of transient errors. - log "Will go to sleep for 5 minutes." - sleep 5m -done diff --git a/build_tools/dependencies.sh b/build_tools/dependencies.sh index c6e074b6460..22454c76fce 100644 --- a/build_tools/dependencies.sh +++ b/build_tools/dependencies.sh @@ -1,19 +1,19 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
-GCC_BASE=/mnt/gvfs/third-party2/gcc/112ec378fec7002ad3e09afde022e656049f7191/5.x/centos7-native/c447969 -CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/04999bdb3ce81a11073535dcb00b5e13dc1cbaf5/stable/centos7-native/c9f9104 -LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/92b0c8e5c8eecc71eb042594ce1ab3413799b385/5.x/gcc-5-glibc-2.23/339d858 -GLIBC_BASE=/mnt/gvfs/third-party2/glibc/3d8698d5973ba94f41620a80a67e4457fdf01e90/2.23/gcc-5-glibc-2.23/ca1d1c0 +GCC_BASE=/mnt/gvfs/third-party2/gcc/7331085db891a2ef4a88a48a751d834e8d68f4cb/5.x/centos7-native/c447969 +CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/1bd23f9917738974ad0ff305aa23eb5f93f18305/9.0.0/centos7-native/c9f9104 +LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/6ace84e956873d53638c738b6f65f3f469cca74c/5.x/gcc-5-glibc-2.23/339d858 +GLIBC_BASE=/mnt/gvfs/third-party2/glibc/192b0f42d63dcf6210d6ceae387b49af049e6e0c/2.23/gcc-5-glibc-2.23/ca1d1c0 SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/7f9bdaada18f59bc27ec2b0871eb8a6144343aef/1.1.3/gcc-5-glibc-2.23/9bc6787 -ZLIB_BASE=/mnt/gvfs/third-party2/zlib/22c2d65676fb7c23cfa797c4f6937f38b026f3cf/1.2.8/gcc-5-glibc-2.23/9bc6787 +ZLIB_BASE=/mnt/gvfs/third-party2/zlib/2d9f0b9a4274cc21f61272a9e89bdb859bce8f1f/1.2.8/gcc-5-glibc-2.23/9bc6787 BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/dc49a21c5fceec6456a7a28a94dcd16690af1337/1.0.6/gcc-5-glibc-2.23/9bc6787 -LZ4_BASE=/mnt/gvfs/third-party2/lz4/907b498203d297947f3bb70b9466f47e100f1873/r131/gcc-5-glibc-2.23/9bc6787 -ZSTD_BASE=/mnt/gvfs/third-party2/zstd/af6628a46758f1a15484a1760cd7294164bc5ba1/1.3.5/gcc-5-glibc-2.23/03859b5 +LZ4_BASE=/mnt/gvfs/third-party2/lz4/0f607f8fc442ea7d6b876931b1898bb573d5e5da/1.9.1/gcc-5-glibc-2.23/9bc6787 +ZSTD_BASE=/mnt/gvfs/third-party2/zstd/ca22bc441a4eb709e9e0b1f9fec9750fed7b31c5/1.4.x/gcc-5-glibc-2.23/03859b5 GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/0b9929d2588991c65a57168bf88aff2db87c5d48/2.2.0/gcc-5-glibc-2.23/9bc6787 -JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/b1a0e56c1e3e6929813a4331ade3a58ff083afbb/master/gcc-5-glibc-2.23/aa64d6b -NUMA_BASE=/mnt/gvfs/third-party2/numa/9cbf2460284c669ed19c3ccb200a71f7dd7e53c7/2.0.11/gcc-5-glibc-2.23/9bc6787 -LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/bf3d7497fe4e6d007354f0adffa16ce3003f8338/1.3/gcc-5-glibc-2.23/b443de1 -TBB_BASE=/mnt/gvfs/third-party2/tbb/ff4e0b093534704d8abab678a4fd7f5ea7b094c7/2018_U5/gcc-5-glibc-2.23/9bc6787 -KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/b5c4a61a5c483ba24722005ae07895971a2ac707/4.0.9-36_fbk5_2933_gd092e3f/gcc-5-glibc-2.23/da39a3e -BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/55031de95a2b46c82948743419a603b3d6aefe28/2.29.1/centos7-native/da39a3e -VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/f3f697a28122e6bcd513273dd9c1ff23852fc59f/3.13.0/gcc-5-glibc-2.23/9bc6787 +JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/c26f08f47ac35fc31da2633b7da92d6b863246eb/master/gcc-5-glibc-2.23/0c8f76d +NUMA_BASE=/mnt/gvfs/third-party2/numa/3f3fb57a5ccc5fd21c66416c0b83e0aa76a05376/2.0.11/gcc-5-glibc-2.23/9bc6787 +LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/40c73d874898b386a71847f1b99115d93822d11f/1.4/gcc-5-glibc-2.23/b443de1 +TBB_BASE=/mnt/gvfs/third-party2/tbb/4ce8e8dba77cdbd81b75d6f0c32fd7a1b76a11ec/2018_U5/gcc-5-glibc-2.23/9bc6787 +KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/fb251ecd2f5ae16f8671f7014c246e52a748fe0b/4.0.9-36_fbk5_2933_gd092e3f/gcc-5-glibc-2.23/da39a3e +BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/2e3cb7d119b3cea5f1e738cc13a1ac69f49eb875/2.29.1/centos7-native/da39a3e 
+VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/d42d152a15636529b0861ec493927200ebebca8e/3.15.0/gcc-5-glibc-2.23/9bc6787 LUA_BASE=/mnt/gvfs/third-party2/lua/f0cd714433206d5139df61659eb7b28b1dea6683/5.2.3/gcc-5-glibc-2.23/65372bd diff --git a/build_tools/dependencies_platform007.sh b/build_tools/dependencies_platform007.sh index 004bccb5365..1de8e785a80 100644 --- a/build_tools/dependencies_platform007.sh +++ b/build_tools/dependencies_platform007.sh @@ -1,19 +1,19 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -GCC_BASE=/mnt/gvfs/third-party2/gcc/6e8e715624fd15256a7970073387793dfcf79b46/7.x/centos7-native/b2ef2b6 -CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/ef37e1faa1c29782abfac1ae65a291b9b7966f6d/stable/centos7-native/c9f9104 -LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/c67031f0f739ac61575a061518d6ef5038f99f90/7.x/platform007/5620abc -GLIBC_BASE=/mnt/gvfs/third-party2/glibc/60d6f124a78798b73944f5ba87c2306ae3460153/2.26/platform007/f259413 +GCC_BASE=/mnt/gvfs/third-party2/gcc/7331085db891a2ef4a88a48a751d834e8d68f4cb/7.x/centos7-native/b2ef2b6 +CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/963d9aeda70cc4779885b1277484fe7544a04e3e/9.0.0/platform007/9e92d53/ +LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/6ace84e956873d53638c738b6f65f3f469cca74c/7.x/platform007/5620abc +GLIBC_BASE=/mnt/gvfs/third-party2/glibc/192b0f42d63dcf6210d6ceae387b49af049e6e0c/2.26/platform007/f259413 SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/7f9bdaada18f59bc27ec2b0871eb8a6144343aef/1.1.3/platform007/ca4da3d -ZLIB_BASE=/mnt/gvfs/third-party2/zlib/22c2d65676fb7c23cfa797c4f6937f38b026f3cf/1.2.8/platform007/ca4da3d +ZLIB_BASE=/mnt/gvfs/third-party2/zlib/2d9f0b9a4274cc21f61272a9e89bdb859bce8f1f/1.2.8/platform007/ca4da3d BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/dc49a21c5fceec6456a7a28a94dcd16690af1337/1.0.6/platform007/ca4da3d -LZ4_BASE=/mnt/gvfs/third-party2/lz4/907b498203d297947f3bb70b9466f47e100f1873/r131/platform007/ca4da3d -ZSTD_BASE=/mnt/gvfs/third-party2/zstd/3ee276cbacfad3074e3f07bf826ac47f06970f4e/1.3.5/platform007/15a3614 +LZ4_BASE=/mnt/gvfs/third-party2/lz4/0f607f8fc442ea7d6b876931b1898bb573d5e5da/1.9.1/platform007/ca4da3d +ZSTD_BASE=/mnt/gvfs/third-party2/zstd/ca22bc441a4eb709e9e0b1f9fec9750fed7b31c5/1.4.x/platform007/15a3614 GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/0b9929d2588991c65a57168bf88aff2db87c5d48/2.2.0/platform007/ca4da3d -JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/9c910d36d6235cc40e8ff559358f1833452300ca/master/platform007/5b0f53e -NUMA_BASE=/mnt/gvfs/third-party2/numa/9cbf2460284c669ed19c3ccb200a71f7dd7e53c7/2.0.11/platform007/ca4da3d -LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/bf3d7497fe4e6d007354f0adffa16ce3003f8338/1.3/platform007/6f3e0a9 -TBB_BASE=/mnt/gvfs/third-party2/tbb/ff4e0b093534704d8abab678a4fd7f5ea7b094c7/2018_U5/platform007/ca4da3d -KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/b5c4a61a5c483ba24722005ae07895971a2ac707/fb/platform007/da39a3e -BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/92ff90349e2f43ea0a8246d8b1cf17b6869013e3/2.29.1/centos7-native/da39a3e -VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/f3f697a28122e6bcd513273dd9c1ff23852fc59f/3.13.0/platform007/ca4da3d +JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/c26f08f47ac35fc31da2633b7da92d6b863246eb/master/platform007/c26c002 +NUMA_BASE=/mnt/gvfs/third-party2/numa/3f3fb57a5ccc5fd21c66416c0b83e0aa76a05376/2.0.11/platform007/ca4da3d +LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/40c73d874898b386a71847f1b99115d93822d11f/1.4/platform007/6f3e0a9 
+TBB_BASE=/mnt/gvfs/third-party2/tbb/4ce8e8dba77cdbd81b75d6f0c32fd7a1b76a11ec/2018_U5/platform007/ca4da3d +KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/fb251ecd2f5ae16f8671f7014c246e52a748fe0b/fb/platform007/da39a3e +BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/ab9f09bba370e7066cafd4eb59752db93f2e8312/2.29.1/platform007/15a3614 +VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/d42d152a15636529b0861ec493927200ebebca8e/3.15.0/platform007/ca4da3d LUA_BASE=/mnt/gvfs/third-party2/lua/f0cd714433206d5139df61659eb7b28b1dea6683/5.3.4/platform007/5007832 diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh index 4415f87da38..4834be5f4c1 100644 --- a/build_tools/fbcode_config.sh +++ b/build_tools/fbcode_config.sh @@ -86,9 +86,10 @@ else fi CFLAGS+=" -DTBB" -# use Intel SSE support for checksum calculations -export USE_SSE=1 -export PORTABLE=1 +test "$USE_SSE" || USE_SSE=1 +export USE_SSE +test "$PORTABLE" || PORTABLE=1 +export PORTABLE BINUTILS="$BINUTILS_BASE/bin" AR="$BINUTILS/ar" @@ -159,4 +160,6 @@ else LUA_LIB=" $LUA_PATH/lib/liblua_pic.a" fi +USE_FOLLY_DISTRIBUTED_MUTEX=1 + export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB diff --git a/build_tools/fbcode_config4.8.1.sh b/build_tools/fbcode_config4.8.1.sh index 80fbdf431b9..5f0813a041e 100644 --- a/build_tools/fbcode_config4.8.1.sh +++ b/build_tools/fbcode_config4.8.1.sh @@ -53,9 +53,10 @@ LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind.a" TBB_INCLUDE=" -isystem $TBB_BASE/include/" TBB_LIBS="$TBB_BASE/lib/libtbb.a" -# use Intel SSE support for checksum calculations -export USE_SSE=1 -export PORTABLE=1 +test "$USE_SSE" || USE_SSE=1 +export USE_SSE +test "$PORTABLE" || PORTABLE=1 +export PORTABLE BINUTILS="$BINUTILS_BASE/bin" AR="$BINUTILS/ar" diff --git a/build_tools/fbcode_config_platform007.sh b/build_tools/fbcode_config_platform007.sh index 1a1e4208139..51edf134fcb 100644 --- a/build_tools/fbcode_config_platform007.sh +++ b/build_tools/fbcode_config_platform007.sh @@ -86,9 +86,10 @@ else fi CFLAGS+=" -DTBB" -# use Intel SSE support for checksum calculations -export USE_SSE=1 -export PORTABLE=1 +test "$USE_SSE" || USE_SSE=1 +export USE_SSE +test "$PORTABLE" || PORTABLE=1 +export PORTABLE BINUTILS="$BINUTILS_BASE/bin" AR="$BINUTILS/ar" @@ -155,4 +156,6 @@ VALGRIND_VER="$VALGRIND_BASE/bin/" LUA_PATH= LUA_LIB= +USE_FOLLY_DISTRIBUTED_MUTEX=1 + export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh index f8fbb64fbae..4c0f0e5d5d2 100755 --- a/build_tools/format-diff.sh +++ b/build_tools/format-diff.sh @@ -54,15 +54,25 @@ fi set -e uncommitted_code=`git diff HEAD` -LAST_MASTER=`git merge-base master HEAD` # If there's no uncommitted changes, we assume user are doing post-commit -# format check, in which case we'll check the modified lines since last commit -# from master. Otherwise, we'll check format of the uncommitted code only. +# format check, in which case we'll try to check the modified lines vs. the +# facebook/rocksdb.git master branch. Otherwise, we'll check format of the +# uncommitted code only. if [ -z "$uncommitted_code" ] then - # Check the format of last commit - diffs=$(git diff -U0 $LAST_MASTER^ | $CLANG_FORMAT_DIFF -p 1) + # Attempt to get name of facebook/rocksdb.git remote. 
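The fbcode_config*.sh scripts now default USE_SSE and PORTABLE to 1 only when the caller has not already set them, instead of forcing the values unconditionally. Roughly the same "default only when unset" idea, expressed with environment variables (the variable names come from the scripts, the helper is illustrative):

```python
# Roughly mirrors:  test "$USE_SSE" || USE_SSE=1; export USE_SSE
# os.environ.setdefault leaves any caller-provided value untouched.
import os

os.environ.setdefault("USE_SSE", "1")
os.environ.setdefault("PORTABLE", "1")
print(os.environ["USE_SSE"], os.environ["PORTABLE"])
```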
+ [ "$FORMAT_REMOTE" ] || FORMAT_REMOTE="$(git remote -v | grep 'facebook/rocksdb.git' | head -n 1 | cut -f 1)" + # Fall back on 'origin' if that fails + [ "$FORMAT_REMOTE" ] || FORMAT_REMOTE=origin + # Use master branch from that remote + [ "$FORMAT_UPSTREAM" ] || FORMAT_UPSTREAM="$FORMAT_REMOTE/master" + # Get the common ancestor with that remote branch. Everything after that + # common ancestor would be considered the contents of a pull request, so + # should be relevant for formatting fixes. + FORMAT_UPSTREAM_MERGE_BASE="$(git merge-base "$FORMAT_UPSTREAM" HEAD)" + # Get the differences + diffs=$(git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -p 1) else # Check the format of uncommitted lines, diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1) @@ -76,12 +86,12 @@ fi # Highlight the insertion/deletion from the clang-format-diff.py's output COLOR_END="\033[0m" -COLOR_RED="\033[0;31m" -COLOR_GREEN="\033[0;32m" +COLOR_RED="\033[0;31m" +COLOR_GREEN="\033[0;32m" echo -e "Detect lines that doesn't follow the format rules:\r" # Add the color to the diff. lines added will be green; lines removed will be red. -echo "$diffs" | +echo "$diffs" | sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" | sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/" @@ -104,7 +114,7 @@ fi # Do in-place format adjustment. if [ -z "$uncommitted_code" ] then - git diff -U0 $LAST_MASTER^ | $CLANG_FORMAT_DIFF -i -p 1 + git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -i -p 1 else git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1 fi diff --git a/build_tools/rocksdb-lego-determinator b/build_tools/rocksdb-lego-determinator index 2447a19ae44..f4ea9ca346e 100755 --- a/build_tools/rocksdb-lego-determinator +++ b/build_tools/rocksdb-lego-determinator @@ -63,6 +63,21 @@ CLEANUP_ENV=" 'user':'root' }" +UPLOAD_DB_DIR=" +{ + 'name':'Upload database directory', + 'shell':'tar -cvzf rocksdb_db.tar.gz /dev/shm/rocksdb/', + 'user':'root', + 'cleanup':true, + 'provide_artifacts': [ + { + 'name':'rocksdb_db_dir', + 'paths': ['rocksdb_db.tar.gz'], + 'bundle': false, + }, + ], +}" + # We will eventually set the RATIO to 1, but we want do this # in steps. 
RATIO=$(nproc) will make it work as J=1 if [ -z $RATIO ]; then @@ -109,13 +124,6 @@ else TASK_CREATION_TOOL="false" fi -ARTIFACTS=" 'artifacts': [ - { - 'name':'database', - 'paths':[ '/dev/shm/rocksdb' ], - } -]" - # # A mechanism to disable tests temporarily # @@ -140,6 +148,7 @@ UNIT_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -160,6 +169,7 @@ UNIT_TEST_NON_SHM_COMMANDS="[ { 'name':'Rocksdb Unit Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -182,6 +192,7 @@ RELEASE_BUILD_COMMANDS="[ { 'name':'Rocksdb Release Build', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -202,6 +213,7 @@ UNIT_TEST_COMMANDS_481="[ { 'name':'Rocksdb Unit Test on GCC 4.8.1', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -222,6 +234,7 @@ RELEASE_BUILD_COMMANDS_481="[ { 'name':'Rocksdb Release on GCC 4.8.1', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -242,6 +255,7 @@ CLANG_UNIT_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -262,6 +276,7 @@ CLANG_RELEASE_BUILD_COMMANDS="[ { 'name':'Rocksdb CLANG Release Build', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -282,6 +297,7 @@ CLANG_ANALYZE_COMMANDS="[ { 'name':'Rocksdb analyze', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -302,6 +318,7 @@ CODE_COV_COMMANDS="[ { 'name':'Rocksdb Unit Test Code Coverage', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -322,6 +339,7 @@ UNITY_COMMANDS="[ { 'name':'Rocksdb Unity', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -342,6 +360,7 @@ LITE_BUILD_COMMANDS="[ { 'name':'Rocksdb Lite build', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -361,6 +380,7 @@ REPORT_LITE_BINARY_SIZE_COMMANDS="[ { 'name':'Rocksdb Lite Binary Size', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -376,8 +396,9 @@ REPORT_LITE_BINARY_SIZE_COMMANDS="[ # STRESS_CRASH_TEST_COMMANDS="[ { - 'name':'Rocksdb Stress/Crash Test', + 'name':'Rocksdb Stress and Crash Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -395,7 +416,6 @@ STRESS_CRASH_TEST_COMMANDS="[ $PARSER } ], - $ARTIFACTS, $REPORT } ]" @@ -405,8 +425,9 @@ STRESS_CRASH_TEST_COMMANDS="[ # STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb Stress/Crash Test (atomic flush)', + 'name':'Rocksdb Stress and Crash Test with atomic flush', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -422,9 +443,9 @@ STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER - } + }, + $UPLOAD_DB_DIR, ], - $ARTIFACTS, $REPORT } ]" @@ -436,6 +457,7 @@ WRITE_STRESS_COMMANDS="[ { 'name':'Rocksdb Write Stress Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -458,6 +480,7 @@ ASAN_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test under ASAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -478,6 +501,7 @@ ASAN_CRASH_TEST_COMMANDS="[ { 'name':'Rocksdb crash test under ASAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ 
$CLEANUP_ENV, @@ -498,8 +522,9 @@ ASAN_CRASH_TEST_COMMANDS="[ # ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb crash test (atomic flush) under ASAN', + 'name':'Rocksdb crash test with atomic flush under ASAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -510,6 +535,7 @@ ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ 'user':'root', $PARSER }, + $UPLOAD_DB_DIR, ], $REPORT } @@ -522,11 +548,12 @@ UBSAN_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test under UBSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { 'name':'Test RocksDB debug under UBSAN', - 'shell':'set -o pipefail && $SHM $UBSAN $DEBUG make $PARALLELISM ubsan_check || $CONTRUN_NAME=ubsan_check $TASK_CREATION_TOOL', + 'shell':'set -o pipefail && $SHM $UBSAN $CLANG $DEBUG make $PARALLELISM ubsan_check || $CONTRUN_NAME=ubsan_check $TASK_CREATION_TOOL', 'user':'root', $PARSER } @@ -542,13 +569,14 @@ UBSAN_CRASH_TEST_COMMANDS="[ { 'name':'Rocksdb crash test under UBSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug ubsan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL', + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -562,18 +590,20 @@ UBSAN_CRASH_TEST_COMMANDS="[ # UBSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb crash test (atomic flush) under UBSAN', + 'name':'Rocksdb crash test with atomic flush under UBSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug ubsan_crash_test_with_atomic_flush', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 ubsan_crash_test_with_atomic_flush || $CONTRUN_NAME=ubsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 ubsan_crash_test_with_atomic_flush || $CONTRUN_NAME=ubsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER }, + $UPLOAD_DB_DIR, ], $REPORT } @@ -586,6 +616,7 @@ VALGRIND_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test under valgrind', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -608,6 +639,7 @@ TSAN_UNIT_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test under TSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -630,6 +662,7 @@ TSAN_CRASH_TEST_COMMANDS="[ { 'name':'Rocksdb Crash Test under TSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -652,6 +685,7 @@ TSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { 'name':'Rocksdb Crash Test with atomic flush under TSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -662,6 +696,7 @@ TSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ 'user':'root', $PARSER }, + $UPLOAD_DB_DIR, ], $REPORT } @@ -684,6 +719,7 @@ FORMAT_COMPATIBLE_COMMANDS="[ { 'name':'Rocksdb Format Compatible tests', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -717,6 +753,7 @@ NO_COMPRESSION_COMMANDS="[ { 'name':'Rocksdb No Compression tests', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -794,6 +831,7 @@ JAVA_BUILD_TEST_COMMANDS="[ { 'name':'Rocksdb Java Build', 'oncall':'$ONCALL', + 
'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { diff --git a/build_tools/update_dependencies.sh b/build_tools/update_dependencies.sh index 9af8c60d1d8..dbc95a6e545 100755 --- a/build_tools/update_dependencies.sh +++ b/build_tools/update_dependencies.sh @@ -6,6 +6,12 @@ BASEDIR=$(dirname $0) OUTPUT="" +function log_header() +{ + echo "# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved." >> "$OUTPUT" +} + + function log_variable() { echo "$1=${!1}" >> "$OUTPUT" @@ -69,6 +75,7 @@ echo "Writing dependencies to $OUTPUT" GCC_BASE=`readlink -f $TP2_LATEST/gcc/7.x/centos7-native/*/` CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/stable/centos7-native/*/` +log_header log_variable GCC_BASE log_variable CLANG_BASE @@ -108,6 +115,7 @@ echo "Writing dependencies to $OUTPUT" GCC_BASE=`readlink -f $TP2_LATEST/gcc/5.x/centos7-native/*/` CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/stable/centos7-native/*/` +log_header log_variable GCC_BASE log_variable CLANG_BASE @@ -147,6 +155,7 @@ echo "Writing 4.8.1 dependencies to $OUTPUT" GCC_BASE=`readlink -f $TP2_LATEST/gcc/4.8.1/centos6-native/*/` CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/stable/centos6-native/*/` +log_header log_variable GCC_BASE log_variable CLANG_BASE diff --git a/cache/cache_bench.cc b/cache/cache_bench.cc index 098813d9d74..288662ad9df 100644 --- a/cache/cache_bench.cc +++ b/cache/cache_bench.cc @@ -3,9 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif #ifndef GFLAGS #include int main() { @@ -14,9 +11,9 @@ int main() { } #else -#include -#include #include +#include +#include #include "port/port.h" #include "rocksdb/cache.h" diff --git a/cache/cache_test.cc b/cache/cache_test.cc index f9f77234cdb..a0f75bfdc4d 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -16,9 +16,9 @@ #include #include "cache/clock_cache.h" #include "cache/lru_cache.h" +#include "test_util/testharness.h" #include "util/coding.h" #include "util/string_util.h" -#include "util/testharness.h" namespace rocksdb { @@ -86,14 +86,22 @@ class CacheTest : public testing::TestWithParam { return nullptr; } - std::shared_ptr NewCache(size_t capacity, int num_shard_bits, - bool strict_capacity_limit) { + std::shared_ptr NewCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + CacheMetadataChargePolicy charge_policy = kDontChargeCacheMetadata) { auto type = GetParam(); if (type == kLRU) { - return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit); + LRUCacheOptions co; + co.capacity = capacity; + co.num_shard_bits = num_shard_bits; + co.strict_capacity_limit = strict_capacity_limit; + co.high_pri_pool_ratio = 0; + co.metadata_charge_policy = charge_policy; + return NewLRUCache(co); } if (type == kClock) { - return NewClockCache(capacity, num_shard_bits, strict_capacity_limit); + return NewClockCache(capacity, num_shard_bits, strict_capacity_limit, + charge_policy); } return nullptr; } @@ -143,10 +151,15 @@ class CacheTest : public testing::TestWithParam { }; CacheTest* CacheTest::current_; +class LRUCacheTest : public CacheTest {}; + TEST_P(CacheTest, UsageTest) { // cache is std::shared_ptr and will be automatically cleaned up. 
const uint64_t kCapacity = 100000; - auto cache = NewCache(kCapacity, 8, false); + auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata); + auto precise_cache = NewCache(kCapacity, 0, false, kFullChargeCacheMetadata); + ASSERT_EQ(0, cache->GetUsage()); + ASSERT_EQ(0, precise_cache->GetUsage()); size_t usage = 0; char value[10] = "abcdef"; @@ -155,31 +168,45 @@ TEST_P(CacheTest, UsageTest) { std::string key(i, 'a'); auto kv_size = key.size() + 5; cache->Insert(key, reinterpret_cast(value), kv_size, dumbDeleter); + precise_cache->Insert(key, reinterpret_cast(value), kv_size, + dumbDeleter); usage += kv_size; ASSERT_EQ(usage, cache->GetUsage()); + ASSERT_LT(usage, precise_cache->GetUsage()); } + cache->EraseUnRefEntries(); + precise_cache->EraseUnRefEntries(); + ASSERT_EQ(0, cache->GetUsage()); + ASSERT_EQ(0, precise_cache->GetUsage()); + // make sure the cache will be overloaded for (uint64_t i = 1; i < kCapacity; ++i) { auto key = ToString(i); cache->Insert(key, reinterpret_cast(value), key.size() + 5, dumbDeleter); + precise_cache->Insert(key, reinterpret_cast(value), key.size() + 5, + dumbDeleter); } // the usage should be close to the capacity ASSERT_GT(kCapacity, cache->GetUsage()); + ASSERT_GT(kCapacity, precise_cache->GetUsage()); ASSERT_LT(kCapacity * 0.95, cache->GetUsage()); + ASSERT_LT(kCapacity * 0.95, precise_cache->GetUsage()); } TEST_P(CacheTest, PinnedUsageTest) { // cache is std::shared_ptr and will be automatically cleaned up. - const uint64_t kCapacity = 100000; - auto cache = NewCache(kCapacity, 8, false); + const uint64_t kCapacity = 200000; + auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata); + auto precise_cache = NewCache(kCapacity, 8, false, kFullChargeCacheMetadata); size_t pinned_usage = 0; char value[10] = "abcdef"; std::forward_list unreleased_handles; + std::forward_list unreleased_handles_in_precise_cache; // Add entries. Unpin some of them after insertion. Then, pin some of them // again. Check GetPinnedUsage(). 
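The updated UsageTest relies on the fact that, under kFullChargeCacheMetadata, each entry's reported usage is its caller-supplied charge plus per-entry metadata, so the precise cache's GetUsage() exceeds the plain sum of charges. A toy Python model of that accounting; the 64-byte overhead constant is an arbitrary stand-in, not RocksDB's real handle size:

```python
# Toy model of usage accounting under the two metadata charge policies.
# META_OVERHEAD is an illustrative constant, not the real per-handle size.
DONT_CHARGE, FULL_CHARGE = "dont", "full"
META_OVERHEAD = 64

def usage(entries, policy):
    total = 0
    for key, charge in entries:
        total += charge
        if policy == FULL_CHARGE:
            total += META_OVERHEAD + len(key)  # handle struct + key allocation
    return total

entries = [("a" * i, i + 5) for i in range(1, 10)]
assert usage(entries, FULL_CHARGE) > usage(entries, DONT_CHARGE)
```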
@@ -187,40 +214,72 @@ TEST_P(CacheTest, PinnedUsageTest) { std::string key(i, 'a'); auto kv_size = key.size() + 5; Cache::Handle* handle; + Cache::Handle* handle_in_precise_cache; cache->Insert(key, reinterpret_cast(value), kv_size, dumbDeleter, &handle); + assert(handle); + precise_cache->Insert(key, reinterpret_cast(value), kv_size, + dumbDeleter, &handle_in_precise_cache); + assert(handle_in_precise_cache); pinned_usage += kv_size; ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage()); if (i % 2 == 0) { cache->Release(handle); + precise_cache->Release(handle_in_precise_cache); pinned_usage -= kv_size; ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage()); } else { unreleased_handles.push_front(handle); + unreleased_handles_in_precise_cache.push_front(handle_in_precise_cache); } if (i % 3 == 0) { unreleased_handles.push_front(cache->Lookup(key)); + auto x = precise_cache->Lookup(key); + assert(x); + unreleased_handles_in_precise_cache.push_front(x); // If i % 2 == 0, then the entry was unpinned before Lookup, so pinned // usage increased if (i % 2 == 0) { pinned_usage += kv_size; } ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage()); } } + auto precise_cache_pinned_usage = precise_cache->GetPinnedUsage(); + ASSERT_LT(pinned_usage, precise_cache_pinned_usage); // check that overloading the cache does not change the pinned usage for (uint64_t i = 1; i < 2 * kCapacity; ++i) { auto key = ToString(i); cache->Insert(key, reinterpret_cast(value), key.size() + 5, dumbDeleter); + precise_cache->Insert(key, reinterpret_cast(value), key.size() + 5, + dumbDeleter); } ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage()); + + cache->EraseUnRefEntries(); + precise_cache->EraseUnRefEntries(); + ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage()); // release handles for pinned entries to prevent memory leaks for (auto handle : unreleased_handles) { cache->Release(handle); } + for (auto handle : unreleased_handles_in_precise_cache) { + precise_cache->Release(handle); + } + ASSERT_EQ(0, cache->GetPinnedUsage()); + ASSERT_EQ(0, precise_cache->GetPinnedUsage()); + cache->EraseUnRefEntries(); + precise_cache->EraseUnRefEntries(); + ASSERT_EQ(0, cache->GetUsage()); + ASSERT_EQ(0, precise_cache->GetUsage()); } TEST_P(CacheTest, HitAndMiss) { @@ -306,7 +365,7 @@ TEST_P(CacheTest, EvictionPolicy) { Insert(200, 201); // Frequently used entry must be kept around - for (int i = 0; i < kCacheSize + 200; i++) { + for (int i = 0; i < kCacheSize * 2; i++) { Insert(1000+i, 2000+i); ASSERT_EQ(101, Lookup(100)); } @@ -359,7 +418,7 @@ TEST_P(CacheTest, EvictionPolicyRef) { Insert(303, 104); // Insert entries much more than Cache capacity - for (int i = 0; i < kCacheSize + 200; i++) { + for (int i = 0; i < kCacheSize * 2; i++) { Insert(1000 + i, 2000 + i); } @@ -550,10 +609,10 @@ TEST_P(CacheTest, SetCapacity) { } } -TEST_P(CacheTest, SetStrictCapacityLimit) { +TEST_P(LRUCacheTest, SetStrictCapacityLimit) { // test1: set the flag to false. Insert more keys than capacity. See if they // all go through. 
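PinnedUsageTest exercises the rule that GetPinnedUsage() counts only entries with outstanding references: an entry's charge is added when the first handle is taken and removed when the last handle is released, and overloading the cache leaves pinned usage unchanged. A small reference-count model of that rule (plain Python, not the RocksDB implementation):

```python
# Minimal reference-count model of pinned-usage accounting: an entry's
# charge counts toward pinned usage only while its refcount is non-zero.
class PinnedUsageModel:
    def __init__(self):
        self.refs = {}      # key -> refcount
        self.charges = {}   # key -> charge
        self.pinned_usage = 0

    def acquire(self, key, charge):
        if self.refs.get(key, 0) == 0:
            self.pinned_usage += charge
        self.refs[key] = self.refs.get(key, 0) + 1
        self.charges[key] = charge

    def release(self, key):
        self.refs[key] -= 1
        if self.refs[key] == 0:
            self.pinned_usage -= self.charges[key]

m = PinnedUsageModel()
m.acquire("k1", 10)
m.acquire("k1", 10)           # second handle on the same entry
m.release("k1")
assert m.pinned_usage == 10   # still pinned by the remaining handle
m.release("k1")
assert m.pinned_usage == 0
```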
- std::shared_ptr cache = NewLRUCache(5, 0, false); + std::shared_ptr cache = NewCache(5, 0, false); std::vector handles(10); Status s; for (size_t i = 0; i < 10; i++) { @@ -562,6 +621,7 @@ TEST_P(CacheTest, SetStrictCapacityLimit) { ASSERT_OK(s); ASSERT_NE(nullptr, handles[i]); } + ASSERT_EQ(10, cache->GetUsage()); // test2: set the flag to true. Insert and check if it fails. std::string extra_key = "extra"; @@ -571,13 +631,14 @@ TEST_P(CacheTest, SetStrictCapacityLimit) { s = cache->Insert(extra_key, extra_value, 1, &deleter, &handle); ASSERT_TRUE(s.IsIncomplete()); ASSERT_EQ(nullptr, handle); + ASSERT_EQ(10, cache->GetUsage()); for (size_t i = 0; i < 10; i++) { cache->Release(handles[i]); } // test3: init with flag being true. - std::shared_ptr cache2 = NewLRUCache(5, 0, true); + std::shared_ptr cache2 = NewCache(5, 0, true); for (size_t i = 0; i < 5; i++) { std::string key = ToString(i + 1); s = cache2->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); @@ -591,7 +652,7 @@ TEST_P(CacheTest, SetStrictCapacityLimit) { s = cache2->Insert(extra_key, extra_value, 1, &deleter); // AS if the key have been inserted into cache but get evicted immediately. ASSERT_OK(s); - ASSERT_EQ(5, cache->GetUsage()); + ASSERT_EQ(5, cache2->GetUsage()); ASSERT_EQ(nullptr, cache2->Lookup(extra_key)); for (size_t i = 0; i < 5; i++) { @@ -686,14 +747,23 @@ TEST_P(CacheTest, DefaultShardBits) { ASSERT_EQ(6, sc->GetNumShardBits()); } +TEST_P(CacheTest, GetCharge) { + Insert(1, 2); + Cache::Handle* h1 = cache_->Lookup(EncodeKey(1)); + ASSERT_EQ(2, DecodeValue(cache_->Value(h1))); + ASSERT_EQ(1, cache_->GetCharge(h1)); + cache_->Release(h1); +} + #ifdef SUPPORT_CLOCK_CACHE -std::shared_ptr (*new_clock_cache_func)(size_t, int, - bool) = NewClockCache; +std::shared_ptr (*new_clock_cache_func)( + size_t, int, bool, CacheMetadataChargePolicy) = NewClockCache; INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest, testing::Values(kLRU, kClock)); #else INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest, testing::Values(kLRU)); #endif // SUPPORT_CLOCK_CACHE +INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest, testing::Values(kLRU)); } // namespace rocksdb diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index 89173834e23..9165ad5dd10 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -13,8 +13,9 @@ namespace rocksdb { -std::shared_ptr NewClockCache(size_t /*capacity*/, int /*num_shard_bits*/, - bool /*strict_capacity_limit*/) { +std::shared_ptr NewClockCache( + size_t /*capacity*/, int /*num_shard_bits*/, bool /*strict_capacity_limit*/, + CacheMetadataChargePolicy /*metadata_charge_policy*/) { // Clock cache not supported. 
return nullptr; } @@ -35,6 +36,7 @@ std::shared_ptr NewClockCache(size_t /*capacity*/, int /*num_shard_bits*/ #include "tbb/concurrent_hash_map.h" #include "cache/sharded_cache.h" +#include "port/malloc.h" #include "port/port.h" #include "util/autovector.h" #include "util/mutexlock.h" @@ -202,6 +204,27 @@ struct CacheHandle { deleter = a.deleter; return *this; } + + inline static size_t CalcTotalCharge( + Slice key, size_t charge, + CacheMetadataChargePolicy metadata_charge_policy) { + size_t meta_charge = 0; + if (metadata_charge_policy == kFullChargeCacheMetadata) { + meta_charge += sizeof(CacheHandle); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + meta_charge += + malloc_usable_size(static_cast(const_cast(key.data()))); +#else + meta_charge += key.size(); +#endif + } + return charge + meta_charge; + } + + inline size_t CalcTotalCharge( + CacheMetadataChargePolicy metadata_charge_policy) { + return CalcTotalCharge(key, charge, metadata_charge_policy); + } }; // Key of hash map. We store hash value with the key for convenience. @@ -404,11 +427,12 @@ void ClockCacheShard::RecycleHandle(CacheHandle* handle, assert(!InCache(handle->flags) && CountRefs(handle->flags) == 0); context->to_delete_key.push_back(handle->key.data()); context->to_delete_value.emplace_back(*handle); + size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); handle->key.clear(); handle->value = nullptr; handle->deleter = nullptr; recycle_.push_back(handle); - usage_.fetch_sub(handle->charge, std::memory_order_relaxed); + usage_.fetch_sub(total_charge, std::memory_order_relaxed); } void ClockCacheShard::Cleanup(const CleanupContext& context) { @@ -434,7 +458,8 @@ bool ClockCacheShard::Ref(Cache::Handle* h) { std::memory_order_relaxed)) { if (CountRefs(flags) == 0) { // No reference count before the operation. - pinned_usage_.fetch_add(handle->charge, std::memory_order_relaxed); + size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); + pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed); } return true; } @@ -454,7 +479,8 @@ bool ClockCacheShard::Unref(CacheHandle* handle, bool set_usage, assert(CountRefs(flags) > 0); if (CountRefs(flags) == 1) { // this is the last reference. - pinned_usage_.fetch_sub(handle->charge, std::memory_order_relaxed); + size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); + pinned_usage_.fetch_sub(total_charge, std::memory_order_relaxed); // Cleanup if it is the last reference. 
if (!InCache(flags)) { MutexLock l(&mutex_); @@ -539,8 +565,10 @@ CacheHandle* ClockCacheShard::Insert( const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), bool hold_reference, CleanupContext* context) { + size_t total_charge = + CacheHandle::CalcTotalCharge(key, charge, metadata_charge_policy_); MutexLock l(&mutex_); - bool success = EvictFromCache(charge, context); + bool success = EvictFromCache(total_charge, context); bool strict = strict_capacity_limit_.load(std::memory_order_relaxed); if (!success && (strict || !hold_reference)) { context->to_delete_key.push_back(key.data()); @@ -575,9 +603,9 @@ CacheHandle* ClockCacheShard::Insert( } table_.insert(HashTable::value_type(CacheKey(key, hash), handle)); if (hold_reference) { - pinned_usage_.fetch_add(charge, std::memory_order_relaxed); + pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed); } - usage_.fetch_add(charge, std::memory_order_relaxed); + usage_.fetch_add(total_charge, std::memory_order_relaxed); return handle; } @@ -674,10 +702,14 @@ void ClockCacheShard::EraseUnRefEntries() { class ClockCache final : public ShardedCache { public: - ClockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit) + ClockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy) : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) { int num_shards = 1 << num_shard_bits; shards_ = new ClockCacheShard[num_shards]; + for (int i = 0; i < num_shards; i++) { + shards_[i].set_metadata_charge_policy(metadata_charge_policy); + } SetCapacity(capacity); SetStrictCapacityLimit(strict_capacity_limit); } @@ -714,13 +746,14 @@ class ClockCache final : public ShardedCache { } // end anonymous namespace -std::shared_ptr NewClockCache(size_t capacity, int num_shard_bits, - bool strict_capacity_limit) { +std::shared_ptr NewClockCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy) { if (num_shard_bits < 0) { num_shard_bits = GetDefaultCacheShardBits(capacity); } - return std::make_shared(capacity, num_shard_bits, - strict_capacity_limit); + return std::make_shared( + capacity, num_shard_bits, strict_capacity_limit, metadata_charge_policy); } } // namespace rocksdb diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc index fdcbb4e86cb..0e49167ed5b 100644 --- a/cache/lru_cache.cc +++ b/cache/lru_cache.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
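With kFullChargeCacheMetadata, the clock cache hunks above make every entry pay for its own bookkeeping: the CacheHandle struct plus the heap block that backs the key, probed with malloc_usable_size() when ROCKSDB_MALLOC_USABLE_SIZE is defined and approximated by the key length otherwise. A standalone sketch of that arithmetic using the fallback approximation (HandleLike is an illustrative stand-in, not the real CacheHandle):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>

// Illustrative stand-in for a cache handle; layout is not the real struct.
struct HandleLike {
  void* value;
  void (*deleter)(const std::string&, void*);
  uint32_t hash;
  size_t charge;
};

// Mirrors the fallback branch of CalcTotalCharge(): when the allocator's
// usable-size probe is unavailable, the key is charged at its logical size.
size_t TotalCharge(const std::string& key, size_t charge,
                   bool full_charge_metadata) {
  size_t meta = 0;
  if (full_charge_metadata) {
    meta += sizeof(HandleLike);  // the handle object itself
    meta += key.size();          // the key bytes owned by the cache
  }
  return charge + meta;
}

int main() {
  // A 16-byte key with a 100-byte user charge: metadata adds the handle plus
  // the key, so the entry consumes noticeably more than 100 bytes of budget.
  std::cout << TotalCharge(std::string(16, 'k'), 100, false) << "\n";  // 100
  std::cout << TotalCharge(std::string(16, 'k'), 100, true) << "\n";   // 100 + sizeof(HandleLike) + 16
  return 0;
}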
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "cache/lru_cache.h" #include @@ -28,7 +24,7 @@ LRUHandleTable::LRUHandleTable() : list_(nullptr), length_(0), elems_(0) { LRUHandleTable::~LRUHandleTable() { ApplyToAllCacheEntries([](LRUHandle* h) { - if (h->refs == 1) { + if (!h->HasRefs()) { h->Free(); } }); @@ -101,7 +97,8 @@ void LRUHandleTable::Resize() { LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio, - bool use_adaptive_mutex) + bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy) : capacity_(0), high_pri_pool_usage_(0), strict_capacity_limit_(strict_capacity_limit), @@ -110,6 +107,7 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, usage_(0), lru_usage_(0), mutex_(use_adaptive_mutex) { + set_metadata_charge_policy(metadata_charge_policy); // Make empty circular linked list lru_.next = &lru_; lru_.prev = &lru_; @@ -117,30 +115,20 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, SetCapacity(capacity); } -LRUCacheShard::~LRUCacheShard() {} - -bool LRUCacheShard::Unref(LRUHandle* e) { - assert(e->refs > 0); - e->refs--; - return e->refs == 0; -} - -// Call deleter and free - void LRUCacheShard::EraseUnRefEntries() { autovector last_reference_list; { MutexLock l(&mutex_); while (lru_.next != &lru_) { LRUHandle* old = lru_.next; - assert(old->InCache()); - assert(old->refs == - 1); // LRU list contains elements which may be evicted + // LRU list contains only elements which can be evicted + assert(old->InCache() && !old->HasRefs()); LRU_Remove(old); table_.Remove(old->key(), old->hash); old->SetInCache(false); - Unref(old); - usage_ -= old->charge; + size_t total_charge = old->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= total_charge); + usage_ -= total_charge; last_reference_list.push_back(old); } } @@ -152,22 +140,27 @@ void LRUCacheShard::EraseUnRefEntries() { void LRUCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t), bool thread_safe) { + const auto applyCallback = [&]() { + table_.ApplyToAllCacheEntries( + [callback](LRUHandle* h) { callback(h->value, h->charge); }); + }; + if (thread_safe) { - mutex_.Lock(); - } - table_.ApplyToAllCacheEntries( - [callback](LRUHandle* h) { callback(h->value, h->charge); }); - if (thread_safe) { - mutex_.Unlock(); + MutexLock l(&mutex_); + applyCallback(); + } else { + applyCallback(); } } void LRUCacheShard::TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri) { + MutexLock l(&mutex_); *lru = &lru_; *lru_low_pri = lru_low_pri_; } size_t LRUCacheShard::TEST_GetLRUSize() { + MutexLock l(&mutex_); LRUHandle* lru_handle = lru_.next; size_t lru_size = 0; while (lru_handle != &lru_) { @@ -191,16 +184,19 @@ void LRUCacheShard::LRU_Remove(LRUHandle* e) { e->next->prev = e->prev; e->prev->next = e->next; e->prev = e->next = nullptr; - lru_usage_ -= e->charge; + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); + assert(lru_usage_ >= total_charge); + lru_usage_ -= total_charge; if (e->InHighPriPool()) { - assert(high_pri_pool_usage_ >= e->charge); - high_pri_pool_usage_ -= e->charge; + assert(high_pri_pool_usage_ >= total_charge); + high_pri_pool_usage_ -= total_charge; } } void LRUCacheShard::LRU_Insert(LRUHandle* e) { assert(e->next == nullptr); assert(e->prev == nullptr); + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); if (high_pri_pool_ratio_ > 0 && (e->IsHighPri() || e->HasHit())) { // Inset "e" to head of LRU list. 
e->next = &lru_; @@ -208,7 +204,7 @@ void LRUCacheShard::LRU_Insert(LRUHandle* e) { e->prev->next = e; e->next->prev = e; e->SetInHighPriPool(true); - high_pri_pool_usage_ += e->charge; + high_pri_pool_usage_ += total_charge; MaintainPoolSize(); } else { // Insert "e" to the head of low-pri pool. Note that when @@ -220,7 +216,7 @@ void LRUCacheShard::LRU_Insert(LRUHandle* e) { e->SetInHighPriPool(false); lru_low_pri_ = e; } - lru_usage_ += e->charge; + lru_usage_ += total_charge; } void LRUCacheShard::MaintainPoolSize() { @@ -229,21 +225,25 @@ void LRUCacheShard::MaintainPoolSize() { lru_low_pri_ = lru_low_pri_->next; assert(lru_low_pri_ != &lru_); lru_low_pri_->SetInHighPriPool(false); - high_pri_pool_usage_ -= lru_low_pri_->charge; + size_t total_charge = + lru_low_pri_->CalcTotalCharge(metadata_charge_policy_); + assert(high_pri_pool_usage_ >= total_charge); + high_pri_pool_usage_ -= total_charge; } } void LRUCacheShard::EvictFromLRU(size_t charge, autovector* deleted) { - while (usage_ + charge > capacity_ && lru_.next != &lru_) { + while ((usage_ + charge) > capacity_ && lru_.next != &lru_) { LRUHandle* old = lru_.next; - assert(old->InCache()); - assert(old->refs == 1); // LRU list contains elements which may be evicted + // LRU list contains only elements which can be evicted + assert(old->InCache() && !old->HasRefs()); LRU_Remove(old); table_.Remove(old->key(), old->hash); old->SetInCache(false); - Unref(old); - usage_ -= old->charge; + size_t old_total_charge = old->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= old_total_charge); + usage_ -= old_total_charge; deleted->push_back(old); } } @@ -256,8 +256,8 @@ void LRUCacheShard::SetCapacity(size_t capacity) { high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_; EvictFromLRU(0, &last_reference_list); } - // we free the entries here outside of mutex for - // performance reasons + + // Free the entries outside of mutex for performance reasons for (auto entry : last_reference_list) { entry->Free(); } @@ -273,22 +273,22 @@ Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) { LRUHandle* e = table_.Lookup(key, hash); if (e != nullptr) { assert(e->InCache()); - if (e->refs == 1) { + if (!e->HasRefs()) { + // The entry is in LRU since it's in hash and has no external references LRU_Remove(e); } - e->refs++; + e->Ref(); e->SetHit(); } return reinterpret_cast(e); } bool LRUCacheShard::Ref(Cache::Handle* h) { - LRUHandle* handle = reinterpret_cast(h); + LRUHandle* e = reinterpret_cast(h); MutexLock l(&mutex_); - if (handle->InCache() && handle->refs == 1) { - LRU_Remove(handle); - } - handle->refs++; + // To create another reference - entry must be already externally referenced + assert(e->HasRefs()); + e->Ref(); return true; } @@ -307,30 +307,29 @@ bool LRUCacheShard::Release(Cache::Handle* handle, bool force_erase) { bool last_reference = false; { MutexLock l(&mutex_); - last_reference = Unref(e); - if (last_reference) { - usage_ -= e->charge; - } - if (e->refs == 1 && e->InCache()) { + last_reference = e->Unref(); + if (last_reference && e->InCache()) { // The item is still in cache, and nobody else holds a reference to it if (usage_ > capacity_ || force_erase) { - // the cache is full // The LRU list must be empty since the cache is full - assert(!(usage_ > capacity_) || lru_.next == &lru_); - // take this opportunity and remove the item + assert(lru_.next == &lru_ || force_erase); + // Take this opportunity and remove the item table_.Remove(e->key(), e->hash); e->SetInCache(false); - Unref(e); - 
usage_ -= e->charge; - last_reference = true; } else { - // put the item on the list to be potentially freed + // Put the item back on the LRU list, and don't free it LRU_Insert(e); + last_reference = false; } } + if (last_reference) { + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= total_charge); + usage_ -= total_charge; + } } - // free outside of mutex + // Free the entry here outside of mutex for performance reasons if (last_reference) { e->Free(); } @@ -346,7 +345,7 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, // It shouldn't happen very often though. LRUHandle* e = reinterpret_cast( new char[sizeof(LRUHandle) - 1 + key.size()]); - Status s; + Status s = Status::OK(); autovector last_reference_list; e->value = value; @@ -355,26 +354,26 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, e->key_length = key.size(); e->flags = 0; e->hash = hash; - e->refs = (handle == nullptr - ? 1 - : 2); // One from LRUCache, one for the returned handle + e->refs = 0; e->next = e->prev = nullptr; e->SetInCache(true); e->SetPriority(priority); memcpy(e->key_data, key.data(), key.size()); + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); { MutexLock l(&mutex_); // Free the space following strict LRU policy until enough space // is freed or the lru list is empty - EvictFromLRU(charge, &last_reference_list); + EvictFromLRU(total_charge, &last_reference_list); - if (usage_ - lru_usage_ + charge > capacity_ && + if ((usage_ + total_charge) > capacity_ && (strict_capacity_limit_ || handle == nullptr)) { if (handle == nullptr) { // Don't insert the entry but still return ok, as if the entry inserted // into cache and get evicted immediately. + e->SetInCache(false); last_reference_list.push_back(e); } else { delete[] reinterpret_cast(e); @@ -382,32 +381,33 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, s = Status::Incomplete("Insert failed due to LRU cache being full."); } } else { - // insert into the cache - // note that the cache might get larger than its capacity if not enough - // space was freed + // Insert into the cache. Note that the cache might get larger than its + // capacity if not enough space was freed up. 
LRUHandle* old = table_.Insert(e); - usage_ += e->charge; + usage_ += total_charge; if (old != nullptr) { + assert(old->InCache()); old->SetInCache(false); - if (Unref(old)) { - usage_ -= old->charge; - // old is on LRU because it's in cache and its reference count - // was just 1 (Unref returned 0) + if (!old->HasRefs()) { + // old is on LRU because it's in cache and its reference count is 0 LRU_Remove(old); + size_t old_total_charge = + old->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= old_total_charge); + usage_ -= old_total_charge; last_reference_list.push_back(old); } } if (handle == nullptr) { LRU_Insert(e); } else { + e->Ref(); *handle = reinterpret_cast(e); } - s = Status::OK(); } } - // we free the entries here outside of mutex for - // performance reasons + // Free the entries here outside of mutex for performance reasons for (auto entry : last_reference_list) { entry->Free(); } @@ -422,18 +422,20 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { MutexLock l(&mutex_); e = table_.Remove(key, hash); if (e != nullptr) { - last_reference = Unref(e); - if (last_reference) { - usage_ -= e->charge; - } - if (last_reference && e->InCache()) { + assert(e->InCache()); + e->SetInCache(false); + if (!e->HasRefs()) { + // The entry is in LRU since it's in hash and has no external references LRU_Remove(e); + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= total_charge); + usage_ -= total_charge; + last_reference = true; } - e->SetInCache(false); } } - // mutex not held here + // Free the entry here outside of mutex for performance reasons // last_reference will only be true if e != nullptr if (last_reference) { e->Free(); @@ -465,7 +467,8 @@ std::string LRUCacheShard::GetPrintableOptions() const { LRUCache::LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio, std::shared_ptr allocator, - bool use_adaptive_mutex) + bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy) : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, std::move(allocator)) { num_shards_ = 1 << num_shard_bits; @@ -475,7 +478,7 @@ LRUCache::LRUCache(size_t capacity, int num_shard_bits, for (int i = 0; i < num_shards_; i++) { new (&shards_[i]) LRUCacheShard(per_shard, strict_capacity_limit, high_pri_pool_ratio, - use_adaptive_mutex); + use_adaptive_mutex, metadata_charge_policy); } } @@ -544,15 +547,15 @@ std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts) { return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits, cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, - cache_opts.memory_allocator, - cache_opts.use_adaptive_mutex); + cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, + cache_opts.metadata_charge_policy); } std::shared_ptr NewLRUCache( size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio, - std::shared_ptr memory_allocator, - bool use_adaptive_mutex) { + std::shared_ptr memory_allocator, bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy) { if (num_shard_bits >= 20) { return nullptr; // the cache cannot be sharded into too many fine pieces } @@ -563,10 +566,9 @@ std::shared_ptr NewLRUCache( if (num_shard_bits < 0) { num_shard_bits = GetDefaultCacheShardBits(capacity); } - return std::make_shared(capacity, num_shard_bits, - strict_capacity_limit, high_pri_pool_ratio, - std::move(memory_allocator), - use_adaptive_mutex); + return std::make_shared( + capacity, 
num_shard_bits, strict_capacity_limit, high_pri_pool_ratio, + std::move(memory_allocator), use_adaptive_mutex, metadata_charge_policy); } } // namespace rocksdb diff --git a/cache/lru_cache.h b/cache/lru_cache.h index 0d9a317486e..6313c69dba9 100644 --- a/cache/lru_cache.h +++ b/cache/lru_cache.h @@ -12,36 +12,40 @@ #include "cache/sharded_cache.h" +#include "port/malloc.h" #include "port/port.h" #include "util/autovector.h" namespace rocksdb { -// LRU cache implementation +// LRU cache implementation. This class is not thread-safe. // An entry is a variable length heap-allocated structure. // Entries are referenced by cache and/or by any external entity. -// The cache keeps all its entries in table. Some elements +// The cache keeps all its entries in a hash table. Some elements // are also stored on LRU list. // // LRUHandle can be in these states: // 1. Referenced externally AND in hash table. -// In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true) -// 2. Not referenced externally and in hash table. In that case the entry is -// in the LRU and can be freed. (refs == 1 && in_cache == true) -// 3. Referenced externally and not in hash table. In that case the entry is -// in not on LRU and not in table. (refs >= 1 && in_cache == false) +// In that case the entry is *not* in the LRU list +// (refs >= 1 && in_cache == true) +// 2. Not referenced externally AND in hash table. +// In that case the entry is in the LRU list and can be freed. +// (refs == 0 && in_cache == true) +// 3. Referenced externally AND not in hash table. +// In that case the entry is not in the LRU list and not in hash table. +// The entry can be freed when refs becomes 0. +// (refs >= 1 && in_cache == false) // // All newly created LRUHandles are in state 1. If you call -// LRUCacheShard::Release -// on entry in state 1, it will go into state 2. To move from state 1 to -// state 3, either call LRUCacheShard::Erase or LRUCacheShard::Insert with the -// same key. +// LRUCacheShard::Release on entry in state 1, it will go into state 2. +// To move from state 1 to state 3, either call LRUCacheShard::Erase or +// LRUCacheShard::Insert with the same key (but possibly different value). // To move from state 2 to state 1, use LRUCacheShard::Lookup. // Before destruction, make sure that no handles are in state 1. This means // that any successful LRUCacheShard::Lookup/LRUCacheShard::Insert have a -// matching -// RUCache::Release (to move into state 2) or LRUCacheShard::Erase (for state 3) +// matching LRUCache::Release (to move into state 2) or LRUCacheShard::Erase +// (to move into state 3). struct LRUHandle { void* value; @@ -51,37 +55,42 @@ struct LRUHandle { LRUHandle* prev; size_t charge; // TODO(opt): Only allow uint32_t? size_t key_length; - uint32_t refs; // a number of refs to this entry - // cache itself is counted as 1 - - // Include the following flags: - // IN_CACHE: whether this entry is referenced by the hash table. - // IS_HIGH_PRI: whether this entry is high priority entry. - // IN_HIGH_PRI_POOL: whether this entry is in high-pri pool. - // HAS_HIT: whether this entry has had any lookups (hits). + // The hash of key(). Used for fast sharding and comparisons. + uint32_t hash; + // The number of external refs to this entry. The cache itself is not counted. + uint32_t refs; + enum Flags : uint8_t { + // Whether this entry is referenced by the hash table. IN_CACHE = (1 << 0), + // Whether this entry is high priority entry. IS_HIGH_PRI = (1 << 1), + // Whether this entry is in high-pri pool. 
IN_HIGH_PRI_POOL = (1 << 2), + // Wwhether this entry has had any lookups (hits). HAS_HIT = (1 << 3), }; uint8_t flags; - uint32_t hash; // Hash of key(); used for fast sharding and comparisons + // Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!) + char key_data[1]; - char key_data[1]; // Beginning of key + Slice key() const { return Slice(key_data, key_length); } - Slice key() const { - // For cheaper lookups, we allow a temporary Handle object - // to store a pointer to a key in "value". - if (next == this) { - return *(reinterpret_cast(value)); - } else { - return Slice(key_data, key_length); - } + // Increase the reference count by 1. + void Ref() { refs++; } + + // Just reduce the reference count by 1. Return true if it was last reference. + bool Unref() { + assert(refs > 0); + refs--; + return refs == 0; } + // Return true if there are external refs, false otherwise. + bool HasRefs() const { return refs > 0; } + bool InCache() const { return flags & IN_CACHE; } bool IsHighPri() const { return flags & IS_HIGH_PRI; } bool InHighPriPool() const { return flags & IN_HIGH_PRI_POOL; } @@ -114,12 +123,28 @@ struct LRUHandle { void SetHit() { flags |= HAS_HIT; } void Free() { - assert((refs == 1 && InCache()) || (refs == 0 && !InCache())); + assert(refs == 0); if (deleter) { (*deleter)(key(), value); } delete[] reinterpret_cast(this); } + + // Caclculate the memory usage by metadata + inline size_t CalcTotalCharge( + CacheMetadataChargePolicy metadata_charge_policy) { + assert(key_length); + size_t meta_charge = 0; + if (metadata_charge_policy == kFullChargeCacheMetadata) { +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + meta_charge += malloc_usable_size(static_cast(this)); +#else + // This is the size that is used when a new handle is created + meta_charge += sizeof(LRUHandle) - 1 + key_length; +#endif + } + return charge + meta_charge; + } }; // We provide our own simple hash table since it removes a whole bunch @@ -168,8 +193,9 @@ class LRUHandleTable { class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { public: LRUCacheShard(size_t capacity, bool strict_capacity_limit, - double high_pri_pool_ratio, bool use_adaptive_mutex); - virtual ~LRUCacheShard(); + double high_pri_pool_ratio, bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy); + virtual ~LRUCacheShard() override = default; // Separate from constructor so caller can easily make an array of LRUCache // if current usage is more than new capacity, the function will attempt to @@ -225,10 +251,6 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { // high-pri pool is no larger than the size specify by high_pri_pool_pct. void MaintainPoolSize(); - // Just reduce the reference count by 1. 
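The reworked LRUHandle above is the heart of this change: refs now counts only external references, the cache's own interest is carried by the IN_CACHE flag, and the LRU list contains exactly the entries with refs == 0 that are still in the hash table. A minimal stand-in sketch of the three states documented in the header comment (HandleSketch is illustrative, not the real struct):

#include <cassert>
#include <cstdint>

// Stripped-down stand-in for LRUHandle's new accounting.
struct HandleSketch {
  uint32_t refs = 0;    // external references only; the cache is not counted
  bool in_cache = true;

  void Ref() { refs++; }
  bool Unref() {
    assert(refs > 0);
    refs--;
    return refs == 0;
  }
  bool HasRefs() const { return refs > 0; }
  // Eligible for the LRU list (and therefore for eviction) only when nobody
  // outside the cache holds it and the hash table still owns it.
  bool OnLruList() const { return in_cache && !HasRefs(); }
};

int main() {
  HandleSketch h;
  h.Ref();                      // state 1: in hash table, externally referenced
  assert(h.in_cache && h.HasRefs() && !h.OnLruList());

  h.Unref();                    // state 2: in hash table, no external refs
  assert(h.OnLruList());        // -> lives on the LRU list, can be evicted

  h.Ref();                      // Lookup() moves it back to state 1
  h.in_cache = false;           // Erase()/overwrite: state 3, awaits last Unref
  bool last = h.Unref();
  assert(last && !h.in_cache);  // now it would be freed
  return 0;
}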
- // Return true if last reference - bool Unref(LRUHandle* e); - // Free some space following strict LRU policy until enough space // to hold (usage_ + charge) is freed or the lru list is empty // This function is not thread safe - it needs to be executed while @@ -293,7 +315,9 @@ class LRUCache LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio, std::shared_ptr memory_allocator = nullptr, - bool use_adaptive_mutex = kDefaultToAdaptiveMutex); + bool use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy metadata_charge_policy = + kDontChargeCacheMetadata); virtual ~LRUCache(); virtual const char* Name() const override { return "LRUCache"; } virtual CacheShard* GetShard(int shard) override; diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc index 9980dd72b7b..f4f4dee69c3 100644 --- a/cache/lru_cache_test.cc +++ b/cache/lru_cache_test.cc @@ -8,7 +8,7 @@ #include #include #include "port/port.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { @@ -31,7 +31,8 @@ class LRUCacheTest : public testing::Test { cache_ = reinterpret_cast( port::cacheline_aligned_alloc(sizeof(LRUCacheShard))); new (cache_) LRUCacheShard(capacity, false /*strict_capcity_limit*/, - high_pri_pool_ratio, use_adaptive_mutex); + high_pri_pool_ratio, use_adaptive_mutex, + kDontChargeCacheMetadata); } void Insert(const std::string& key, diff --git a/cache/sharded_cache.cc b/cache/sharded_cache.cc index a48a32185bf..8fc0a7a17a3 100644 --- a/cache/sharded_cache.cc +++ b/cache/sharded_cache.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "cache/sharded_cache.h" #include diff --git a/cache/sharded_cache.h b/cache/sharded_cache.h index 920898b871f..4a396bd47ff 100644 --- a/cache/sharded_cache.h +++ b/cache/sharded_cache.h @@ -40,6 +40,13 @@ class CacheShard { bool thread_safe) = 0; virtual void EraseUnRefEntries() = 0; virtual std::string GetPrintableOptions() const { return ""; } + void set_metadata_charge_policy( + CacheMetadataChargePolicy metadata_charge_policy) { + metadata_charge_policy_ = metadata_charge_policy; + } + + protected: + CacheMetadataChargePolicy metadata_charge_policy_ = kDontChargeCacheMetadata; }; // Generic cache interface which shards cache by hash of keys. 2^num_shard_bits @@ -54,7 +61,8 @@ class ShardedCache : public Cache { virtual CacheShard* GetShard(int shard) = 0; virtual const CacheShard* GetShard(int shard) const = 0; virtual void* Value(Handle* handle) override = 0; - virtual size_t GetCharge(Handle* handle) const = 0; + virtual size_t GetCharge(Handle* handle) const override = 0; + virtual uint32_t GetHash(Handle* handle) const = 0; virtual void DisownData() override = 0; diff --git a/cmake/modules/FindJeMalloc.cmake b/cmake/modules/FindJeMalloc.cmake index 7911f77c4c3..f695b3ed1b3 100644 --- a/cmake/modules/FindJeMalloc.cmake +++ b/cmake/modules/FindJeMalloc.cmake @@ -1,21 +1,29 @@ # - Find JeMalloc library # Find the native JeMalloc includes and library # -# JEMALLOC_INCLUDE_DIR - where to find jemalloc.h, etc. -# JEMALLOC_LIBRARIES - List of libraries when using jemalloc. -# JEMALLOC_FOUND - True if jemalloc found. +# JeMalloc_INCLUDE_DIRS - where to find jemalloc.h, etc. +# JeMalloc_LIBRARIES - List of libraries when using jemalloc. +# JeMalloc_FOUND - True if jemalloc found. 
-find_path(JEMALLOC_INCLUDE_DIR +find_path(JeMalloc_INCLUDE_DIRS NAMES jemalloc/jemalloc.h HINTS ${JEMALLOC_ROOT_DIR}/include) -find_library(JEMALLOC_LIBRARIES +find_library(JeMalloc_LIBRARIES NAMES jemalloc HINTS ${JEMALLOC_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR) +find_package_handle_standard_args(JeMalloc DEFAULT_MSG JeMalloc_LIBRARIES JeMalloc_INCLUDE_DIRS) mark_as_advanced( - JEMALLOC_LIBRARIES - JEMALLOC_INCLUDE_DIR) + JeMalloc_LIBRARIES + JeMalloc_INCLUDE_DIRS) + +if(JeMalloc_FOUND AND NOT (TARGET JeMalloc::JeMalloc)) + add_library (JeMalloc::JeMalloc UNKNOWN IMPORTED) + set_target_properties(JeMalloc::JeMalloc + PROPERTIES + IMPORTED_LOCATION ${JeMalloc_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${JeMalloc_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/FindNUMA.cmake b/cmake/modules/FindNUMA.cmake index 02760344c68..69b95c9b60b 100644 --- a/cmake/modules/FindNUMA.cmake +++ b/cmake/modules/FindNUMA.cmake @@ -1,11 +1,11 @@ # - Find NUMA # Find the NUMA library and includes # -# NUMA_INCLUDE_DIR - where to find numa.h, etc. +# NUMA_INCLUDE_DIRS - where to find numa.h, etc. # NUMA_LIBRARIES - List of libraries when using NUMA. # NUMA_FOUND - True if NUMA found. -find_path(NUMA_INCLUDE_DIR +find_path(NUMA_INCLUDE_DIRS NAMES numa.h numaif.h HINTS ${NUMA_ROOT_DIR}/include) @@ -14,8 +14,16 @@ find_library(NUMA_LIBRARIES HINTS ${NUMA_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_LIBRARIES NUMA_INCLUDE_DIR) +find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_LIBRARIES NUMA_INCLUDE_DIRS) mark_as_advanced( NUMA_LIBRARIES - NUMA_INCLUDE_DIR) + NUMA_INCLUDE_DIRS) + +if(NUMA_FOUND AND NOT (TARGET NUMA::NUMA)) + add_library (NUMA::NUMA UNKNOWN IMPORTED) + set_target_properties(NUMA::NUMA + PROPERTIES + IMPORTED_LOCATION ${NUMA_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${NUMA_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/FindTBB.cmake b/cmake/modules/FindTBB.cmake index 556ce872b17..f6861fa5521 100644 --- a/cmake/modules/FindTBB.cmake +++ b/cmake/modules/FindTBB.cmake @@ -1,7 +1,7 @@ # - Find TBB # Find the Thread Building Blocks library and includes # -# TBB_INCLUDE_DIR - where to find tbb.h, etc. +# TBB_INCLUDE_DIRS - where to find tbb.h, etc. # TBB_LIBRARIES - List of libraries when using TBB. # TBB_FOUND - True if TBB found. 
@@ -9,17 +9,25 @@ if(NOT DEFINED TBB_ROOT_DIR) set(TBB_ROOT_DIR "$ENV{TBBROOT}") endif() -find_path(TBB_INCLUDE_DIR -NAMES tbb/tbb.h -HINTS ${TBB_ROOT_DIR}/include) +find_path(TBB_INCLUDE_DIRS + NAMES tbb/tbb.h + HINTS ${TBB_ROOT_DIR}/include) find_library(TBB_LIBRARIES -NAMES tbb -HINTS ${TBB_ROOT_DIR}/lib ENV LIBRARY_PATH) + NAMES tbb + HINTS ${TBB_ROOT_DIR}/lib ENV LIBRARY_PATH) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(TBB DEFAULT_MSG TBB_LIBRARIES TBB_INCLUDE_DIR) +find_package_handle_standard_args(TBB DEFAULT_MSG TBB_LIBRARIES TBB_INCLUDE_DIRS) mark_as_advanced( -TBB_LIBRARIES -TBB_INCLUDE_DIR) + TBB_LIBRARIES + TBB_INCLUDE_DIRS) + +if(TBB_FOUND AND NOT (TARGET TBB::TBB)) + add_library (TBB::TBB UNKNOWN IMPORTED) + set_target_properties(TBB::TBB + PROPERTIES + IMPORTED_LOCATION ${TBB_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${TBB_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/Findbzip2.cmake b/cmake/modules/Findbzip2.cmake deleted file mode 100644 index 87abbe941e0..00000000000 --- a/cmake/modules/Findbzip2.cmake +++ /dev/null @@ -1,21 +0,0 @@ -# - Find Bzip2 -# Find the bzip2 compression library and includes -# -# BZIP2_INCLUDE_DIR - where to find bzlib.h, etc. -# BZIP2_LIBRARIES - List of libraries when using bzip2. -# BZIP2_FOUND - True if bzip2 found. - -find_path(BZIP2_INCLUDE_DIR - NAMES bzlib.h - HINTS ${BZIP2_ROOT_DIR}/include) - -find_library(BZIP2_LIBRARIES - NAMES bz2 - HINTS ${BZIP2_ROOT_DIR}/lib) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(bzip2 DEFAULT_MSG BZIP2_LIBRARIES BZIP2_INCLUDE_DIR) - -mark_as_advanced( - BZIP2_LIBRARIES - BZIP2_INCLUDE_DIR) diff --git a/cmake/modules/Findlz4.cmake b/cmake/modules/Findlz4.cmake index c34acef5e39..7cf7d7f5fe3 100644 --- a/cmake/modules/Findlz4.cmake +++ b/cmake/modules/Findlz4.cmake @@ -1,21 +1,29 @@ # - Find Lz4 # Find the lz4 compression library and includes # -# LZ4_INCLUDE_DIR - where to find lz4.h, etc. -# LZ4_LIBRARIES - List of libraries when using lz4. -# LZ4_FOUND - True if lz4 found. +# lz4_INCLUDE_DIRS - where to find lz4.h, etc. +# lz4_LIBRARIES - List of libraries when using lz4. +# lz4_FOUND - True if lz4 found. -find_path(LZ4_INCLUDE_DIR +find_path(lz4_INCLUDE_DIRS NAMES lz4.h - HINTS ${LZ4_ROOT_DIR}/include) + HINTS ${lz4_ROOT_DIR}/include) -find_library(LZ4_LIBRARIES +find_library(lz4_LIBRARIES NAMES lz4 - HINTS ${LZ4_ROOT_DIR}/lib) + HINTS ${lz4_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(lz4 DEFAULT_MSG LZ4_LIBRARIES LZ4_INCLUDE_DIR) +find_package_handle_standard_args(lz4 DEFAULT_MSG lz4_LIBRARIES lz4_INCLUDE_DIRS) mark_as_advanced( - LZ4_LIBRARIES - LZ4_INCLUDE_DIR) + lz4_LIBRARIES + lz4_INCLUDE_DIRS) + +if(lz4_FOUND AND NOT (TARGET lz4::lz4)) + add_library(lz4::lz4 UNKNOWN IMPORTED) + set_target_properties(lz4::lz4 + PROPERTIES + IMPORTED_LOCATION ${lz4_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${lz4_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/Findsnappy.cmake b/cmake/modules/Findsnappy.cmake index 6ed5fda3d57..39bba6bd217 100644 --- a/cmake/modules/Findsnappy.cmake +++ b/cmake/modules/Findsnappy.cmake @@ -1,21 +1,29 @@ # - Find Snappy # Find the snappy compression library and includes # -# SNAPPY_INCLUDE_DIR - where to find snappy.h, etc. -# SNAPPY_LIBRARIES - List of libraries when using snappy. -# SNAPPY_FOUND - True if snappy found. +# snappy_INCLUDE_DIRS - where to find snappy.h, etc. +# snappy_LIBRARIES - List of libraries when using snappy. +# snappy_FOUND - True if snappy found. 
-find_path(SNAPPY_INCLUDE_DIR +find_path(snappy_INCLUDE_DIRS NAMES snappy.h - HINTS ${SNAPPY_ROOT_DIR}/include) + HINTS ${snappy_ROOT_DIR}/include) -find_library(SNAPPY_LIBRARIES +find_library(snappy_LIBRARIES NAMES snappy - HINTS ${SNAPPY_ROOT_DIR}/lib) + HINTS ${snappy_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(snappy DEFAULT_MSG SNAPPY_LIBRARIES SNAPPY_INCLUDE_DIR) +find_package_handle_standard_args(snappy DEFAULT_MSG snappy_LIBRARIES snappy_INCLUDE_DIRS) mark_as_advanced( - SNAPPY_LIBRARIES - SNAPPY_INCLUDE_DIR) + snappy_LIBRARIES + snappy_INCLUDE_DIRS) + +if(snappy_FOUND AND NOT (TARGET snappy::snappy)) + add_library (snappy::snappy UNKNOWN IMPORTED) + set_target_properties(snappy::snappy + PROPERTIES + IMPORTED_LOCATION ${snappy_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${snappy_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/Findzstd.cmake b/cmake/modules/Findzstd.cmake index a2964aa9f80..9430821df6e 100644 --- a/cmake/modules/Findzstd.cmake +++ b/cmake/modules/Findzstd.cmake @@ -1,21 +1,29 @@ # - Find zstd # Find the zstd compression library and includes # -# ZSTD_INCLUDE_DIR - where to find zstd.h, etc. -# ZSTD_LIBRARIES - List of libraries when using zstd. -# ZSTD_FOUND - True if zstd found. +# zstd_INCLUDE_DIRS - where to find zstd.h, etc. +# zstd_LIBRARIES - List of libraries when using zstd. +# zstd_FOUND - True if zstd found. -find_path(ZSTD_INCLUDE_DIR +find_path(zstd_INCLUDE_DIRS NAMES zstd.h - HINTS ${ZSTD_ROOT_DIR}/include) + HINTS ${zstd_ROOT_DIR}/include) -find_library(ZSTD_LIBRARIES +find_library(zstd_LIBRARIES NAMES zstd - HINTS ${ZSTD_ROOT_DIR}/lib) + HINTS ${zstd_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(zstd DEFAULT_MSG ZSTD_LIBRARIES ZSTD_INCLUDE_DIR) +find_package_handle_standard_args(zstd DEFAULT_MSG zstd_LIBRARIES zstd_INCLUDE_DIRS) mark_as_advanced( - ZSTD_LIBRARIES - ZSTD_INCLUDE_DIR) + zstd_LIBRARIES + zstd_INCLUDE_DIRS) + +if(zstd_FOUND AND NOT (TARGET zstd::zstd)) + add_library (zstd::zstd UNKNOWN IMPORTED) + set_target_properties(zstd::zstd + PROPERTIES + IMPORTED_LOCATION ${zstd_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${zstd_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/ReadVersion.cmake b/cmake/modules/ReadVersion.cmake new file mode 100644 index 00000000000..ebfd7d6f949 --- /dev/null +++ b/cmake/modules/ReadVersion.cmake @@ -0,0 +1,10 @@ +# Read rocksdb version from version.h header file. + +function(get_rocksdb_version version_var) + file(READ "${CMAKE_CURRENT_SOURCE_DIR}/include/rocksdb/version.h" version_header_file) + foreach(component MAJOR MINOR PATCH) + string(REGEX MATCH "#define ROCKSDB_${component} ([0-9]+)" _ ${version_header_file}) + set(ROCKSDB_VERSION_${component} ${CMAKE_MATCH_1}) + endforeach() + set(${version_var} "${ROCKSDB_VERSION_MAJOR}.${ROCKSDB_VERSION_MINOR}.${ROCKSDB_VERSION_PATCH}" PARENT_SCOPE) +endfunction() diff --git a/coverage/parse_gcov_output.py b/coverage/parse_gcov_output.py index fbdabd96839..a5e98722202 100644 --- a/coverage/parse_gcov_output.py +++ b/coverage/parse_gcov_output.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python2 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import re import sys diff --git a/db/arena_wrapped_db_iter.cc b/db/arena_wrapped_db_iter.cc new file mode 100644 index 00000000000..840c20e9e4c --- /dev/null +++ b/db/arena_wrapped_db_iter.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/arena_wrapped_db_iter.h" +#include "memory/arena.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "table/internal_iterator.h" +#include "table/iterator_wrapper.h" +#include "util/user_comparator_wrapper.h" + +namespace rocksdb { + +Status ArenaWrappedDBIter::GetProperty(std::string prop_name, + std::string* prop) { + if (prop_name == "rocksdb.iterator.super-version-number") { + // First try to pass the value returned from inner iterator. + if (!db_iter_->GetProperty(prop_name, prop).ok()) { + *prop = ToString(sv_number_); + } + return Status::OK(); + } + return db_iter_->GetProperty(prop_name, prop); +} + +void ArenaWrappedDBIter::Init(Env* env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, + const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iteration, + uint64_t version_number, + ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool allow_blob, + bool allow_refresh) { + auto mem = arena_.AllocateAligned(sizeof(DBIter)); + db_iter_ = new (mem) DBIter(env, read_options, cf_options, mutable_cf_options, + cf_options.user_comparator, nullptr, sequence, + true, max_sequential_skip_in_iteration, + read_callback, db_impl, cfd, allow_blob); + sv_number_ = version_number; + allow_refresh_ = allow_refresh; +} + +Status ArenaWrappedDBIter::Refresh() { + if (cfd_ == nullptr || db_impl_ == nullptr || !allow_refresh_) { + return Status::NotSupported("Creating renew iterator is not allowed."); + } + assert(db_iter_ != nullptr); + // TODO(yiwu): For last_seq_same_as_publish_seq_==false, this is not the + // correct behavior. Will be corrected automatically when we take a snapshot + // here for the case of WritePreparedTxnDB. 
+ SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber(); + uint64_t cur_sv_number = cfd_->GetSuperVersionNumber(); + if (sv_number_ != cur_sv_number) { + Env* env = db_iter_->env(); + db_iter_->~DBIter(); + arena_.~Arena(); + new (&arena_) Arena(); + + SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_->mutex()); + if (read_callback_) { + read_callback_->Refresh(latest_seq); + } + Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options, + latest_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, + cur_sv_number, read_callback_, db_impl_, cfd_, allow_blob_, + allow_refresh_); + + InternalIterator* internal_iter = db_impl_->NewInternalIterator( + read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator(), + latest_seq); + SetIterUnderDBIter(internal_iter); + } else { + db_iter_->set_sequence(latest_seq); + db_iter_->set_valid(false); + } + return Status::OK(); +} + +ArenaWrappedDBIter* NewArenaWrappedDbIterator( + Env* env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, uint64_t version_number, + ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + bool allow_blob, bool allow_refresh) { + ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); + iter->Init(env, read_options, cf_options, mutable_cf_options, sequence, + max_sequential_skip_in_iterations, version_number, read_callback, + db_impl, cfd, allow_blob, allow_refresh); + if (db_impl != nullptr && cfd != nullptr && allow_refresh) { + iter->StoreRefreshInfo(read_options, db_impl, cfd, read_callback, + allow_blob); + } + + return iter; +} + +} // namespace rocksdb diff --git a/db/arena_wrapped_db_iter.h b/db/arena_wrapped_db_iter.h new file mode 100644 index 00000000000..6dbd64521be --- /dev/null +++ b/db/arena_wrapped_db_iter.h @@ -0,0 +1,112 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include "db/db_impl/db_impl.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/range_del_aggregator.h" +#include "memory/arena.h" +#include "options/cf_options.h" +#include "rocksdb/db.h" +#include "rocksdb/iterator.h" +#include "util/autovector.h" + +namespace rocksdb { + +class Arena; + +// A wrapper iterator which wraps DB Iterator and the arena, with which the DB +// iterator is supposed to be allocated. This class is used as an entry point of +// a iterator hierarchy whose memory can be allocated inline. In that way, +// accessing the iterator tree can be more cache friendly. It is also faster +// to allocate. +// When using the class's Iterator interface, the behavior is exactly +// the same as the inner DBIter. +class ArenaWrappedDBIter : public Iterator { + public: + virtual ~ArenaWrappedDBIter() { db_iter_->~DBIter(); } + + // Get the arena to be used to allocate memory for DBIter to be wrapped, + // as well as child iterators in it. 
+ virtual Arena* GetArena() { return &arena_; } + virtual ReadRangeDelAggregator* GetRangeDelAggregator() { + return db_iter_->GetRangeDelAggregator(); + } + + // Set the internal iterator wrapped inside the DB Iterator. Usually it is + // a merging iterator. + virtual void SetIterUnderDBIter(InternalIterator* iter) { + static_cast(db_iter_)->SetIter(iter); + } + + virtual bool Valid() const override { return db_iter_->Valid(); } + virtual void SeekToFirst() override { db_iter_->SeekToFirst(); } + virtual void SeekToLast() override { db_iter_->SeekToLast(); } + virtual void Seek(const Slice& target) override { db_iter_->Seek(target); } + virtual void SeekForPrev(const Slice& target) override { + db_iter_->SeekForPrev(target); + } + virtual void Next() override { db_iter_->Next(); } + virtual void Prev() override { db_iter_->Prev(); } + virtual Slice key() const override { return db_iter_->key(); } + virtual Slice value() const override { return db_iter_->value(); } + virtual Status status() const override { return db_iter_->status(); } + bool IsBlob() const { return db_iter_->IsBlob(); } + + virtual Status GetProperty(std::string prop_name, std::string* prop) override; + + virtual Status Refresh() override; + + void Init(Env* env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, + const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, uint64_t version_number, + ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + bool allow_blob, bool allow_refresh); + + // Store some parameters so we can refresh the iterator at a later point + // with these same params + void StoreRefreshInfo(const ReadOptions& read_options, DBImpl* db_impl, + ColumnFamilyData* cfd, ReadCallback* read_callback, + bool allow_blob) { + read_options_ = read_options; + db_impl_ = db_impl; + cfd_ = cfd; + read_callback_ = read_callback; + allow_blob_ = allow_blob; + } + + private: + DBIter* db_iter_; + Arena arena_; + uint64_t sv_number_; + ColumnFamilyData* cfd_ = nullptr; + DBImpl* db_impl_ = nullptr; + ReadOptions read_options_; + ReadCallback* read_callback_; + bool allow_blob_ = false; + bool allow_refresh_ = true; +}; + +// Generate the arena wrapped iterator class. +// `db_impl` and `cfd` are used for reneweal. If left null, renewal will not +// be supported. +extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( + Env* env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, uint64_t version_number, + ReadCallback* read_callback, DBImpl* db_impl = nullptr, + ColumnFamilyData* cfd = nullptr, bool allow_blob = false, + bool allow_refresh = true); +} // namespace rocksdb diff --git a/utilities/blob_db/blob_index.h b/db/blob_index.h similarity index 92% rename from utilities/blob_db/blob_index.h rename to db/blob_index.h index fd91b547a84..e1d41e27410 100644 --- a/utilities/blob_db/blob_index.h +++ b/db/blob_index.h @@ -5,12 +5,14 @@ #pragma once #ifndef ROCKSDB_LITE +#include +#include + #include "rocksdb/options.h" #include "util/coding.h" #include "util/string_util.h" namespace rocksdb { -namespace blob_db { // BlobIndex is a pointer to the blob and metadata of the blob. The index is // stored in base DB as ValueType::kTypeBlobIndex. 
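ArenaWrappedDBIter above builds its DBIter with placement new into memory handed out by its own Arena, and tears it down by invoking the destructor directly (db_iter_->~DBIter()) so the arena can be reset wholesale on Refresh(). A generic sketch of that allocate-construct-destroy pattern with a toy bump allocator (ToyArena and Widget are illustrative, not RocksDB types):

#include <cstddef>
#include <new>
#include <string>
#include <utility>

// Toy bump allocator standing in for rocksdb::Arena (illustrative only).
class ToyArena {
 public:
  void* AllocateAligned(size_t bytes) {
    // Align to max_align_t; a real arena manages blocks and reuse.
    size_t aligned = (used_ + alignof(std::max_align_t) - 1) &
                     ~(alignof(std::max_align_t) - 1);
    if (aligned + bytes > sizeof(buf_)) return nullptr;
    used_ = aligned + bytes;
    return buf_ + aligned;
  }

 private:
  alignas(std::max_align_t) char buf_[4096];
  size_t used_ = 0;
};

struct Widget {
  explicit Widget(std::string name) : name_(std::move(name)) {}
  std::string name_;
};

int main() {
  ToyArena arena;
  // Construct in arena-owned memory: no separate heap allocation for Widget,
  // so the object sits next to whatever else the arena hands out.
  void* mem = arena.AllocateAligned(sizeof(Widget));
  Widget* w = new (mem) Widget("inline");
  // The arena reclaims its bytes wholesale, so the object must be destroyed
  // manually -- the same reason ArenaWrappedDBIter calls db_iter_->~DBIter().
  w->~Widget();
  return 0;
}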
@@ -109,6 +111,23 @@ class BlobIndex { return Status::OK(); } + std::string DebugString(bool output_hex) { + std::ostringstream oss; + + if (IsInlined()) { + oss << "[inlined blob] value:" << value_.ToString(output_hex); + } else { + oss << "[blob ref] file:" << file_number_ << " offset:" << offset_ + << " size:" << size_; + } + + if (HasTTL()) { + oss << " exp:" << expiration_; + } + + return oss.str(); + } + static void EncodeInlinedTTL(std::string* dst, uint64_t expiration, const Slice& value) { assert(dst != nullptr); @@ -156,6 +175,5 @@ class BlobIndex { CompressionType compression_ = kNoCompression; }; -} // namespace blob_db } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/db/builder.cc b/db/builder.cc index 7f2fd72a191..e5d08a0725f 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -13,7 +13,7 @@ #include #include -#include "db/compaction_iterator.h" +#include "db/compaction/compaction_iterator.h" #include "db/dbformat.h" #include "db/event_helpers.h" #include "db/internal_stats.h" @@ -21,6 +21,9 @@ #include "db/range_del_aggregator.h" #include "db/table_cache.h" #include "db/version_edit.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_util.h" #include "rocksdb/db.h" @@ -28,13 +31,11 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/block_based_table_builder.h" +#include "table/block_based/block_based_table_builder.h" #include "table/format.h" #include "table/internal_iterator.h" -#include "util/file_reader_writer.h" -#include "util/filename.h" +#include "test_util/sync_point.h" #include "util/stop_watch.h" -#include "util/sync_point.h" namespace rocksdb { @@ -123,7 +124,7 @@ Status BuildTable( if (!s.ok()) { EventHelpers::LogAndNotifyTableFileCreationFinished( event_logger, ioptions.listeners, dbname, column_family_name, fname, - job_id, meta->fd, tp, reason, s); + job_id, meta->fd, kInvalidBlobFileNumber, tp, reason, s); return s; } file->SetIOPriority(io_priority); @@ -156,8 +157,9 @@ Status BuildTable( for (; c_iter.Valid(); c_iter.Next()) { const Slice& key = c_iter.key(); const Slice& value = c_iter.value(); + const ParsedInternalKey& ikey = c_iter.ikey(); builder->Add(key, value); - meta->UpdateBoundaries(key, c_iter.ikey().sequence); + meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type); // TODO(noetzli): Update stats after flush, too. if (io_priority == Env::IO_HIGH && @@ -221,8 +223,9 @@ Status BuildTable( mutable_cf_options.prefix_extractor.get(), nullptr, (internal_stats == nullptr) ? nullptr : internal_stats->GetFileReadHist(0), - false /* for_compaction */, nullptr /* arena */, - false /* skip_filter */, level)); + TableReaderCaller::kFlush, /*arena=*/nullptr, + /*skip_filter=*/false, level, /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key*/ nullptr)); s = it->status(); if (s.ok() && paranoid_file_checks) { for (it->SeekToFirst(); it->Valid(); it->Next()) { @@ -241,10 +244,13 @@ Status BuildTable( env->DeleteFile(fname); } + if (meta->fd.GetFileSize() == 0) { + fname = "(nil)"; + } // Output to event logger and fire events. 
EventHelpers::LogAndNotifyTableFileCreationFinished( event_logger, ioptions.listeners, dbname, column_family_name, fname, - job_id, meta->fd, tp, reason, s); + job_id, meta->fd, meta->oldest_blob_file_number, tp, reason, s); return s; } diff --git a/db/builder.h b/db/builder.h index 34a4bff1a25..4fa56f50e34 100644 --- a/db/builder.h +++ b/db/builder.h @@ -11,6 +11,7 @@ #include #include "db/range_tombstone_fragmenter.h" #include "db/table_properties_collector.h" +#include "logging/event_logger.h" #include "options/cf_options.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" @@ -20,7 +21,6 @@ #include "rocksdb/table_properties.h" #include "rocksdb/types.h" #include "table/scoped_arena_iterator.h" -#include "util/event_logger.h" namespace rocksdb { diff --git a/db/c.cc b/db/c.cc index aac1cf4087c..76007e9175a 100644 --- a/db/c.cc +++ b/db/c.cc @@ -517,6 +517,21 @@ rocksdb_t* rocksdb_open_for_read_only( return result; } +rocksdb_t* rocksdb_open_as_secondary(const rocksdb_options_t* options, + const char* name, + const char* secondary_path, + char** errptr) { + DB* db; + if (SaveError(errptr, + DB::OpenAsSecondary(options->rep, std::string(name), + std::string(secondary_path), &db))) { + return nullptr; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + rocksdb_backup_engine_t* rocksdb_backup_engine_open( const rocksdb_options_t* options, const char* path, char** errptr) { BackupEngine* be; @@ -717,6 +732,37 @@ rocksdb_t* rocksdb_open_for_read_only_column_families( return result; } +rocksdb_t* rocksdb_open_as_secondary_column_families( + const rocksdb_options_t* db_options, const char* name, + const char* secondary_path, int num_column_families, + const char** column_family_names, + const rocksdb_options_t** column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr) { + std::vector column_families; + for (int i = 0; i != num_column_families; ++i) { + column_families.emplace_back( + std::string(column_family_names[i]), + ColumnFamilyOptions(column_family_options[i]->rep)); + } + DB* db; + std::vector handles; + if (SaveError(errptr, DB::OpenAsSecondary(DBOptions(db_options->rep), + std::string(name), + std::string(secondary_path), + column_families, &handles, &db))) { + return nullptr; + } + for (size_t i = 0; i != handles.size(); ++i) { + rocksdb_column_family_handle_t* c_handle = + new rocksdb_column_family_handle_t; + c_handle->rep = handles[i]; + column_family_handles[i] = c_handle; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + char** rocksdb_list_column_families( const rocksdb_options_t* options, const char* name, @@ -988,7 +1034,7 @@ void rocksdb_wal_iter_destroy (const rocksdb_wal_iterator_t* iter) { rocksdb_writebatch_t* rocksdb_wal_iter_get_batch (const rocksdb_wal_iterator_t* iter, uint64_t* seq) { rocksdb_writebatch_t* result = rocksdb_writebatch_create(); BatchResult wal_batch = iter->rep->GetBatch(); - result->rep = * wal_batch.writeBatchPtr.release(); + result->rep = std::move(*wal_batch.writeBatchPtr); if (seq != nullptr) { *seq = wal_batch.sequence; } @@ -2226,11 +2272,6 @@ void rocksdb_options_set_max_bytes_for_level_base( opt->rep.max_bytes_for_level_base = n; } -void rocksdb_options_set_snap_refresh_nanos(rocksdb_options_t* opt, - uint64_t n) { - opt->rep.snap_refresh_nanos = n; -} - void rocksdb_options_set_level_compaction_dynamic_level_bytes( rocksdb_options_t* opt, unsigned char v) { opt->rep.level_compaction_dynamic_level_bytes = v; @@ -2468,11 +2509,21 @@ 
void rocksdb_options_set_max_write_buffer_number_to_maintain( opt->rep.max_write_buffer_number_to_maintain = n; } +void rocksdb_options_set_max_write_buffer_size_to_maintain( + rocksdb_options_t* opt, int64_t n) { + opt->rep.max_write_buffer_size_to_maintain = n; +} + void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt, unsigned char v) { opt->rep.enable_pipelined_write = v; } +void rocksdb_options_set_unordered_write(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.unordered_write = v; +} + void rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt, uint32_t n) { opt->rep.max_subcompactions = n; @@ -3168,6 +3219,11 @@ void rocksdb_writeoptions_set_low_pri( opt->rep.low_pri = v; } +void rocksdb_writeoptions_set_memtable_insert_hint_per_batch( + rocksdb_writeoptions_t* opt, unsigned char v) { + opt->rep.memtable_insert_hint_per_batch = v; +} + rocksdb_compactoptions_t* rocksdb_compactoptions_create() { return new rocksdb_compactoptions_t; } @@ -3268,6 +3324,22 @@ void rocksdb_env_join_all_threads(rocksdb_env_t* env) { env->rep->WaitForJoin(); } +void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolIOPriority(); +} + +void rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolIOPriority(Env::HIGH); +} + +void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolCPUPriority(); +} + +void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolCPUPriority(Env::HIGH); +} + void rocksdb_env_destroy(rocksdb_env_t* env) { if (!env->is_default) delete env->rep; delete env; @@ -3402,6 +3474,10 @@ void rocksdb_ingest_external_file_cf( SaveError(errptr, db->rep->IngestExternalFile(handle->rep, files, opt->rep)); } +void rocksdb_try_catch_up_with_primary(rocksdb_t* db, char** errptr) { + SaveError(errptr, db->rep->TryCatchUpWithPrimary()); +} + rocksdb_slicetransform_t* rocksdb_slicetransform_create( void* state, void (*destructor)(void*), diff --git a/db/c_test.c b/db/c_test.c index 64241df287b..e851aad53f2 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -45,6 +45,7 @@ static char sstfilename[200]; static char dbbackupname[200]; static char dbcheckpointname[200]; static char dbpathname[200]; +static char secondary_path[200]; static void StartPhase(const char* name) { fprintf(stderr, "=== Test %s\n", name); @@ -1466,6 +1467,7 @@ int main(int argc, char** argv) { CheckCondition(!rocksdb_iter_valid(iter)); rocksdb_iter_destroy(iter); + rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0); } } @@ -1722,6 +1724,59 @@ int main(int argc, char** argv) { CheckNoError(err); } + // Check that secondary instance works. 
+ StartPhase("open_as_secondary"); + { + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + + rocksdb_options_t* db_options = rocksdb_options_create(); + rocksdb_options_set_create_if_missing(db_options, 1); + db = rocksdb_open(db_options, dbname, &err); + CheckNoError(err); + rocksdb_t* db1; + rocksdb_options_t* opts = rocksdb_options_create(); + rocksdb_options_set_max_open_files(opts, -1); + rocksdb_options_set_create_if_missing(opts, 1); + snprintf(secondary_path, sizeof(secondary_path), + "%s/rocksdb_c_test_secondary-%d", GetTempDir(), ((int)geteuid())); + db1 = rocksdb_open_as_secondary(opts, dbname, secondary_path, &err); + CheckNoError(err); + + rocksdb_writeoptions_set_sync(woptions, 0); + rocksdb_writeoptions_disable_WAL(woptions, 1); + rocksdb_put(db, woptions, "key0", 4, "value0", 6, &err); + CheckNoError(err); + rocksdb_flushoptions_t* flush_opts = rocksdb_flushoptions_create(); + rocksdb_flushoptions_set_wait(flush_opts, 1); + rocksdb_flush(db, flush_opts, &err); + CheckNoError(err); + rocksdb_try_catch_up_with_primary(db1, &err); + CheckNoError(err); + rocksdb_readoptions_t* ropts = rocksdb_readoptions_create(); + rocksdb_readoptions_set_verify_checksums(ropts, 1); + rocksdb_readoptions_set_snapshot(ropts, NULL); + CheckGet(db, ropts, "key0", "value0"); + CheckGet(db1, ropts, "key0", "value0"); + + rocksdb_writeoptions_disable_WAL(woptions, 0); + rocksdb_put(db, woptions, "key1", 4, "value1", 6, &err); + CheckNoError(err); + rocksdb_try_catch_up_with_primary(db1, &err); + CheckNoError(err); + CheckGet(db1, ropts, "key0", "value0"); + CheckGet(db1, ropts, "key1", "value1"); + + rocksdb_close(db1); + rocksdb_destroy_db(opts, secondary_path, &err); + CheckNoError(err); + + rocksdb_options_destroy(db_options); + rocksdb_options_destroy(opts); + rocksdb_readoptions_destroy(ropts); + rocksdb_flushoptions_destroy(flush_opts); + } + // Simple sanity check that options setting db_paths work. 
StartPhase("open_db_paths"); { diff --git a/db/column_family.cc b/db/column_family.cc index 4592c945f2e..f66759818e8 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -9,34 +9,32 @@ #include "db/column_family.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include -#include -#include #include +#include #include +#include +#include -#include "db/compaction_picker.h" -#include "db/compaction_picker_fifo.h" -#include "db/compaction_picker_universal.h" -#include "db/db_impl.h" +#include "db/compaction/compaction_picker.h" +#include "db/compaction/compaction_picker_fifo.h" +#include "db/compaction/compaction_picker_level.h" +#include "db/compaction/compaction_picker_universal.h" +#include "db/db_impl/db_impl.h" #include "db/internal_stats.h" #include "db/job_context.h" #include "db/range_del_aggregator.h" #include "db/table_properties_collector.h" #include "db/version_set.h" #include "db/write_controller.h" +#include "file/sst_file_manager_impl.h" #include "memtable/hash_skiplist_rep.h" #include "monitoring/thread_status_util.h" #include "options/options_helper.h" -#include "table/block_based_table_factory.h" +#include "port/port.h" +#include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "util/autovector.h" #include "util/compression.h" -#include "util/sst_file_manager_impl.h" namespace rocksdb { @@ -190,6 +188,11 @@ Status CheckCFPathsSupported(const DBOptions& db_options, return Status::OK(); } +namespace { +const uint64_t kDefaultTtl = 0xfffffffffffffffe; +const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe; +}; // namespace + ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, const ColumnFamilyOptions& src) { ColumnFamilyOptions result = src; @@ -230,7 +233,14 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, if (result.max_write_buffer_number < 2) { result.max_write_buffer_number = 2; } - if (result.max_write_buffer_number_to_maintain < 0) { + // fall back max_write_buffer_number_to_maintain if + // max_write_buffer_size_to_maintain is not set + if (result.max_write_buffer_size_to_maintain < 0) { + result.max_write_buffer_size_to_maintain = + result.max_write_buffer_number * + static_cast(result.write_buffer_size); + } else if (result.max_write_buffer_size_to_maintain == 0 && + result.max_write_buffer_number_to_maintain < 0) { result.max_write_buffer_number_to_maintain = result.max_write_buffer_number; } // bloom filter size shouldn't exceed 1/4 of memtable size. @@ -338,6 +348,61 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, result.max_compaction_bytes = result.target_file_size_base * 25; } + bool is_block_based_table = + (result.table_factory->Name() == BlockBasedTableFactory().Name()); + + const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60; + if (result.ttl == kDefaultTtl) { + if (is_block_based_table && + result.compaction_style != kCompactionStyleFIFO) { + result.ttl = kAdjustedTtl; + } else { + result.ttl = 0; + } + } + + const uint64_t kAdjustedPeriodicCompSecs = 30 * 24 * 60 * 60; + + // Turn on periodic compactions and set them to occur once every 30 days if + // compaction filters are used and periodic_compaction_seconds is set to the + // default value. 
+ if (result.compaction_style != kCompactionStyleFIFO) { + if ((result.compaction_filter != nullptr || + result.compaction_filter_factory != nullptr) && + result.periodic_compaction_seconds == kDefaultPeriodicCompSecs && + is_block_based_table) { + result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs; + } + } else { + // result.compaction_style == kCompactionStyleFIFO + if (result.ttl == 0) { + if (is_block_based_table) { + if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) { + result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs; + } + result.ttl = result.periodic_compaction_seconds; + } + } else if (result.periodic_compaction_seconds != 0) { + result.ttl = std::min(result.ttl, result.periodic_compaction_seconds); + } + } + + // TTL compactions would work similar to Periodic Compactions in Universal in + // most of the cases. So, if ttl is set, execute the periodic compaction + // codepath. + if (result.compaction_style == kCompactionStyleUniversal && result.ttl != 0) { + if (result.periodic_compaction_seconds != 0) { + result.periodic_compaction_seconds = + std::min(result.ttl, result.periodic_compaction_seconds); + } else { + result.periodic_compaction_seconds = result.ttl; + } + } + + if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) { + result.periodic_compaction_seconds = 0; + } + return result; } @@ -408,7 +473,8 @@ ColumnFamilyData::ColumnFamilyData( uint32_t id, const std::string& name, Version* _dummy_versions, Cache* _table_cache, WriteBufferManager* write_buffer_manager, const ColumnFamilyOptions& cf_options, const ImmutableDBOptions& db_options, - const EnvOptions& env_options, ColumnFamilySet* column_family_set) + const EnvOptions& env_options, ColumnFamilySet* column_family_set, + BlockCacheTracer* const block_cache_tracer) : id_(id), name_(name), dummy_versions_(_dummy_versions), @@ -425,7 +491,8 @@ ColumnFamilyData::ColumnFamilyData( write_buffer_manager_(write_buffer_manager), mem_(nullptr), imm_(ioptions_.min_write_buffer_number_to_merge, - ioptions_.max_write_buffer_number_to_maintain), + ioptions_.max_write_buffer_number_to_maintain, + ioptions_.max_write_buffer_size_to_maintain), super_version_(nullptr), super_version_number_(0), local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)), @@ -448,7 +515,8 @@ ColumnFamilyData::ColumnFamilyData( if (_dummy_versions != nullptr) { internal_stats_.reset( new InternalStats(ioptions_.num_levels, db_options.env, this)); - table_cache_.reset(new TableCache(ioptions_, env_options, _table_cache)); + table_cache_.reset(new TableCache(ioptions_, env_options, _table_cache, + block_cache_tracer)); if (ioptions_.compaction_style == kCompactionStyleLevel) { compaction_picker_.reset( new LevelCompactionPicker(ioptions_, &internal_comparator_)); @@ -922,8 +990,12 @@ bool ColumnFamilyData::NeedsCompaction() const { Compaction* ColumnFamilyData::PickCompaction( const MutableCFOptions& mutable_options, LogBuffer* log_buffer) { + SequenceNumber earliest_mem_seqno = + std::min(mem_->GetEarliestSequenceNumber(), + imm_.current()->GetEarliestSequenceNumber(false)); auto* result = compaction_picker_->PickCompaction( - GetName(), mutable_options, current_->storage_info(), log_buffer); + GetName(), mutable_options, current_->storage_info(), log_buffer, + earliest_mem_seqno); if (result != nullptr) { result->SetInputVersion(current_); } @@ -1147,13 +1219,51 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() { } } +Status ColumnFamilyData::ValidateOptions( + const DBOptions& db_options, 
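The interaction between ttl and periodic_compaction_seconds introduced above reads more easily as a single decision function. The following is a condensed sketch of that logic, not the actual SanitizeOptions code; kDefault mirrors the 0xfffffffffffffffe sentinel used for both options.

#include <algorithm>
#include <cstdint>

void ResolveTtlAndPeriodicSketch(bool fifo, bool universal, bool block_based,
                                 bool has_compaction_filter, uint64_t& ttl,
                                 uint64_t& periodic) {
  const uint64_t kDefault = 0xfffffffffffffffe;
  const uint64_t kThirtyDays = 30 * 24 * 60 * 60;
  if (!fifo) {
    // Compaction filters only run when data is compacted, so force a periodic
    // pass at least every 30 days when a filter is configured.
    if (has_compaction_filter && periodic == kDefault && block_based) {
      periodic = kThirtyDays;
    }
  } else if (ttl == 0) {
    if (block_based) {
      // FIFO without TTL: let the periodic default drive the TTL.
      if (periodic == kDefault) periodic = kThirtyDays;
      ttl = periodic;
    }
  } else if (periodic != 0) {
    // FIFO with both set: the stricter limit wins for TTL.
    ttl = std::min(ttl, periodic);
  }
  if (universal && ttl != 0) {
    // Universal reuses the periodic-compaction code path for TTL.
    periodic = (periodic != 0) ? std::min(ttl, periodic) : ttl;
  }
  if (periodic == kDefault) periodic = 0;  // nothing asked for it
}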
const ColumnFamilyOptions& cf_options) { + Status s; + s = CheckCompressionSupported(cf_options); + if (s.ok() && db_options.allow_concurrent_memtable_write) { + s = CheckConcurrentWritesSupported(cf_options); + } + if (s.ok()) { + s = CheckCFPathsSupported(db_options, cf_options); + } + if (!s.ok()) { + return s; + } + + if (cf_options.ttl > 0 && cf_options.ttl != kDefaultTtl) { + if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) { + return Status::NotSupported( + "TTL is only supported in Block-Based Table format. "); + } + } + + if (cf_options.periodic_compaction_seconds > 0 && + cf_options.periodic_compaction_seconds != kDefaultPeriodicCompSecs) { + if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) { + return Status::NotSupported( + "Periodic Compaction is only supported in " + "Block-Based Table format. "); + } + } + return s; +} + #ifndef ROCKSDB_LITE Status ColumnFamilyData::SetOptions( - const std::unordered_map& options_map) { + const DBOptions& db_options, + const std::unordered_map& options_map) { MutableCFOptions new_mutable_cf_options; Status s = GetMutableOptionsFromStrings(mutable_cf_options_, options_map, ioptions_.info_log, &new_mutable_cf_options); + if (s.ok()) { + ColumnFamilyOptions cf_options = + BuildColumnFamilyOptions(initial_cf_options_, new_mutable_cf_options); + s = ValidateOptions(db_options, cf_options); + } if (s.ok()) { mutable_cf_options_ = new_mutable_cf_options; mutable_cf_options_.RefreshDerivedOptions(ioptions_); @@ -1210,18 +1320,20 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const EnvOptions& env_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller) + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer) : max_column_family_(0), - dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, nullptr, - ColumnFamilyOptions(), *db_options, - env_options, nullptr)), + dummy_cfd_(new ColumnFamilyData( + 0, "", nullptr, nullptr, nullptr, ColumnFamilyOptions(), *db_options, + env_options, nullptr, block_cache_tracer)), default_cfd_cache_(nullptr), db_name_(dbname), db_options_(db_options), env_options_(env_options), table_cache_(table_cache), write_buffer_manager_(write_buffer_manager), - write_controller_(write_controller) { + write_controller_(write_controller), + block_cache_tracer_(block_cache_tracer) { // initialize linked list dummy_cfd_->prev_ = dummy_cfd_; dummy_cfd_->next_ = dummy_cfd_; @@ -1289,7 +1401,7 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( assert(column_families_.find(name) == column_families_.end()); ColumnFamilyData* new_cfd = new ColumnFamilyData( id, name, dummy_versions, table_cache_, write_buffer_manager_, options, - *db_options_, env_options_, this); + *db_options_, env_options_, this, block_cache_tracer_); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); max_column_family_ = std::max(max_column_family_, id); diff --git a/db/column_family.h b/db/column_family.h index 7a1ae85bfd3..135504ea21b 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -24,6 +24,7 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/options.h" +#include "trace_replay/block_cache_tracer.h" #include "util/thread_local.h" namespace rocksdb { @@ -45,6 +46,112 @@ class InstrumentedMutexLock; struct SuperVersionContext; extern const double kIncSlowdownRatio; +// This file contains a list of data structures for managing column family +// level metadata. 
+// +// The basic relationships among classes declared here are illustrated as +// following: +// +// +----------------------+ +----------------------+ +--------+ +// +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl | +// | +----------------------+ | +----------------------+ +----+---+ +// | +--------------------------+ | +// | | +-----------------------------+ +// | | | +// | | +-----------------------------v-------------------------------+ +// | | | | +// | | | ColumnFamilySet | +// | | | | +// | | +-------------+--------------------------+----------------+---+ +// | | | | | +// | +-------------------------------------+ | | +// | | | | v +// | +-------------v-------------+ +-----v----v---------+ +// | | | | | +// | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ...... +// | | | | | +// +---> | | | +// | +---------+ | | +// | | MemTable| | | +// | | List | | | +// +--------+---+--+-+----+----+ +--------------------++ +// | | | | +// | | | | +// | | | +-----------------------+ +// | | +-----------+ | +// v +--------+ | | +// +--------+--------+ | | | +// | | | | +----------v----------+ +// +---> |SuperVersion 1.a +-----------------> | +// | +------+ | | MemTableListVersion | +// +---+-------------+ | | | | | +// | | | | +----+------------+---+ +// | current | | | | | +// | +-------------+ | |mem | | +// | | | | | | +// +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+ +// | | | | | | | | +// | Version 1.a | | memtable | | memtable | | memtable | +// | | | 1.a | | 1.b | | 1.c | +// +-------------+ | | | | | | +// +----------+ +----------+ +----------+ +// +// DBImpl keeps a ColumnFamilySet, which references to all column families by +// pointing to respective ColumnFamilyData object of each column family. +// This is how DBImpl can list and operate on all the column families. +// ColumnFamilyHandle also points to ColumnFamilyData directly, so that +// when a user executes a query, it can directly find memtables and Version +// as well as SuperVersion to the column family, without going through +// ColumnFamilySet. +// +// ColumnFamilySet points to the latest view of the LSM-tree (list of memtables +// and SST files) indirectly, while ongoing operations may hold references +// to a current or an out-of-date SuperVersion, which in turn points to a +// point-in-time view of the LSM-tree. This guarantees the memtables and SST +// files being operated on will not go away, until the SuperVersion is +// unreferenced to 0 and destoryed. 
+// +// The following graph illustrates a possible referencing relationships: +// +// Column +--------------+ current +-----------+ +// Family +---->+ +------------------->+ | +// Data | SuperVersion +----------+ | Version A | +// | 3 | imm | | | +// Iter2 +----->+ | +-------v------+ +-----------+ +// +-----+--------+ | MemtableList +----------------> Empty +// | | Version r | +-----------+ +// | +--------------+ | | +// +------------------+ current| Version B | +// +--------------+ | +----->+ | +// | | | | +-----+-----+ +// Compaction +>+ SuperVersion +-------------+ ^ +// Job | 2 +------+ | |current +// | +----+ | | mem | +------------+ +// +--------------+ | | +---------------------> | +// | +------------------------> MemTable a | +// | mem | | | +// +--------------+ | | +------------+ +// | +--------------------------+ +// Iter1 +-----> SuperVersion | | +------------+ +// | 1 +------------------------------>+ | +// | +-+ | mem | MemTable b | +// +--------------+ | | | | +// | | +--------------+ +-----^------+ +// | |imm | MemtableList | | +// | +--->+ Version s +------------+ +// | +--------------+ +// | +--------------+ +// | | MemtableList | +// +------>+ Version t +--------> Empty +// imm +--------------+ +// +// In this example, even if the current LSM-tree consists of Version A and +// memtable a, which is also referenced by SuperVersion, two older SuperVersion +// SuperVersion2 and Superversion1 still exist, and are referenced by a +// compaction job and an old iterator Iter1, respectively. SuperVersion2 +// contains Version B, memtable a and memtable b; SuperVersion1 contains +// Version B and memtable b (mutable). As a result, Version B and memtable b +// are prevented from being destroyed or deleted. // ColumnFamilyHandleImpl is the class that clients use to access different // column families. It has non-trivial destructor, which gets called when client @@ -168,7 +275,7 @@ class ColumnFamilyData { // Ref() can only be called from a context where the caller can guarantee // that ColumnFamilyData is alive (while holding a non-zero ref already, // holding a DB mutex, or as the leader in a write batch group). - void Ref() { refs_.fetch_add(1, std::memory_order_relaxed); } + void Ref() { refs_.fetch_add(1); } // Unref decreases the reference count, but does not handle deletion // when the count goes to 0. If this method returns true then the @@ -176,7 +283,7 @@ class ColumnFamilyData { // FreeDeadColumnFamilies(). Unref() can only be called while holding // a DB mutex, or during single-threaded recovery. 
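The two diagrams above boil down to one ownership rule: whoever holds a reference to a SuperVersion keeps its memtables and Version alive, even after newer state has been installed as "current". A self-contained sketch of that rule with stand-in types (not RocksDB code) is below; it also mirrors the plain fetch_add/fetch_sub reference counting used by Ref()/Unref() nearby.

#include <atomic>
#include <memory>
#include <vector>

struct SuperVersionSketch {
  std::vector<std::shared_ptr<int>> memtables;  // stand-ins for MemTables
  std::shared_ptr<int> version;                 // stand-in for a Version
  std::atomic<int> refs{1};

  SuperVersionSketch* Ref() {
    refs.fetch_add(1);
    return this;
  }
  void Unref() {
    // Last holder frees the point-in-time view; until then, iterators and
    // compactions can keep reading the memtables/version it pins.
    if (refs.fetch_sub(1) == 1) delete this;
  }
};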
bool Unref() { - int old_refs = refs_.fetch_sub(1, std::memory_order_relaxed); + int old_refs = refs_.fetch_sub(1); assert(old_refs > 0); return old_refs == 1; } @@ -232,9 +339,13 @@ class ColumnFamilyData { bool is_delete_range_supported() { return is_delete_range_supported_; } + // Validate CF options against DB options + static Status ValidateOptions(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options); #ifndef ROCKSDB_LITE // REQUIRES: DB mutex held Status SetOptions( + const DBOptions& db_options, const std::unordered_map& options_map); #endif // ROCKSDB_LITE @@ -394,7 +505,8 @@ class ColumnFamilyData { const ColumnFamilyOptions& options, const ImmutableDBOptions& db_options, const EnvOptions& env_options, - ColumnFamilySet* column_family_set); + ColumnFamilySet* column_family_set, + BlockCacheTracer* const block_cache_tracer); uint32_t id_; const std::string name_; @@ -522,7 +634,8 @@ class ColumnFamilySet { const ImmutableDBOptions* db_options, const EnvOptions& env_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller); + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer); ~ColumnFamilySet(); ColumnFamilyData* GetDefault() const; @@ -581,6 +694,7 @@ class ColumnFamilySet { Cache* table_cache_; WriteBufferManager* write_buffer_manager_; WriteController* write_controller_; + BlockCacheTracer* const block_cache_tracer_; }; // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access diff --git a/db/column_family_test.cc b/db/column_family_test.cc index bdc832bd235..95c43ac5ae9 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -12,20 +12,22 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "memtable/hash_skiplist_rep.h" #include "options/options_parser.h" #include "port/port.h" +#include "port/stack_trace.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" +#include "rocksdb/utilities/object_registry.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" -#include "util/fault_injection_test_env.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -60,8 +62,20 @@ class EnvCounter : public EnvWrapper { class ColumnFamilyTestBase : public testing::Test { public: - ColumnFamilyTestBase(uint32_t format) : rnd_(139), format_(format) { - env_ = new EnvCounter(Env::Default()); + explicit ColumnFamilyTestBase(uint32_t format) : rnd_(139), format_(format) { + Env* base_env = Env::Default(); +#ifndef ROCKSDB_LITE + const char* test_env_uri = getenv("TEST_ENV_URI"); + if (test_env_uri) { + Status s = ObjectRegistry::NewInstance()->NewSharedObject(test_env_uri, + &env_guard_); + base_env = env_guard_.get(); + EXPECT_OK(s); + EXPECT_NE(Env::Default(), base_env); + } +#endif // !ROCKSDB_LITE + EXPECT_NE(nullptr, base_env); + env_ = new EnvCounter(base_env); dbname_ = test::PerThreadDBPath("column_family_test"); db_options_.create_if_missing = true; db_options_.fail_if_options_file_error = true; @@ -532,6 +546,7 @@ class ColumnFamilyTestBase : public testing::Test { std::string dbname_; DB* db_ = nullptr; EnvCounter* env_; + std::shared_ptr env_guard_; Random rnd_; uint32_t format_; }; @@ -1117,22 +1132,25 @@ 
TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) { default_cf.arena_block_size = 4 * 4096; default_cf.max_write_buffer_number = 10; default_cf.min_write_buffer_number_to_merge = 1; - default_cf.max_write_buffer_number_to_maintain = 0; + default_cf.max_write_buffer_size_to_maintain = 0; one.write_buffer_size = 200000; one.arena_block_size = 4 * 4096; one.max_write_buffer_number = 10; one.min_write_buffer_number_to_merge = 2; - one.max_write_buffer_number_to_maintain = 1; + one.max_write_buffer_size_to_maintain = + static_cast(one.write_buffer_size); two.write_buffer_size = 1000000; two.arena_block_size = 4 * 4096; two.max_write_buffer_number = 10; two.min_write_buffer_number_to_merge = 3; - two.max_write_buffer_number_to_maintain = 2; + two.max_write_buffer_size_to_maintain = + static_cast(two.write_buffer_size); three.write_buffer_size = 4096 * 22; three.arena_block_size = 4096; three.max_write_buffer_number = 10; three.min_write_buffer_number_to_merge = 4; - three.max_write_buffer_number_to_maintain = -1; + three.max_write_buffer_size_to_maintain = + static_cast(three.write_buffer_size); Reopen({default_cf, one, two, three}); @@ -3312,7 +3330,17 @@ TEST_P(ColumnFamilyTest, MultipleCFPathsTest) { } // namespace rocksdb +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index ce80375e0e1..4152cb37939 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -10,13 +10,13 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" namespace rocksdb { @@ -387,7 +387,7 @@ TEST_F(CompactFilesTest, GetCompactionJobInfo) { auto l0_files_1 = collector->GetFlushedFiles(); CompactionOptions co; co.compression = CompressionType::kLZ4Compression; - CompactionJobInfo compaction_job_info; + CompactionJobInfo compaction_job_info{}; ASSERT_OK( db->CompactFiles(co, l0_files_1, 0, -1, nullptr, &compaction_job_info)); ASSERT_EQ(compaction_job_info.base_input_level, 0); diff --git a/db/compacted_db_impl.cc b/db/compacted_db_impl.cc index acdaad4ec29..13cccbd7746 100644 --- a/db/compacted_db_impl.cc +++ b/db/compacted_db_impl.cc @@ -5,7 +5,7 @@ #ifndef ROCKSDB_LITE #include "db/compacted_db_impl.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "table/get_context.h" @@ -37,7 +37,7 @@ Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, const Slice& key, PinnableSlice* value) { GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound, key, value, nullptr, nullptr, - nullptr, nullptr); + true, nullptr, nullptr); LookupKey lkey(key, kMaxSequenceNumber); files_.files[FindFile(key)].fd.table_reader->Get(options, lkey.internal_key(), &get_context, nullptr); @@ -70,7 +70,7 @@ std::vector CompactedDBImpl::MultiGet(const ReadOptions& options, std::string& value = (*values)[idx]; GetContext 
get_context(user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound, keys[idx], &pinnable_val, - nullptr, nullptr, nullptr, nullptr); + nullptr, nullptr, true, nullptr, nullptr); LookupKey lkey(keys[idx], kMaxSequenceNumber); r->Get(options, lkey.internal_key(), &get_context, nullptr); value.assign(pinnable_val.data(), pinnable_val.size()); diff --git a/db/compacted_db_impl.h b/db/compacted_db_impl.h index 5c574b4b9a5..8a57c5b77eb 100644 --- a/db/compacted_db_impl.h +++ b/db/compacted_db_impl.h @@ -5,15 +5,19 @@ #pragma once #ifndef ROCKSDB_LITE -#include "db/db_impl.h" -#include #include +#include +#include "db/db_impl/db_impl.h" namespace rocksdb { class CompactedDBImpl : public DBImpl { public: CompactedDBImpl(const DBOptions& options, const std::string& dbname); + // No copying allowed + CompactedDBImpl(const CompactedDBImpl&) = delete; + void operator=(const CompactedDBImpl&) = delete; + virtual ~CompactedDBImpl(); static Status Open(const Options& options, const std::string& dbname, @@ -85,6 +89,15 @@ class CompactedDBImpl : public DBImpl { const IngestExternalFileOptions& /*ingestion_options*/) override { return Status::NotSupported("Not supported in compacted db mode."); } + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } private: friend class DB; @@ -95,10 +108,6 @@ class CompactedDBImpl : public DBImpl { Version* version_; const Comparator* user_comparator_; LevelFilesBrief files_; - - // No copying allowed - CompactedDBImpl(const CompactedDBImpl&); - void operator=(const CompactedDBImpl&); }; } #endif // ROCKSDB_LITE diff --git a/db/compaction.cc b/db/compaction/compaction.cc similarity index 97% rename from db/compaction.cc rename to db/compaction/compaction.cc index f8805376f1d..d83bb719704 100644 --- a/db/compaction.cc +++ b/db/compaction/compaction.cc @@ -7,19 +7,14 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/compaction.h" - -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include "db/column_family.h" +#include "db/compaction/compaction.h" #include "rocksdb/compaction_filter.h" +#include "test_util/sync_point.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { @@ -550,17 +545,16 @@ bool Compaction::ShouldFormSubcompactions() const { } } -uint64_t Compaction::MaxInputFileCreationTime() const { - uint64_t max_creation_time = 0; +uint64_t Compaction::MinInputFileOldestAncesterTime() const { + uint64_t min_oldest_ancester_time = port::kMaxUint64; for (const auto& file : inputs_[0].files) { - if (file->fd.table_reader != nullptr && - file->fd.table_reader->GetTableProperties() != nullptr) { - uint64_t creation_time = - file->fd.table_reader->GetTableProperties()->creation_time; - max_creation_time = std::max(max_creation_time, creation_time); + uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime(); + if (oldest_ancester_time != 0) { + min_oldest_ancester_time = + std::min(min_oldest_ancester_time, oldest_ancester_time); } } - return max_creation_time; + return min_oldest_ancester_time; } int Compaction::GetInputBaseLevel() const { diff --git a/db/compaction.h b/db/compaction/compaction.h similarity index 98% rename from db/compaction.h rename to db/compaction/compaction.h index 2cf737b676a..dec5e607e1a 100644 --- a/db/compaction.h +++ b/db/compaction/compaction.h @@ -9,11 +9,13 @@ #pragma once #include "db/version_set.h" +#include "memory/arena.h" #include "options/cf_options.h" -#include "util/arena.h" #include "util/autovector.h" namespace rocksdb { +// The file contains class Compaction, as well as some helper functions +// and data structures used by the class. // Utility for comparing sstable boundary keys. Returns -1 if either a or b is // null which provides the property that a==null indicates a key that is less @@ -63,7 +65,7 @@ class ColumnFamilyData; class VersionStorageInfo; class CompactionFilter; -// A Compaction encapsulates information about a compaction. +// A Compaction encapsulates metadata about a compaction. class Compaction { public: Compaction(VersionStorageInfo* input_version, @@ -289,7 +291,7 @@ class Compaction { uint32_t max_subcompactions() const { return max_subcompactions_; } - uint64_t MaxInputFileCreationTime() const; + uint64_t MinInputFileOldestAncesterTime() const; private: // mark (or clear) all files that are being compacted @@ -376,7 +378,7 @@ class Compaction { CompactionReason compaction_reason_; }; -// Utility function +// Return sum of sizes of all files in `files`. extern uint64_t TotalFileSize(const std::vector& files); } // namespace rocksdb diff --git a/db/compaction_iteration_stats.h b/db/compaction/compaction_iteration_stats.h similarity index 100% rename from db/compaction_iteration_stats.h rename to db/compaction/compaction_iteration_stats.h diff --git a/db/compaction_iterator.cc b/db/compaction/compaction_iterator.cc similarity index 91% rename from db/compaction_iterator.cc rename to db/compaction/compaction_iterator.cc index bce0b82dbc7..59097eec0cf 100644 --- a/db/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -3,13 +3,14 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include "db/compaction_iterator.h" +#include +#include "db/compaction/compaction_iterator.h" #include "db/snapshot_checker.h" #include "port/likely.h" #include "rocksdb/listener.h" #include "table/internal_iterator.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #define DEFINITELY_IN_SNAPSHOT(seq, snapshot) \ ((seq) <= (snapshot) && \ @@ -39,7 +40,8 @@ CompactionIterator::CompactionIterator( const CompactionFilter* compaction_filter, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, - SnapshotListFetchCallback* snap_list_callback) + const std::atomic* manual_compaction_paused, + const std::shared_ptr info_log) : CompactionIterator( input, cmp, merge_helper, last_sequence, snapshots, earliest_write_conflict_snapshot, snapshot_checker, env, @@ -47,7 +49,7 @@ CompactionIterator::CompactionIterator( std::unique_ptr( compaction ? new CompactionProxy(compaction) : nullptr), compaction_filter, shutting_down, preserve_deletes_seqnum, - snap_list_callback) {} + manual_compaction_paused, info_log) {} CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, @@ -60,7 +62,8 @@ CompactionIterator::CompactionIterator( const CompactionFilter* compaction_filter, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, - SnapshotListFetchCallback* snap_list_callback) + const std::atomic* manual_compaction_paused, + const std::shared_ptr info_log) : input_(input), cmp_(cmp), merge_helper_(merge_helper), @@ -74,12 +77,13 @@ CompactionIterator::CompactionIterator( compaction_(std::move(compaction)), compaction_filter_(compaction_filter), shutting_down_(shutting_down), + manual_compaction_paused_(manual_compaction_paused), preserve_deletes_seqnum_(preserve_deletes_seqnum), current_user_key_sequence_(0), current_user_key_snapshot_(0), merge_out_iter_(merge_helper_), current_key_committed_(false), - snap_list_callback_(snap_list_callback) { + info_log_(info_log) { assert(compaction_filter_ == nullptr || compaction_ != nullptr); assert(snapshots_ != nullptr); bottommost_level_ = @@ -87,7 +91,24 @@ CompactionIterator::CompactionIterator( if (compaction_ != nullptr) { level_ptrs_ = std::vector(compaction_->number_levels(), 0); } - ProcessSnapshotList(); + if (snapshots_->size() == 0) { + // optimize for fast path if there are no snapshots + visible_at_tip_ = true; + earliest_snapshot_iter_ = snapshots_->end(); + earliest_snapshot_ = kMaxSequenceNumber; + latest_snapshot_ = 0; + } else { + visible_at_tip_ = false; + earliest_snapshot_iter_ = snapshots_->begin(); + earliest_snapshot_ = snapshots_->at(0); + latest_snapshot_ = snapshots_->back(); + } +#ifndef NDEBUG + // findEarliestVisibleSnapshot assumes this ordering. + for (size_t i = 1; i < snapshots_->size(); ++i) { + assert(snapshots_->at(i - 1) < snapshots_->at(i)); + } +#endif input_->SetPinnedItersMgr(&pinned_iters_mgr_); TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get()); } @@ -126,6 +147,11 @@ void CompactionIterator::Next() { // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to be valid. assert(valid_key); + if (!valid_key) { + ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction", + key_.ToString(true).c_str()); + } + // Keep current_key_ in sync. 
current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); key_ = current_key_.GetInternalKey(); @@ -209,33 +235,12 @@ void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, } } -void CompactionIterator::ProcessSnapshotList() { -#ifndef NDEBUG - // findEarliestVisibleSnapshot assumes this ordering. - for (size_t i = 1; i < snapshots_->size(); ++i) { - assert(snapshots_->at(i - 1) < snapshots_->at(i)); - } -#endif - if (snapshots_->size() == 0) { - // optimize for fast path if there are no snapshots - visible_at_tip_ = true; - earliest_snapshot_iter_ = snapshots_->end(); - earliest_snapshot_ = kMaxSequenceNumber; - latest_snapshot_ = 0; - } else { - visible_at_tip_ = false; - earliest_snapshot_iter_ = snapshots_->begin(); - earliest_snapshot_ = snapshots_->at(0); - latest_snapshot_ = snapshots_->back(); - } - released_snapshots_.clear(); -} - void CompactionIterator::NextFromInput() { at_next_ = false; valid_ = false; - while (!valid_ && input_->Valid() && !IsShuttingDown()) { + while (!valid_ && input_->Valid() && !IsPausingManualCompaction() && + !IsShuttingDown()) { key_ = input_->key(); value_ = input_->value(); iter_stats_.num_input_records++; @@ -278,13 +283,6 @@ void CompactionIterator::NextFromInput() { // compaction filter). ikey_.user_key is pointing to the copy. if (!has_current_user_key_ || !cmp_->Equal(ikey_.user_key, current_user_key_)) { - num_keys_++; - // Use num_keys_ to reduce the overhead of reading current time - if (snap_list_callback_ && snapshots_->size() && - snap_list_callback_->TimeToRefresh(num_keys_)) { - snap_list_callback_->Refresh(snapshots_, latest_snapshot_); - ProcessSnapshotList(); - } // First occurrence of this user key // Copy key for output key_ = current_key_.SetInternalKey(key_, &ikey_); @@ -350,7 +348,18 @@ void CompactionIterator::NextFromInput() { // not compact out. We will keep this Put, but can drop it's data. // (See Optimization 3, below.) assert(ikey_.type == kTypeValue); + if (ikey_.type != kTypeValue) { + ROCKS_LOG_FATAL(info_log_, + "Unexpected key type %d for compaction output", + ikey_.type); + } assert(current_user_key_snapshot_ == last_snapshot); + if (current_user_key_snapshot_ != last_snapshot) { + ROCKS_LOG_FATAL(info_log_, + "current_user_key_snapshot_ (%" PRIu64 + ") != last_snapshot (%" PRIu64 ")", + current_user_key_snapshot_, last_snapshot); + } value_.clear(); valid_ = true; @@ -492,20 +501,12 @@ void CompactionIterator::NextFromInput() { // checking since there has already been a record returned for this key // in this snapshot. assert(last_sequence >= current_user_key_sequence_); - - // Note2: if last_snapshot < current_user_key_snapshot, it can only - // mean last_snapshot is released between we process last value and - // this value, and findEarliestVisibleSnapshot returns the next snapshot - // as current_user_key_snapshot. In this case last value and current - // value are both in current_user_key_snapshot currently. - // Although last_snapshot is released we might still get a definitive - // response when key sequence number changes, e.g., when seq is determined - // too old and visible in all snapshots. 
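The assert plus ROCKS_LOG_FATAL pairs added throughout this file follow one pattern: the assert documents the invariant for debug builds, while the explicit check keeps release builds, where NDEBUG compiles the assert away, from silently running past a violated invariant. A hypothetical macro capturing the shape of the pattern (this change writes the two statements out by hand instead):

#include <cassert>
#include "logging/logging.h"  // ROCKS_LOG_FATAL

// Hypothetical helper, shown only to name the pattern.
#define CHECK_INVARIANT_SKETCH(cond, logger, msg) \
  do {                                            \
    assert(cond);                                 \
    if (!(cond)) {                                \
      ROCKS_LOG_FATAL((logger), "%s", (msg));     \
    }                                             \
  } while (0)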
- assert(last_snapshot == current_user_key_snapshot_ || - (snapshot_checker_ != nullptr && - snapshot_checker_->CheckInSnapshot(current_user_key_sequence_, - last_snapshot) != - SnapshotCheckerResult::kNotInSnapshot)); + if (last_sequence < current_user_key_sequence_) { + ROCKS_LOG_FATAL(info_log_, + "last_sequence (%" PRIu64 + ") < current_user_key_sequence_ (%" PRIu64 ")", + last_sequence, current_user_key_sequence_); + } ++iter_stats_.num_record_drop_hidden; // (A) input_->Next(); @@ -589,6 +590,10 @@ void CompactionIterator::NextFromInput() { // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to valid. assert(valid_key); + if (!valid_key) { + ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction", + key_.ToString(true).c_str()); + } // Keep current_key_ in sync. current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); key_ = current_key_.GetInternalKey(); @@ -627,6 +632,10 @@ void CompactionIterator::NextFromInput() { if (!valid_ && IsShuttingDown()) { status_ = Status::ShutdownInProgress(); } + + if (IsPausingManualCompaction()) { + status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } } void CompactionIterator::PrepareOutput() { @@ -645,6 +654,11 @@ void CompactionIterator::PrepareOutput() { ikeyNotNeededForIncrementalSnapshot() && bottommost_level_ && valid_ && IN_EARLIEST_SNAPSHOT(ikey_.sequence) && ikey_.type != kTypeMerge) { assert(ikey_.type != kTypeDeletion && ikey_.type != kTypeSingleDeletion); + if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) { + ROCKS_LOG_FATAL(info_log_, + "Unexpected key type %d for seq-zero optimization", + ikey_.type); + } ikey_.sequence = 0; current_key_.UpdateInternalKey(0, ikey_.type); } @@ -653,6 +667,10 @@ void CompactionIterator::PrepareOutput() { inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot( SequenceNumber in, SequenceNumber* prev_snapshot) { assert(snapshots_->size()); + if (snapshots_->size() == 0) { + ROCKS_LOG_FATAL(info_log_, + "No snapshot left in findEarliestVisibleSnapshot"); + } auto snapshots_iter = std::lower_bound( snapshots_->begin(), snapshots_->end(), in); if (snapshots_iter == snapshots_->begin()) { @@ -660,6 +678,10 @@ inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot( } else { *prev_snapshot = *std::prev(snapshots_iter); assert(*prev_snapshot < in); + if (*prev_snapshot >= in) { + ROCKS_LOG_FATAL(info_log_, + "*prev_snapshot >= in in findEarliestVisibleSnapshot"); + } } if (snapshot_checker_ == nullptr) { return snapshots_iter != snapshots_->end() @@ -669,6 +691,9 @@ inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot( for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) { auto cur = *snapshots_iter; assert(in <= cur); + if (in > cur) { + ROCKS_LOG_FATAL(info_log_, "in > cur in findEarliestVisibleSnapshot"); + } // Skip if cur is in released_snapshots. 
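The new manual_compaction_paused_ pointer gives manual compactions a cooperative cancellation point: a controller thread flips an atomic, the iterator polls it with relaxed ordering in its input loop (see IsPausingManualCompaction() further below), and winds down with Status::Incomplete(kManualCompactionPaused). A minimal sketch of that handshake, with stand-alone names that are illustrative rather than the actual DBImpl plumbing:

#include <atomic>

std::atomic<bool> manual_compaction_paused{false};

// Controller side: request that in-flight manual compactions wind down.
void RequestManualCompactionPauseSketch() {
  manual_compaction_paused.store(true, std::memory_order_relaxed);
}

// Worker side, mirroring IsPausingManualCompaction() in this file:
// best-effort polling, so relaxed ordering is sufficient.
bool IsPausingManualCompactionSketch() {
  return manual_compaction_paused.load(std::memory_order_relaxed);
}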
if (has_released_snapshot && released_snapshots_.count(cur) > 0) { continue; @@ -693,9 +718,14 @@ inline bool CompactionIterator::ikeyNotNeededForIncrementalSnapshot() { bool CompactionIterator::IsInEarliestSnapshot(SequenceNumber sequence) { assert(snapshot_checker_ != nullptr); - assert(earliest_snapshot_ == kMaxSequenceNumber || - (earliest_snapshot_iter_ != snapshots_->end() && - *earliest_snapshot_iter_ == earliest_snapshot_)); + bool pre_condition = (earliest_snapshot_ == kMaxSequenceNumber || + (earliest_snapshot_iter_ != snapshots_->end() && + *earliest_snapshot_iter_ == earliest_snapshot_)); + assert(pre_condition); + if (!pre_condition) { + ROCKS_LOG_FATAL(info_log_, + "Pre-Condition is not hold in IsInEarliestSnapshot"); + } auto in_snapshot = snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_); while (UNLIKELY(in_snapshot == SnapshotCheckerResult::kSnapshotReleased)) { @@ -714,6 +744,10 @@ bool CompactionIterator::IsInEarliestSnapshot(SequenceNumber sequence) { snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_); } assert(in_snapshot != SnapshotCheckerResult::kSnapshotReleased); + if (in_snapshot == SnapshotCheckerResult::kSnapshotReleased) { + ROCKS_LOG_FATAL(info_log_, + "Unexpected released snapshot in IsInEarliestSnapshot"); + } return in_snapshot == SnapshotCheckerResult::kInSnapshot; } diff --git a/db/compaction_iterator.h b/db/compaction/compaction_iterator.h similarity index 66% rename from db/compaction_iterator.h rename to db/compaction/compaction_iterator.h index 6ab43b1becf..1e08b407d28 100644 --- a/db/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -10,8 +10,8 @@ #include #include -#include "db/compaction.h" -#include "db/compaction_iteration_stats.h" +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_iteration_stats.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/range_del_aggregator.h" @@ -21,53 +21,6 @@ namespace rocksdb { -// This callback can be used to refresh the snapshot list from the db. It -// includes logics to exponentially decrease the refresh rate to limit the -// overhead of refresh. -class SnapshotListFetchCallback { - public: - SnapshotListFetchCallback(Env* env, uint64_t snap_refresh_nanos, - size_t every_nth_key = 1024) - : timer_(env, /*auto restart*/ true), - snap_refresh_nanos_(snap_refresh_nanos), - every_nth_key_minus_one_(every_nth_key - 1) { - assert(every_nth_key > 0); - assert((ceil(log2(every_nth_key)) == floor(log2(every_nth_key)))); - } - // Refresh the snapshot list. snapshots will bre replacted with the new list. - // max is the upper bound. Note: this function will acquire the db_mutex_. - virtual void Refresh(std::vector* snapshots, - SequenceNumber max) = 0; - inline bool TimeToRefresh(const size_t key_index) { - // skip the key if key_index % every_nth_key (which is of power 2) is not 0. 
- if ((key_index & every_nth_key_minus_one_) != 0) { - return false; - } - const uint64_t elapsed = timer_.ElapsedNanos(); - auto ret = elapsed > snap_refresh_nanos_; - // pre-compute the next time threshold - if (ret) { - // inc next refresh period exponentially (by x4) - auto next_refresh_threshold = snap_refresh_nanos_ << 2; - // make sure the shift has not overflown the highest 1 bit - snap_refresh_nanos_ = - std::max(snap_refresh_nanos_, next_refresh_threshold); - } - return ret; - } - static constexpr SnapshotListFetchCallback* kDisabled = nullptr; - - virtual ~SnapshotListFetchCallback() {} - - private: - // Time since the callback was created - StopWatchNano timer_; - // The delay before calling ::Refresh. To be increased exponentially. - uint64_t snap_refresh_nanos_; - // Skip evey nth key. Number n if of power 2. The math will require n-1. - const uint64_t every_nth_key_minus_one_; -}; - class CompactionIterator { public: // A wrapper around Compaction. Has a much smaller interface, only what @@ -106,32 +59,34 @@ class CompactionIterator { const Compaction* compaction_; }; - CompactionIterator(InternalIterator* input, const Comparator* cmp, - MergeHelper* merge_helper, SequenceNumber last_sequence, - std::vector* snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, Env* env, - bool report_detailed_time, bool expect_valid_internal_key, - CompactionRangeDelAggregator* range_del_agg, - const Compaction* compaction = nullptr, - const CompactionFilter* compaction_filter = nullptr, - const std::atomic* shutting_down = nullptr, - const SequenceNumber preserve_deletes_seqnum = 0, - SnapshotListFetchCallback* snap_list_callback = nullptr); + CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber last_sequence, std::vector* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + const Compaction* compaction = nullptr, + const CompactionFilter* compaction_filter = nullptr, + const std::atomic* shutting_down = nullptr, + const SequenceNumber preserve_deletes_seqnum = 0, + const std::atomic* manual_compaction_paused = nullptr, + const std::shared_ptr info_log = nullptr); // Constructor with custom CompactionProxy, used for tests. 
- CompactionIterator(InternalIterator* input, const Comparator* cmp, - MergeHelper* merge_helper, SequenceNumber last_sequence, - std::vector* snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, Env* env, - bool report_detailed_time, bool expect_valid_internal_key, - CompactionRangeDelAggregator* range_del_agg, - std::unique_ptr compaction, - const CompactionFilter* compaction_filter = nullptr, - const std::atomic* shutting_down = nullptr, - const SequenceNumber preserve_deletes_seqnum = 0, - SnapshotListFetchCallback* snap_list_callback = nullptr); + CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber last_sequence, std::vector* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + std::unique_ptr compaction, + const CompactionFilter* compaction_filter = nullptr, + const std::atomic* shutting_down = nullptr, + const SequenceNumber preserve_deletes_seqnum = 0, + const std::atomic* manual_compaction_paused = nullptr, + const std::shared_ptr info_log = nullptr); ~CompactionIterator(); @@ -159,8 +114,6 @@ class CompactionIterator { private: // Processes the input stream to find the next output void NextFromInput(); - // Process snapshots_ and assign related variables - void ProcessSnapshotList(); // Do last preparations before presenting the output to the callee. At this // point this only zeroes out the sequence number if possible for better @@ -195,7 +148,7 @@ class CompactionIterator { InternalIterator* input_; const Comparator* cmp_; MergeHelper* merge_helper_; - std::vector* snapshots_; + const std::vector* snapshots_; // List of snapshots released during compaction. // findEarliestVisibleSnapshot() find them out from return of // snapshot_checker, and make sure they will not be returned as @@ -212,6 +165,7 @@ class CompactionIterator { std::unique_ptr compaction_; const CompactionFilter* compaction_filter_; const std::atomic* shutting_down_; + const std::atomic* manual_compaction_paused_; const SequenceNumber preserve_deletes_seqnum_; bool bottommost_level_; bool valid_ = false; @@ -270,13 +224,17 @@ class CompactionIterator { // Used to avoid purging uncommitted values. The application can specify // uncommitted values by providing a SnapshotChecker object. bool current_key_committed_; - SnapshotListFetchCallback* snap_list_callback_; - // number of distinct keys processed - size_t num_keys_ = 0; + std::shared_ptr info_log_; bool IsShuttingDown() { // This is a best-effort facility, so memory_order_relaxed is sufficient. return shutting_down_ && shutting_down_->load(std::memory_order_relaxed); } + + bool IsPausingManualCompaction() { + // This is a best-effort facility, so memory_order_relaxed is sufficient. + return manual_compaction_paused_ && + manual_compaction_paused_->load(std::memory_order_relaxed); + } }; } // namespace rocksdb diff --git a/db/compaction_iterator_test.cc b/db/compaction/compaction_iterator_test.cc similarity index 99% rename from db/compaction_iterator_test.cc rename to db/compaction/compaction_iterator_test.cc index c466f6c9122..94f297961e2 100644 --- a/db/compaction_iterator_test.cc +++ b/db/compaction/compaction_iterator_test.cc @@ -3,15 +3,15 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include "db/compaction_iterator.h" #include #include +#include "db/compaction/compaction_iterator.h" #include "port/port.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -184,7 +184,7 @@ class TestSnapshotChecker : public SnapshotChecker { public: explicit TestSnapshotChecker( SequenceNumber last_committed_sequence, - const std::unordered_map& snapshots = {}) + const std::unordered_map& snapshots = {{}}) : last_committed_sequence_(last_committed_sequence), snapshots_(snapshots) {} diff --git a/db/compaction_job.cc b/db/compaction/compaction_job.cc similarity index 94% rename from db/compaction_job.cc rename to db/compaction/compaction_job.cc index bc127a4c45c..22c504fde26 100644 --- a/db/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -7,14 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/compaction_job.h" - -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include +#include #include #include #include @@ -25,7 +19,8 @@ #include #include "db/builder.h" -#include "db/db_impl.h" +#include "db/compaction/compaction_job.h" +#include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" #include "db/error_handler.h" @@ -38,6 +33,12 @@ #include "db/merge_helper.h" #include "db/range_del_aggregator.h" #include "db/version_set.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/sst_file_manager_impl.h" +#include "file/writable_file_writer.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" @@ -47,21 +48,16 @@ #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "table/table_builder.h" +#include "test_util/sync_point.h" #include "util/coding.h" -#include "util/file_reader_writer.h" -#include "util/filename.h" -#include "util/log_buffer.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/random.h" -#include "util/sst_file_manager_impl.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { @@ -315,7 +311,7 @@ CompactionJob::CompactionJob( const SnapshotChecker* snapshot_checker, std::shared_ptr table_cache, EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, CompactionJobStats* compaction_job_stats, - Env::Priority thread_pri, SnapshotListFetchCallback* snap_list_callback) + Env::Priority thread_pri, const std::atomic* manual_compaction_paused) : job_id_(job_id), compact_(new CompactionState(compaction)), compaction_job_stats_(compaction_job_stats), @@ -324,10 +320,11 @@ CompactionJob::CompactionJob( db_options_(db_options), env_options_(env_options), env_(db_options.env), - env_optiosn_for_read_( + env_options_for_read_( env_->OptimizeForCompactionTableRead(env_options, db_options_)), versions_(versions), shutting_down_(shutting_down), + manual_compaction_paused_(manual_compaction_paused), 
preserve_deletes_seqnum_(preserve_deletes_seqnum), log_buffer_(log_buffer), db_directory_(db_directory), @@ -336,7 +333,6 @@ CompactionJob::CompactionJob( db_mutex_(db_mutex), db_error_handler_(db_error_handler), existing_snapshots_(std::move(existing_snapshots)), - snap_list_callback_(snap_list_callback), earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), snapshot_checker_(snapshot_checker), table_cache_(std::move(table_cache)), @@ -415,7 +411,6 @@ void CompactionJob::Prepare() { write_hint_ = c->column_family_data()->CalculateSSTWriteHint(c->output_level()); - // Is this compaction producing files at the bottommost level? bottommost_level_ = c->bottommost_level(); if (c->ShouldFormSubcompactions()) { @@ -445,11 +440,6 @@ struct RangeWithSize { : range(a, b), size(s) {} }; -// Generates a histogram representing potential divisions of key ranges from -// the input. It adds the starting and/or ending keys of certain input files -// to the working set and then finds the approximate size of data in between -// each consecutive pair of slices. Then it divides these ranges into -// consecutive groups such that each group has a similar size. void CompactionJob::GenSubcompactionBoundaries() { auto* c = compact_->compaction; auto* cfd = c->column_family_data(); @@ -519,7 +509,7 @@ void CompactionJob::GenSubcompactionBoundaries() { auto* v = compact_->compaction->input_version(); for (auto it = bounds.begin();;) { const Slice a = *it; - it++; + ++it; if (it == bounds.end()) { break; @@ -531,7 +521,9 @@ void CompactionJob::GenSubcompactionBoundaries() { // to the index block and may incur I/O cost in the process. Unlock db // mutex to reduce contention db_mutex_->Unlock(); - uint64_t size = versions_->ApproximateSize(v, a, b, start_lvl, out_lvl + 1); + uint64_t size = versions_->ApproximateSize(SizeApproximationOptions(), v, a, + b, start_lvl, out_lvl + 1, + TableReaderCaller::kCompaction); db_mutex_->Lock(); ranges.emplace_back(a, b, size); sum += size; @@ -656,12 +648,14 @@ Status CompactionJob::Run() { // to cache it here for further user reads InternalIterator* iter = cfd->table_cache()->NewIterator( ReadOptions(), env_options_, cfd->internal_comparator(), - *files_meta[file_idx], nullptr /* range_del_agg */, - prefix_extractor, nullptr, + *files_meta[file_idx], /*range_del_agg=*/nullptr, prefix_extractor, + /*table_reader_ptr=*/nullptr, cfd->internal_stats()->GetFileReadHist( compact_->compaction->output_level()), - false, nullptr /* arena */, false /* skip_filters */, - compact_->compaction->output_level()); + TableReaderCaller::kCompactionRefill, /*arena=*/nullptr, + /*skip_filters=*/false, compact_->compaction->output_level(), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); auto s = iter->status(); if (s.ok() && paranoid_file_checks_) { @@ -842,7 +836,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // Although the v2 aggregator is what the level iterator(s) know about, // the AddTombstones calls will be propagated down to the v1 aggregator. std::unique_ptr input(versions_->MakeInputIterator( - sub_compact->compaction, &range_del_agg, env_optiosn_for_read_)); + sub_compact->compaction, &range_del_agg, env_options_for_read_)); AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_PROCESS_KV); @@ -873,9 +867,13 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { false /* internal key corruption is expected */, existing_snapshots_.empty() ? 
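GenSubcompactionBoundaries above still follows the approach described by the comment deleted from this file: estimate the data size between candidate boundary keys, now via the extended ApproximateSize overload, and then cut the sorted ranges into groups of similar total size. A simplified sketch of the grouping step only (illustration, not the actual code):

#include <cstdint>
#include <vector>

std::vector<size_t> GroupBoundariesSketch(const std::vector<uint64_t>& sizes,
                                          uint64_t total,
                                          unsigned subcompactions) {
  std::vector<size_t> cut_points;  // indices where a new group starts
  const uint64_t target = total / subcompactions + 1;
  uint64_t acc = 0;
  for (size_t i = 0; i < sizes.size(); ++i) {
    acc += sizes[i];
    if (acc >= target && cut_points.size() + 1 < subcompactions) {
      cut_points.push_back(i + 1);  // next range begins a new subcompaction
      acc = 0;
    }
  }
  return cut_points;
}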
0 : existing_snapshots_.back(), snapshot_checker_, compact_->compaction->level(), - db_options_.statistics.get(), shutting_down_); + db_options_.statistics.get()); TEST_SYNC_POINT("CompactionJob::Run():Inprogress"); + TEST_SYNC_POINT_CALLBACK( + "CompactionJob::Run():PausingManualCompaction:1", + reinterpret_cast( + const_cast*>(manual_compaction_paused_))); Slice* start = sub_compact->start; Slice* end = sub_compact->end; @@ -893,7 +891,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { &existing_snapshots_, earliest_write_conflict_snapshot_, snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), false, &range_del_agg, sub_compact->compaction, compaction_filter, - shutting_down_, preserve_deletes_seqnum_, snap_list_callback_)); + shutting_down_, preserve_deletes_seqnum_, manual_compaction_paused_, + db_options_.info_log)); auto c_iter = sub_compact->c_iter.get(); c_iter->SeekToFirst(); if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) { @@ -935,8 +934,9 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { assert(sub_compact->current_output() != nullptr); sub_compact->builder->Add(key, value); sub_compact->current_output_file_size = sub_compact->builder->FileSize(); + const ParsedInternalKey& ikey = c_iter->ikey(); sub_compact->current_output()->meta.UpdateBoundaries( - key, c_iter->ikey().sequence); + key, value, ikey.sequence, ikey.type); sub_compact->num_output_records++; // Close output file if it is big enough. Two possibilities determine it's @@ -957,7 +957,14 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { input_status = input->status(); output_file_ended = true; } + TEST_SYNC_POINT_CALLBACK( + "CompactionJob::Run():PausingManualCompaction:2", + reinterpret_cast( + const_cast*>(manual_compaction_paused_))); c_iter->Next(); + if (c_iter->status().IsManualCompactionPaused()) { + break; + } if (!output_file_ended && c_iter->Valid() && sub_compact->compaction->output_level() != 0 && sub_compact->ShouldStopBefore(c_iter->key(), @@ -1002,10 +1009,18 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); RecordCompactionIOStats(); - if (status.ok() && - (shutting_down_->load(std::memory_order_relaxed) || cfd->IsDropped())) { - status = Status::ShutdownInProgress( - "Database shutdown or Column family drop during compaction"); + if (status.ok() && cfd->IsDropped()) { + status = + Status::ColumnFamilyDropped("Column family dropped during compaction"); + } + if ((status.ok() || status.IsColumnFamilyDropped()) && + shutting_down_->load(std::memory_order_relaxed)) { + status = Status::ShutdownInProgress("Database shutdown"); + } + if ((status.ok() || status.IsColumnFamilyDropped()) && + (manual_compaction_paused_ && + manual_compaction_paused_->load(std::memory_order_relaxed))) { + status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); } if (status.ok()) { status = input->status(); @@ -1336,17 +1351,20 @@ Status CompactionJob::FinishCompactionOutputFile( } std::string fname; FileDescriptor output_fd; + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; if (meta != nullptr) { fname = TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); output_fd = meta->fd; + oldest_blob_file_number = meta->oldest_blob_file_number; } else { fname = "(nil)"; } 
EventHelpers::LogAndNotifyTableFileCreationFinished( event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, - job_id_, output_fd, tp, TableFileCreationReason::kCompaction, s); + job_id_, output_fd, oldest_blob_file_number, tp, + TableFileCreationReason::kCompaction, s); #ifndef ROCKSDB_LITE // Report new file to SstFileManagerImpl @@ -1456,17 +1474,38 @@ Status CompactionJob::OpenCompactionOutputFile( LogFlush(db_options_.info_log); EventHelpers::LogAndNotifyTableFileCreationFinished( event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), - fname, job_id_, FileDescriptor(), TableProperties(), - TableFileCreationReason::kCompaction, s); + fname, job_id_, FileDescriptor(), kInvalidBlobFileNumber, + TableProperties(), TableFileCreationReason::kCompaction, s); return s; } - SubcompactionState::Output out; - out.meta.fd = - FileDescriptor(file_number, sub_compact->compaction->output_path_id(), 0); - out.finished = false; + // Try to figure out the output file's oldest ancester time. + int64_t temp_current_time = 0; + auto get_time_status = env_->GetCurrentTime(&temp_current_time); + // Safe to proceed even if GetCurrentTime fails. So, log and proceed. + if (!get_time_status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to get current time. Status: %s", + get_time_status.ToString().c_str()); + } + uint64_t current_time = static_cast(temp_current_time); + uint64_t oldest_ancester_time = + sub_compact->compaction->MinInputFileOldestAncesterTime(); + if (oldest_ancester_time == port::kMaxUint64) { + oldest_ancester_time = current_time; + } + + // Initialize a SubcompactionState::Output and add it to sub_compact->outputs + { + SubcompactionState::Output out; + out.meta.fd = FileDescriptor(file_number, + sub_compact->compaction->output_path_id(), 0); + out.meta.oldest_ancester_time = oldest_ancester_time; + out.meta.file_creation_time = current_time; + out.finished = false; + sub_compact->outputs.push_back(out); + } - sub_compact->outputs.push_back(out); writable_file->SetIOPriority(Env::IO_LOW); writable_file->SetWriteLifeTimeHint(write_hint_); writable_file->SetPreallocationBlockSize(static_cast( @@ -1483,22 +1522,6 @@ Status CompactionJob::OpenCompactionOutputFile( bool skip_filters = cfd->ioptions()->optimize_filters_for_hits && bottommost_level_; - int64_t temp_current_time = 0; - auto get_time_status = env_->GetCurrentTime(&temp_current_time); - // Safe to proceed even if GetCurrentTime fails. So, log and proceed. - if (!get_time_status.ok()) { - ROCKS_LOG_WARN(db_options_.info_log, - "Failed to get current time. 
Status: %s", - get_time_status.ToString().c_str()); - } - uint64_t current_time = static_cast(temp_current_time); - - uint64_t latest_key_time = - sub_compact->compaction->MaxInputFileCreationTime(); - if (latest_key_time == 0) { - latest_key_time = current_time; - } - sub_compact->builder.reset(NewTableBuilder( *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()), cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), @@ -1506,9 +1529,9 @@ Status CompactionJob::OpenCompactionOutputFile( sub_compact->compaction->output_compression(), 0 /*sample_for_compression */, sub_compact->compaction->output_compression_opts(), - sub_compact->compaction->output_level(), skip_filters, latest_key_time, - 0 /* oldest_key_time */, sub_compact->compaction->max_output_file_size(), - current_time)); + sub_compact->compaction->output_level(), skip_filters, + oldest_ancester_time, 0 /* oldest_key_time */, + sub_compact->compaction->max_output_file_size(), current_time)); LogFlush(db_options_.info_log); return s; } diff --git a/db/compaction_job.h b/db/compaction/compaction_job.h similarity index 68% rename from db/compaction_job.h rename to db/compaction/compaction_job.h index b3a0f2eb4b5..bca4c994265 100644 --- a/db/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -18,7 +18,7 @@ #include #include "db/column_family.h" -#include "db/compaction_iterator.h" +#include "db/compaction/compaction_iterator.h" #include "db/dbformat.h" #include "db/flush_scheduler.h" #include "db/internal_stats.h" @@ -29,6 +29,7 @@ #include "db/version_edit.h" #include "db/write_controller.h" #include "db/write_thread.h" +#include "logging/event_logger.h" #include "options/cf_options.h" #include "options/db_options.h" #include "port/port.h" @@ -40,7 +41,6 @@ #include "rocksdb/transaction_log.h" #include "table/scoped_arena_iterator.h" #include "util/autovector.h" -#include "util/event_logger.h" #include "util/stop_watch.h" #include "util/thread_local.h" @@ -55,22 +55,30 @@ class Version; class VersionEdit; class VersionSet; +// CompactionJob is responsible for executing the compaction. Each (manual or +// automated) compaction corresponds to a CompactionJob object, and usually +// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob +// will divide the compaction into subcompactions and execute them in parallel +// if needed. 
class CompactionJob { public: - CompactionJob( - int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, - const EnvOptions env_options, VersionSet* versions, - const std::atomic* shutting_down, - const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer, - Directory* db_directory, Directory* output_directory, Statistics* stats, - InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, - std::vector existing_snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, - std::shared_ptr table_cache, EventLogger* event_logger, - bool paranoid_file_checks, bool measure_io_stats, - const std::string& dbname, CompactionJobStats* compaction_job_stats, - Env::Priority thread_pri, SnapshotListFetchCallback* snap_list_callback); + CompactionJob(int job_id, Compaction* compaction, + const ImmutableDBOptions& db_options, + const EnvOptions env_options, VersionSet* versions, + const std::atomic* shutting_down, + const SequenceNumber preserve_deletes_seqnum, + LogBuffer* log_buffer, Directory* db_directory, + Directory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, + std::shared_ptr table_cache, EventLogger* event_logger, + bool paranoid_file_checks, bool measure_io_stats, + const std::string& dbname, + CompactionJobStats* compaction_job_stats, + Env::Priority thread_pri, + const std::atomic* manual_compaction_paused = nullptr); ~CompactionJob(); @@ -80,17 +88,28 @@ class CompactionJob { CompactionJob& operator=(const CompactionJob& job) = delete; // REQUIRED: mutex held + // Prepare for the compaction by setting up boundaries for each subcompaction void Prepare(); // REQUIRED mutex not held + // Launch threads for each subcompaction and wait for them to finish. After + // that, verify table is usable and finally do bookkeeping to unify + // subcompaction results Status Run(); // REQUIRED: mutex held + // Add compaction input/output to the current version Status Install(const MutableCFOptions& mutable_cf_options); private: struct SubcompactionState; void AggregateStatistics(); + + // Generates a histogram representing potential divisions of key ranges from + // the input. It adds the starting and/or ending keys of certain input files + // to the working set and then finds the approximate size of data in between + // each consecutive pair of slices. Then it divides these ranges into + // consecutive groups such that each group has a similar size. void GenSubcompactionBoundaries(); // update the thread status for starting a compaction. @@ -135,9 +154,10 @@ class CompactionJob { Env* env_; // env_option optimized for compaction table reads - EnvOptions env_optiosn_for_read_; + EnvOptions env_options_for_read_; VersionSet* versions_; const std::atomic* shutting_down_; + const std::atomic* manual_compaction_paused_; const SequenceNumber preserve_deletes_seqnum_; LogBuffer* log_buffer_; Directory* db_directory_; @@ -150,7 +170,6 @@ class CompactionJob { // entirely within s1 and s2, then the earlier version of k1 can be safely // deleted because that version is not visible in any snapshot. std::vector existing_snapshots_; - SnapshotListFetchCallback* snap_list_callback_; // This is the earliest snapshot that could be used for write-conflict // checking by a transaction. 
For any user-key newer than this snapshot, we @@ -163,6 +182,7 @@ class CompactionJob { EventLogger* event_logger_; + // Is this compaction creating a file in the bottom most level? bool bottommost_level_; bool paranoid_file_checks_; bool measure_io_stats_; diff --git a/db/compaction_job_stats_test.cc b/db/compaction/compaction_job_stats_test.cc similarity index 99% rename from db/compaction_job_stats_test.cc rename to db/compaction/compaction_job_stats_test.cc index 48e883bc6cc..f25a38ec65e 100644 --- a/db/compaction_job_stats_test.cc +++ b/db/compaction/compaction_job_stats_test.cc @@ -7,12 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include +#include #include #include #include @@ -21,12 +17,14 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/job_context.h" #include "db/version_set.h" #include "db/write_batch_internal.h" #include "env/mock_env.h" +#include "file/filename.h" +#include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "monitoring/thread_status_util.h" @@ -47,20 +45,18 @@ #include "rocksdb/thread_status.h" #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/write_batch_with_index.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/compression.h" -#include "util/filename.h" #include "util/hash.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" #if !defined(IOS_CROSS_COMPILE) diff --git a/db/compaction_job_test.cc b/db/compaction/compaction_job_test.cc similarity index 86% rename from db/compaction_job_test.cc rename to db/compaction/compaction_job_test.cc index 60394cc9735..9fb3f0df5dc 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -5,30 +5,28 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include #include +#include #include #include #include +#include "db/blob_index.h" #include "db/column_family.h" -#include "db/compaction_job.h" +#include "db/compaction/compaction_job.h" +#include "db/db_impl/db_impl.h" #include "db/error_handler.h" #include "db/version_set.h" +#include "file/writable_file_writer.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" -#include "util/file_reader_writer.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -81,7 +79,8 @@ class CompactionJobTest : public testing::Test { write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - 
&write_controller_)), + &write_controller_, + /*block_cache_tracer=*/nullptr)), shutting_down_(false), preserve_deletes_seqnum_(0), mock_table_factory_(new mock::MockTableFactory()), @@ -99,11 +98,34 @@ class CompactionJobTest : public testing::Test { return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId()); } - std::string KeyStr(const std::string& user_key, const SequenceNumber seq_num, - const ValueType t) { + static std::string KeyStr(const std::string& user_key, + const SequenceNumber seq_num, const ValueType t) { return InternalKey(user_key, seq_num, t).Encode().ToString(); } + static std::string BlobStr(uint64_t blob_file_number, uint64_t offset, + uint64_t size) { + std::string blob_index; + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + return blob_index; + } + + static std::string BlobStrTTL(uint64_t blob_file_number, uint64_t offset, + uint64_t size, uint64_t expiration) { + std::string blob_index; + BlobIndex::EncodeBlobTTL(&blob_index, expiration, blob_file_number, offset, + size, kNoCompression); + return blob_index; + } + + static std::string BlobStrInlinedTTL(const Slice& value, + uint64_t expiration) { + std::string blob_index; + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, value); + return blob_index; + } + void AddMockFile(const stl_wrappers::KVMap& contents, int level = 0) { assert(contents.size() > 0); @@ -112,12 +134,13 @@ class CompactionJobTest : public testing::Test { InternalKey smallest_key, largest_key; SequenceNumber smallest_seqno = kMaxSequenceNumber; SequenceNumber largest_seqno = 0; + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; for (auto kv : contents) { ParsedInternalKey key; std::string skey; std::string value; std::tie(skey, value) = kv; - ParseInternalKey(skey, &key); + bool parsed = ParseInternalKey(skey, &key); smallest_seqno = std::min(smallest_seqno, key.sequence); largest_seqno = std::max(largest_seqno, key.sequence); @@ -134,6 +157,24 @@ class CompactionJobTest : public testing::Test { } first_key = false; + + if (parsed && key.type == kTypeBlobIndex) { + BlobIndex blob_index; + const Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + continue; + } + + if (blob_index.IsInlined() || blob_index.HasTTL() || + blob_index.file_number() == kInvalidBlobFileNumber) { + continue; + } + + if (oldest_blob_file_number == kInvalidBlobFileNumber || + oldest_blob_file_number > blob_index.file_number()) { + oldest_blob_file_number = blob_index.file_number(); + } + } } uint64_t file_number = versions_->NewFileNumber(); @@ -142,7 +183,8 @@ class CompactionJobTest : public testing::Test { VersionEdit edit; edit.AddFile(level, file_number, 0, 10, smallest_key, largest_key, - smallest_seqno, largest_seqno, false); + smallest_seqno, largest_seqno, false, oldest_blob_file_number, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); mutex_.Lock(); versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), @@ -204,10 +246,18 @@ class CompactionJobTest : public testing::Test { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)); + &write_controller_, + /*block_cache_tracer=*/nullptr)); compaction_job_stats_.Reset(); + SetIdentityFile(env_, dbname_); VersionEdit new_db; + if (db_options_.write_dbid_to_manifest) { + DBImpl* impl = new DBImpl(DBOptions(), dbname_); + std::string db_id; + impl->GetDbIdentityFromIdentityFile(&db_id); + 
new_db.SetDBId(db_id); + } new_db.SetLogNumber(0); new_db.SetNextFile(2); new_db.SetLastSequence(0); @@ -245,8 +295,7 @@ class CompactionJobTest : public testing::Test { const std::vector& snapshots = {}, SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, int output_level = 1, bool verify = true, - SnapshotListFetchCallback* snapshot_fetcher = - SnapshotListFetchCallback::kDisabled) { + uint64_t expected_oldest_blob_file_number = kInvalidBlobFileNumber) { auto cfd = versions_->GetColumnFamilySet()->GetDefault(); size_t num_input_files = 0; @@ -279,7 +328,7 @@ class CompactionJobTest : public testing::Test { nullptr, nullptr, &mutex_, &error_handler_, snapshots, earliest_write_conflict_snapshot, snapshot_checker, table_cache_, &event_logger, false, false, dbname_, &compaction_job_stats_, - Env::Priority::USER, snapshot_fetcher); + Env::Priority::USER); VerifyInitializationOfCompactionJobStats(compaction_job_stats_); compaction_job.Prepare(); @@ -292,15 +341,20 @@ class CompactionJobTest : public testing::Test { mutex_.Unlock(); if (verify) { - if (expected_results.size() == 0) { - ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); - ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); + ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); + ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); + + if (expected_results.empty()) { ASSERT_EQ(compaction_job_stats_.num_output_files, 0U); } else { - ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); - ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); ASSERT_EQ(compaction_job_stats_.num_output_files, 1U); mock_table_factory_->AssertLatestFile(expected_results); + + auto output_files = + cfd->current()->storage_info()->LevelFiles(output_level); + ASSERT_EQ(output_files.size(), 1); + ASSERT_EQ(output_files[0]->oldest_blob_file_number, + expected_oldest_blob_file_number); } } } @@ -956,103 +1010,52 @@ TEST_F(CompactionJobTest, CorruptionAfterDeletion) { RunCompaction({files}, expected_results); } -// Test the snapshot fetcher in compaction -TEST_F(CompactionJobTest, SnapshotRefresh) { - uint64_t time_seed = env_->NowMicros(); - printf("time_seed is %" PRIu64 "\n", time_seed); // would help to reproduce - Random64 rand(time_seed); - std::vector db_snapshots; - class SnapshotListFetchCallbackTest : public SnapshotListFetchCallback { - public: - SnapshotListFetchCallbackTest(Env* env, Random64& rand, - std::vector* snapshots) - : SnapshotListFetchCallback(env, 0 /*no time delay*/, - 1 /*fetch after each key*/), - rand_(rand), - snapshots_(snapshots) {} - virtual void Refresh(std::vector* snapshots, - SequenceNumber) override { - assert(snapshots->size()); - assert(snapshots_->size()); - assert(snapshots_->size() == snapshots->size()); - if (rand_.OneIn(2)) { - uint64_t release_index = rand_.Uniform(snapshots_->size()); - snapshots_->erase(snapshots_->begin() + release_index); - *snapshots = *snapshots_; - } - } - - private: - Random64 rand_; - std::vector* snapshots_; - } snapshot_fetcher(env_, rand, &db_snapshots); - - std::vector> file1_kvs, file2_kvs; - std::array types = {kTypeValue, kTypeDeletion, - kTypeSingleDeletion}; - SequenceNumber last_seq = 0; - for (int i = 1; i < 100; i++) { - SequenceNumber seq = last_seq + 1; - last_seq = seq; - if (rand.OneIn(2)) { - auto type = types[rand.Uniform(types.size())]; - file1_kvs.push_back( - {test::KeyStr("k" + ToString(i), seq, type), "v" + ToString(i)}); - } - } - auto file1 = mock::MakeMockFile(file1_kvs); - for (int i = 1; i < 100; i++) 
{ - SequenceNumber seq = last_seq + 1; - last_seq++; - if (rand.OneIn(2)) { - auto type = types[rand.Uniform(types.size())]; - file2_kvs.push_back( - {test::KeyStr("k" + ToString(i), seq, type), "v" + ToString(i)}); - } - } - auto file2 = mock::MakeMockFile(file2_kvs); - for (SequenceNumber i = 1; i < last_seq + 1; i++) { - if (rand.OneIn(5)) { - db_snapshots.push_back(i); - } - } - - const bool kVerify = true; - const int output_level_0 = 0; +TEST_F(CompactionJobTest, OldestBlobFileNumber) { NewDB(); - AddMockFile(file1); - AddMockFile(file2); - SetLastSequence(last_seq); - auto files = cfd_->current()->storage_info()->LevelFiles(0); - // put the output on L0 since it is easier to feed them again to the 2nd - // compaction - RunCompaction({files}, file1, db_snapshots, kMaxSequenceNumber, - output_level_0, !kVerify, &snapshot_fetcher); - - // Now db_snapshots are changed. Run the compaction again without snapshot - // fetcher but with the updated snapshot list. - compaction_job_stats_.Reset(); - files = cfd_->current()->storage_info()->LevelFiles(0); - RunCompaction({files}, file1, db_snapshots, kMaxSequenceNumber, - output_level_0 + 1, !kVerify); - // The result should be what we get if we run compaction without snapshot - // fetcher on the updated list of snapshots - auto expected = mock_table_factory_->output(); - NewDB(); + // Note: blob1 is inlined TTL, so it will not be considered for the purposes + // of identifying the oldest referenced blob file. Similarly, blob6 will be + // ignored because it has TTL and hence refers to a TTL blob file. + const stl_wrappers::KVMap::value_type blob1( + KeyStr("a", 1U, kTypeBlobIndex), BlobStrInlinedTTL("foo", 1234567890ULL)); + const stl_wrappers::KVMap::value_type blob2(KeyStr("b", 2U, kTypeBlobIndex), + BlobStr(59, 123456, 999)); + const stl_wrappers::KVMap::value_type blob3(KeyStr("c", 3U, kTypeBlobIndex), + BlobStr(138, 1000, 1 << 8)); + auto file1 = mock::MakeMockFile({blob1, blob2, blob3}); AddMockFile(file1); + + const stl_wrappers::KVMap::value_type blob4(KeyStr("d", 4U, kTypeBlobIndex), + BlobStr(199, 3 << 10, 1 << 20)); + const stl_wrappers::KVMap::value_type blob5(KeyStr("e", 5U, kTypeBlobIndex), + BlobStr(19, 6789, 333)); + const stl_wrappers::KVMap::value_type blob6( + KeyStr("f", 6U, kTypeBlobIndex), + BlobStrTTL(5, 2048, 1 << 7, 1234567890ULL)); + auto file2 = mock::MakeMockFile({blob4, blob5, blob6}); AddMockFile(file2); - SetLastSequence(last_seq); - files = cfd_->current()->storage_info()->LevelFiles(0); - RunCompaction({files}, expected, db_snapshots, kMaxSequenceNumber, - output_level_0, !kVerify); - // The 2nd compaction above would get rid of useless delete markers. To get - // the output here exactly as what we got above after two compactions, we also - // run the compaction for 2nd time. 
- compaction_job_stats_.Reset(); - files = cfd_->current()->storage_info()->LevelFiles(0); - RunCompaction({files}, expected, db_snapshots, kMaxSequenceNumber, - output_level_0 + 1, !kVerify); + + const stl_wrappers::KVMap::value_type expected_blob1( + KeyStr("a", 0U, kTypeBlobIndex), blob1.second); + const stl_wrappers::KVMap::value_type expected_blob2( + KeyStr("b", 0U, kTypeBlobIndex), blob2.second); + const stl_wrappers::KVMap::value_type expected_blob3( + KeyStr("c", 0U, kTypeBlobIndex), blob3.second); + const stl_wrappers::KVMap::value_type expected_blob4( + KeyStr("d", 0U, kTypeBlobIndex), blob4.second); + const stl_wrappers::KVMap::value_type expected_blob5( + KeyStr("e", 0U, kTypeBlobIndex), blob5.second); + const stl_wrappers::KVMap::value_type expected_blob6( + KeyStr("f", 0U, kTypeBlobIndex), blob6.second); + auto expected_results = + mock::MakeMockFile({expected_blob1, expected_blob2, expected_blob3, + expected_blob4, expected_blob5, expected_blob6}); + + SetLastSequence(6U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results, std::vector(), + kMaxSequenceNumber, /* output_level */ 1, /* verify */ true, + /* expected_oldest_blob_file_number */ 19); } } // namespace rocksdb diff --git a/db/compaction_picker.cc b/db/compaction/compaction_picker.cc similarity index 67% rename from db/compaction_picker.cc rename to db/compaction/compaction_picker.cc index d6d7b69876e..0461ff32d1c 100644 --- a/db/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -7,25 +7,21 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/compaction_picker.h" +#include "db/compaction/compaction_picker.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include #include #include #include "db/column_family.h" +#include "file/filename.h" +#include "logging/log_buffer.h" #include "monitoring/statistics.h" -#include "util/filename.h" -#include "util/log_buffer.h" +#include "test_util/sync_point.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { @@ -42,29 +38,54 @@ uint64_t TotalCompensatedFileSize(const std::vector& files) { bool FindIntraL0Compaction(const std::vector& level_files, size_t min_files_to_compact, uint64_t max_compact_bytes_per_del_file, - CompactionInputFiles* comp_inputs) { - size_t compact_bytes = static_cast(level_files[0]->fd.file_size); + uint64_t max_compaction_bytes, + CompactionInputFiles* comp_inputs, + SequenceNumber earliest_mem_seqno) { + // Do not pick ingested file when there is at least one memtable not flushed + // which of seqno is overlap with the sst. + TEST_SYNC_POINT("FindIntraL0Compaction"); + size_t start = 0; + for (; start < level_files.size(); start++) { + if (level_files[start]->being_compacted) { + return false; + } + // If there is no data in memtable, the earliest sequence number would the + // largest sequence number in last memtable. + // Because all files are sorted in descending order by largest_seqno, so we + // only need to check the first one. 
+ if (level_files[start]->fd.largest_seqno <= earliest_mem_seqno) { + break; + } + } + if (start >= level_files.size()) { + return false; + } + size_t compact_bytes = static_cast(level_files[start]->fd.file_size); + uint64_t compensated_compact_bytes = + level_files[start]->compensated_file_size; size_t compact_bytes_per_del_file = port::kMaxSizet; - // compaction range will be [0, span_len). - size_t span_len; - // pull in files until the amount of compaction work per deleted file begins - // increasing. + // Compaction range will be [start, limit). + size_t limit; + // Pull in files until the amount of compaction work per deleted file begins + // increasing or maximum total compaction size is reached. size_t new_compact_bytes_per_del_file = 0; - for (span_len = 1; span_len < level_files.size(); ++span_len) { - compact_bytes += static_cast(level_files[span_len]->fd.file_size); - new_compact_bytes_per_del_file = compact_bytes / span_len; - if (level_files[span_len]->being_compacted || - new_compact_bytes_per_del_file > compact_bytes_per_del_file) { + for (limit = start + 1; limit < level_files.size(); ++limit) { + compact_bytes += static_cast(level_files[limit]->fd.file_size); + compensated_compact_bytes += level_files[limit]->compensated_file_size; + new_compact_bytes_per_del_file = compact_bytes / (limit - start); + if (level_files[limit]->being_compacted || + new_compact_bytes_per_del_file > compact_bytes_per_del_file || + compensated_compact_bytes > max_compaction_bytes) { break; } compact_bytes_per_del_file = new_compact_bytes_per_del_file; } - if (span_len >= min_files_to_compact && + if ((limit - start) >= min_files_to_compact && compact_bytes_per_del_file < max_compact_bytes_per_del_file) { assert(comp_inputs != nullptr); comp_inputs->level = 0; - for (size_t i = 0; i < span_len; ++i) { + for (size_t i = start; i < limit; ++i) { comp_inputs->files.push_back(level_files[i]); } return true; @@ -1107,536 +1128,4 @@ bool CompactionPicker::GetOverlappingL0Files( return true; } -bool LevelCompactionPicker::NeedsCompaction( - const VersionStorageInfo* vstorage) const { - if (!vstorage->ExpiredTtlFiles().empty()) { - return true; - } - if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) { - return true; - } - if (!vstorage->BottommostFilesMarkedForCompaction().empty()) { - return true; - } - if (!vstorage->FilesMarkedForCompaction().empty()) { - return true; - } - for (int i = 0; i <= vstorage->MaxInputLevel(); i++) { - if (vstorage->CompactionScore(i) >= 1) { - return true; - } - } - return false; -} - -namespace { -// A class to build a leveled compaction step-by-step. -class LevelCompactionBuilder { - public: - LevelCompactionBuilder(const std::string& cf_name, - VersionStorageInfo* vstorage, - CompactionPicker* compaction_picker, - LogBuffer* log_buffer, - const MutableCFOptions& mutable_cf_options, - const ImmutableCFOptions& ioptions) - : cf_name_(cf_name), - vstorage_(vstorage), - compaction_picker_(compaction_picker), - log_buffer_(log_buffer), - mutable_cf_options_(mutable_cf_options), - ioptions_(ioptions) {} - - // Pick and return a compaction. - Compaction* PickCompaction(); - - // Pick the initial files to compact to the next level. (or together - // in Intra-L0 compactions) - void SetupInitialFiles(); - - // If the initial files are from L0 level, pick other L0 - // files if needed. - bool SetupOtherL0FilesIfNeeded(); - - // Based on initial files, setup other files need to be compacted - // in this compaction, accordingly. 
- bool SetupOtherInputsIfNeeded(); - - Compaction* GetCompaction(); - - // For the specfied level, pick a file that we want to compact. - // Returns false if there is no file to compact. - // If it returns true, inputs->files.size() will be exactly one. - // If level is 0 and there is already a compaction on that level, this - // function will return false. - bool PickFileToCompact(); - - // For L0->L0, picks the longest span of files that aren't currently - // undergoing compaction for which work-per-deleted-file decreases. The span - // always starts from the newest L0 file. - // - // Intra-L0 compaction is independent of all other files, so it can be - // performed even when L0->base_level compactions are blocked. - // - // Returns true if `inputs` is populated with a span of files to be compacted; - // otherwise, returns false. - bool PickIntraL0Compaction(); - - void PickExpiredTtlFiles(); - - void PickFilesMarkedForPeriodicCompaction(); - - const std::string& cf_name_; - VersionStorageInfo* vstorage_; - CompactionPicker* compaction_picker_; - LogBuffer* log_buffer_; - int start_level_ = -1; - int output_level_ = -1; - int parent_index_ = -1; - int base_index_ = -1; - double start_level_score_ = 0; - bool is_manual_ = false; - CompactionInputFiles start_level_inputs_; - std::vector compaction_inputs_; - CompactionInputFiles output_level_inputs_; - std::vector grandparents_; - CompactionReason compaction_reason_ = CompactionReason::kUnknown; - - const MutableCFOptions& mutable_cf_options_; - const ImmutableCFOptions& ioptions_; - // Pick a path ID to place a newly generated file, with its level - static uint32_t GetPathId(const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options, - int level); - - static const int kMinFilesForIntraL0Compaction = 4; -}; - -void LevelCompactionBuilder::PickExpiredTtlFiles() { - if (vstorage_->ExpiredTtlFiles().empty()) { - return; - } - - auto continuation = [&](std::pair level_file) { - // If it's being compacted it has nothing to do here. - // If this assert() fails that means that some function marked some - // files as being_compacted, but didn't call ComputeCompactionScore() - assert(!level_file.second->being_compacted); - start_level_ = level_file.first; - output_level_ = - (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; - - if ((start_level_ == vstorage_->num_non_empty_levels() - 1) || - (start_level_ == 0 && - !compaction_picker_->level0_compactions_in_progress()->empty())) { - return false; - } - - start_level_inputs_.files = {level_file.second}; - start_level_inputs_.level = start_level_; - return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_); - }; - - for (auto& level_file : vstorage_->ExpiredTtlFiles()) { - if (continuation(level_file)) { - // found the compaction! - return; - } - } - - start_level_inputs_.files.clear(); -} - -void LevelCompactionBuilder::PickFilesMarkedForPeriodicCompaction() { - if (vstorage_->FilesMarkedForPeriodicCompaction().empty()) { - return; - } - - auto continuation = [&](std::pair level_file) { - // If it's being compacted it has nothing to do here. 
- // If this assert() fails that means that some function marked some - // files as being_compacted, but didn't call ComputeCompactionScore() - assert(!level_file.second->being_compacted); - output_level_ = start_level_ = level_file.first; - - if (start_level_ == 0 && - !compaction_picker_->level0_compactions_in_progress()->empty()) { - return false; - } - - start_level_inputs_.files = {level_file.second}; - start_level_inputs_.level = start_level_; - return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_); - }; - - for (auto& level_file : vstorage_->FilesMarkedForPeriodicCompaction()) { - if (continuation(level_file)) { - // found the compaction! - return; - } - } - - start_level_inputs_.files.clear(); -} - -void LevelCompactionBuilder::SetupInitialFiles() { - // Find the compactions by size on all levels. - bool skipped_l0_to_base = false; - for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) { - start_level_score_ = vstorage_->CompactionScore(i); - start_level_ = vstorage_->CompactionScoreLevel(i); - assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1)); - if (start_level_score_ >= 1) { - if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) { - // If L0->base_level compaction is pending, don't schedule further - // compaction from base level. Otherwise L0->base_level compaction - // may starve. - continue; - } - output_level_ = - (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; - if (PickFileToCompact()) { - // found the compaction! - if (start_level_ == 0) { - // L0 score = `num L0 files` / `level0_file_num_compaction_trigger` - compaction_reason_ = CompactionReason::kLevelL0FilesNum; - } else { - // L1+ score = `Level files size` / `MaxBytesForLevel` - compaction_reason_ = CompactionReason::kLevelMaxLevelSize; - } - break; - } else { - // didn't find the compaction, clear the inputs - start_level_inputs_.clear(); - if (start_level_ == 0) { - skipped_l0_to_base = true; - // L0->base_level may be blocked due to ongoing L0->base_level - // compactions. It may also be blocked by an ongoing compaction from - // base_level downwards. - // - // In these cases, to reduce L0 file count and thus reduce likelihood - // of write stalls, we can attempt compacting a span of files within - // L0. 
- if (PickIntraL0Compaction()) { - output_level_ = 0; - compaction_reason_ = CompactionReason::kLevelL0FilesNum; - break; - } - } - } - } - } - - // if we didn't find a compaction, check if there are any files marked for - // compaction - if (start_level_inputs_.empty()) { - parent_index_ = base_index_ = -1; - - compaction_picker_->PickFilesMarkedForCompaction( - cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_); - if (!start_level_inputs_.empty()) { - is_manual_ = true; - compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; - return; - } - } - - // Bottommost Files Compaction on deleting tombstones - if (start_level_inputs_.empty()) { - size_t i; - for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size(); - ++i) { - auto& level_and_file = vstorage_->BottommostFilesMarkedForCompaction()[i]; - assert(!level_and_file.second->being_compacted); - start_level_inputs_.level = output_level_ = start_level_ = - level_and_file.first; - start_level_inputs_.files = {level_and_file.second}; - if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_)) { - break; - } - } - if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) { - start_level_inputs_.clear(); - } else { - assert(!start_level_inputs_.empty()); - compaction_reason_ = CompactionReason::kBottommostFiles; - return; - } - } - - // TTL Compaction - if (start_level_inputs_.empty()) { - PickExpiredTtlFiles(); - if (!start_level_inputs_.empty()) { - compaction_reason_ = CompactionReason::kTtl; - return; - } - } - - // Periodic Compaction - if (start_level_inputs_.empty()) { - PickFilesMarkedForPeriodicCompaction(); - if (!start_level_inputs_.empty()) { - compaction_reason_ = CompactionReason::kPeriodicCompaction; - return; - } - } -} - -bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() { - if (start_level_ == 0 && output_level_ != 0) { - return compaction_picker_->GetOverlappingL0Files( - vstorage_, &start_level_inputs_, output_level_, &parent_index_); - } - return true; -} - -bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() { - // Setup input files from output level. For output to L0, we only compact - // spans of files that do not interact with any pending compactions, so don't - // need to consider other levels. - if (output_level_ != 0) { - output_level_inputs_.level = output_level_; - if (!compaction_picker_->SetupOtherInputs( - cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_, - &output_level_inputs_, &parent_index_, base_index_)) { - return false; - } - - compaction_inputs_.push_back(start_level_inputs_); - if (!output_level_inputs_.empty()) { - compaction_inputs_.push_back(output_level_inputs_); - } - - // In some edge cases we could pick a compaction that will be compacting - // a key range that overlap with another running compaction, and both - // of them have the same output level. This could happen if - // (1) we are running a non-exclusive manual compaction - // (2) AddFile ingest a new file into the LSM tree - // We need to disallow this from happening. - if (compaction_picker_->FilesRangeOverlapWithCompaction(compaction_inputs_, - output_level_)) { - // This compaction output could potentially conflict with the output - // of a currently running compaction, we cannot run it. 
- return false; - } - compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_, - output_level_inputs_, &grandparents_); - } else { - compaction_inputs_.push_back(start_level_inputs_); - } - return true; -} - -Compaction* LevelCompactionBuilder::PickCompaction() { - // Pick up the first file to start compaction. It may have been extended - // to a clean cut. - SetupInitialFiles(); - if (start_level_inputs_.empty()) { - return nullptr; - } - assert(start_level_ >= 0 && output_level_ >= 0); - - // If it is a L0 -> base level compaction, we need to set up other L0 - // files if needed. - if (!SetupOtherL0FilesIfNeeded()) { - return nullptr; - } - - // Pick files in the output level and expand more files in the start level - // if needed. - if (!SetupOtherInputsIfNeeded()) { - return nullptr; - } - - // Form a compaction object containing the files we picked. - Compaction* c = GetCompaction(); - - TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c); - - return c; -} - -Compaction* LevelCompactionBuilder::GetCompaction() { - auto c = new Compaction( - vstorage_, ioptions_, mutable_cf_options_, std::move(compaction_inputs_), - output_level_, - MaxFileSizeForLevel(mutable_cf_options_, output_level_, - ioptions_.compaction_style, vstorage_->base_level(), - ioptions_.level_compaction_dynamic_level_bytes), - mutable_cf_options_.max_compaction_bytes, - GetPathId(ioptions_, mutable_cf_options_, output_level_), - GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, - output_level_, vstorage_->base_level()), - GetCompressionOptions(ioptions_, vstorage_, output_level_), - /* max_subcompactions */ 0, std::move(grandparents_), is_manual_, - start_level_score_, false /* deletion_compaction */, compaction_reason_); - - // If it's level 0 compaction, make sure we don't execute any other level 0 - // compactions in parallel - compaction_picker_->RegisterCompaction(c); - - // Creating a compaction influences the compaction score because the score - // takes running compactions into account (by skipping files that are already - // being compacted). Since we just changed compaction score, we recalculate it - // here - vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); - return c; -} - -/* - * Find the optimal path to place a file - * Given a level, finds the path where levels up to it will fit in levels - * up to and including this path - */ -uint32_t LevelCompactionBuilder::GetPathId( - const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options, int level) { - uint32_t p = 0; - assert(!ioptions.cf_paths.empty()); - - // size remaining in the most recent path - uint64_t current_path_size = ioptions.cf_paths[0].target_size; - - uint64_t level_size; - int cur_level = 0; - - // max_bytes_for_level_base denotes L1 size. - // We estimate L0 size to be the same as L1. - level_size = mutable_cf_options.max_bytes_for_level_base; - - // Last path is the fallback - while (p < ioptions.cf_paths.size() - 1) { - if (level_size <= current_path_size) { - if (cur_level == level) { - // Does desired level fit in this path? - return p; - } else { - current_path_size -= level_size; - if (cur_level > 0) { - if (ioptions.level_compaction_dynamic_level_bytes) { - // Currently, level_compaction_dynamic_level_bytes is ignored when - // multiple db paths are specified. https://github.com/facebook/ - // rocksdb/blob/master/db/column_family.cc. 
- // Still, adding this check to avoid accidentally using - // max_bytes_for_level_multiplier_additional - level_size = static_cast( - level_size * mutable_cf_options.max_bytes_for_level_multiplier); - } else { - level_size = static_cast( - level_size * mutable_cf_options.max_bytes_for_level_multiplier * - mutable_cf_options.MaxBytesMultiplerAdditional(cur_level)); - } - } - cur_level++; - continue; - } - } - p++; - current_path_size = ioptions.cf_paths[p].target_size; - } - return p; -} - -bool LevelCompactionBuilder::PickFileToCompact() { - // level 0 files are overlapping. So we cannot pick more - // than one concurrent compactions at this level. This - // could be made better by looking at key-ranges that are - // being compacted at level 0. - if (start_level_ == 0 && - !compaction_picker_->level0_compactions_in_progress()->empty()) { - TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0"); - return false; - } - - start_level_inputs_.clear(); - - assert(start_level_ >= 0); - - // Pick the largest file in this level that is not already - // being compacted - const std::vector& file_size = - vstorage_->FilesByCompactionPri(start_level_); - const std::vector& level_files = - vstorage_->LevelFiles(start_level_); - - unsigned int cmp_idx; - for (cmp_idx = vstorage_->NextCompactionIndex(start_level_); - cmp_idx < file_size.size(); cmp_idx++) { - int index = file_size[cmp_idx]; - auto* f = level_files[index]; - - // do not pick a file to compact if it is being compacted - // from n-1 level. - if (f->being_compacted) { - continue; - } - - start_level_inputs_.files.push_back(f); - start_level_inputs_.level = start_level_; - if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_) || - compaction_picker_->FilesRangeOverlapWithCompaction( - {start_level_inputs_}, output_level_)) { - // A locked (pending compaction) input-level file was pulled in due to - // user-key overlap. - start_level_inputs_.clear(); - continue; - } - - // Now that input level is fully expanded, we check whether any output files - // are locked due to pending compaction. - // - // Note we rely on ExpandInputsToCleanCut() to tell us whether any output- - // level files are locked, not just the extra ones pulled in for user-key - // overlap. - InternalKey smallest, largest; - compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest); - CompactionInputFiles output_level_inputs; - output_level_inputs.level = output_level_; - vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest, - &output_level_inputs.files); - if (!output_level_inputs.empty() && - !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &output_level_inputs)) { - start_level_inputs_.clear(); - continue; - } - base_index_ = index; - break; - } - - // store where to start the iteration in the next call to PickCompaction - vstorage_->SetNextCompactionIndex(start_level_, cmp_idx); - - return start_level_inputs_.size() > 0; -} - -bool LevelCompactionBuilder::PickIntraL0Compaction() { - start_level_inputs_.clear(); - const std::vector& level_files = - vstorage_->LevelFiles(0 /* level */); - if (level_files.size() < - static_cast( - mutable_cf_options_.level0_file_num_compaction_trigger + 2) || - level_files[0]->being_compacted) { - // If L0 isn't accumulating much files beyond the regular trigger, don't - // resort to L0->L0 compaction yet. 
- return false; - } - return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction, - port::kMaxUint64, &start_level_inputs_); -} -} // namespace - -Compaction* LevelCompactionPicker::PickCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer) { - LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer, - mutable_cf_options, ioptions_); - return builder.PickCompaction(); -} - } // namespace rocksdb diff --git a/db/compaction_picker.h b/db/compaction/compaction_picker.h similarity index 80% rename from db/compaction_picker.h rename to db/compaction/compaction_picker.h index 01f5495e67b..ae158059a1b 100644 --- a/db/compaction_picker.h +++ b/db/compaction/compaction_picker.h @@ -15,7 +15,7 @@ #include #include -#include "db/compaction.h" +#include "db/compaction/compaction.h" #include "db/version_set.h" #include "options/cf_options.h" #include "rocksdb/env.h" @@ -24,11 +24,26 @@ namespace rocksdb { +// The file contains an abstract class CompactionPicker, and its two +// sub-classes LevelCompactionPicker and NullCompactionPicker, as +// well as some helper functions used by them. + class LogBuffer; class Compaction; class VersionStorageInfo; struct CompactionInputFiles; +// An abstract class to pick compactions from an existing LSM-tree. +// +// Each compaction style inherits the class and implement the +// interface to form automatic compactions. If NeedCompaction() is true, +// then call PickCompaction() to find what files need to be compacted +// and where to put the output files. +// +// Non-virtual functions CompactRange() and CompactFiles() are used to +// pick files to compact based on users' DB::CompactRange() and +// DB::CompactFiles() requests, respectively. There is little +// compaction style specific logic for them. class CompactionPicker { public: CompactionPicker(const ImmutableCFOptions& ioptions, @@ -39,10 +54,10 @@ class CompactionPicker { // Returns nullptr if there is no compaction to be done. // Otherwise returns a pointer to a heap-allocated object that // describes the compaction. Caller should delete the result. - virtual Compaction* PickCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, - LogBuffer* log_buffer) = 0; + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0; // Return a compaction object for compacting the range [begin,end] in // the specified level. Returns nullptr if there is nothing in that @@ -221,21 +236,9 @@ class CompactionPicker { const InternalKeyComparator* const icmp_; }; -class LevelCompactionPicker : public CompactionPicker { - public: - LevelCompactionPicker(const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icmp) - : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, - LogBuffer* log_buffer) override; - - virtual bool NeedsCompaction( - const VersionStorageInfo* vstorage) const override; -}; - #ifndef ROCKSDB_LITE +// A dummy compaction that never triggers any automatic +// compaction. 
class NullCompactionPicker : public CompactionPicker { public: NullCompactionPicker(const ImmutableCFOptions& ioptions, @@ -244,10 +247,11 @@ class NullCompactionPicker : public CompactionPicker { virtual ~NullCompactionPicker() {} // Always return "nullptr" - Compaction* PickCompaction(const std::string& /*cf_name*/, - const MutableCFOptions& /*mutable_cf_options*/, - VersionStorageInfo* /*vstorage*/, - LogBuffer* /*log_buffer*/) override { + Compaction* PickCompaction( + const std::string& /*cf_name*/, + const MutableCFOptions& /*mutable_cf_options*/, + VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */, + SequenceNumber /* earliest_memtable_seqno */) override { return nullptr; } @@ -273,10 +277,27 @@ class NullCompactionPicker : public CompactionPicker { }; #endif // !ROCKSDB_LITE -bool FindIntraL0Compaction(const std::vector& level_files, - size_t min_files_to_compact, - uint64_t max_compact_bytes_per_del_file, - CompactionInputFiles* comp_inputs); +// Attempts to find an intra L0 compaction conforming to the given parameters. +// +// @param level_files Metadata for L0 files. +// @param min_files_to_compact Minimum number of files required to +// do the compaction. +// @param max_compact_bytes_per_del_file Maximum average size in bytes per +// file that is going to get deleted by +// the compaction. +// @param max_compaction_bytes Maximum total size in bytes (in terms +// of compensated file size) for files +// to be compacted. +// @param [out] comp_inputs If a compaction was found, will be +// initialized with corresponding input +// files. Cannot be nullptr. +// +// @return true iff compaction was found. +bool FindIntraL0Compaction( + const std::vector& level_files, size_t min_files_to_compact, + uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes, + CompactionInputFiles* comp_inputs, + SequenceNumber earliest_mem_seqno = kMaxSequenceNumber); CompressionType GetCompressionType(const ImmutableCFOptions& ioptions, const VersionStorageInfo* vstorage, diff --git a/db/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc similarity index 93% rename from db/compaction_picker_fifo.cc rename to db/compaction/compaction_picker_fifo.cc index 1322989e568..cdf5e46dab6 100644 --- a/db/compaction_picker_fifo.cc +++ b/db/compaction/compaction_picker_fifo.cc @@ -7,18 +7,14 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/compaction_picker_fifo.h" +#include "db/compaction/compaction_picker_fifo.h" #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include "db/column_family.h" -#include "util/log_buffer.h" +#include "logging/log_buffer.h" #include "util/string_util.h" namespace rocksdb { @@ -58,6 +54,15 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction( } const uint64_t current_time = static_cast(_current_time); + if (!level0_compactions_in_progress_.empty()) { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: Already executing compaction. 
No need " + "to run parallel compactions since compactions are very fast", + cf_name.c_str()); + return nullptr; + } + std::vector inputs; inputs.emplace_back(); inputs[0].level = 0; @@ -134,7 +139,8 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( mutable_cf_options .level0_file_num_compaction_trigger /* min_files_to_compact */ , - max_compact_bytes_per_del_file, &comp_inputs)) { + max_compact_bytes_per_del_file, + mutable_cf_options.max_compaction_bytes, &comp_inputs)) { Compaction* c = new Compaction( vstorage, ioptions_, mutable_cf_options, {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */, @@ -196,7 +202,8 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( Compaction* FIFOCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer) { + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber /*earliest_memtable_seqno*/) { assert(vstorage->num_levels() == 1); Compaction* c = nullptr; diff --git a/db/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h similarity index 86% rename from db/compaction_picker_fifo.h rename to db/compaction/compaction_picker_fifo.h index 9da107c5d4a..065faef1398 100644 --- a/db/compaction_picker_fifo.h +++ b/db/compaction/compaction_picker_fifo.h @@ -10,7 +10,7 @@ #pragma once #ifndef ROCKSDB_LITE -#include "db/compaction_picker.h" +#include "db/compaction/compaction_picker.h" namespace rocksdb { class FIFOCompactionPicker : public CompactionPicker { @@ -19,10 +19,10 @@ class FIFOCompactionPicker : public CompactionPicker { const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* version, - LogBuffer* log_buffer) override; + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* version, LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; virtual Compaction* CompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc new file mode 100644 index 00000000000..4c2afa66709 --- /dev/null +++ b/db/compaction/compaction_picker_level.cc @@ -0,0 +1,558 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include +#include + +#include "db/compaction/compaction_picker_level.h" +#include "logging/log_buffer.h" +#include "test_util/sync_point.h" + +namespace rocksdb { + +bool LevelCompactionPicker::NeedsCompaction( + const VersionStorageInfo* vstorage) const { + if (!vstorage->ExpiredTtlFiles().empty()) { + return true; + } + if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) { + return true; + } + if (!vstorage->BottommostFilesMarkedForCompaction().empty()) { + return true; + } + if (!vstorage->FilesMarkedForCompaction().empty()) { + return true; + } + for (int i = 0; i <= vstorage->MaxInputLevel(); i++) { + if (vstorage->CompactionScore(i) >= 1) { + return true; + } + } + return false; +} + +namespace { +// A class to build a leveled compaction step-by-step. +class LevelCompactionBuilder { + public: + LevelCompactionBuilder(const std::string& cf_name, + VersionStorageInfo* vstorage, + SequenceNumber earliest_mem_seqno, + CompactionPicker* compaction_picker, + LogBuffer* log_buffer, + const MutableCFOptions& mutable_cf_options, + const ImmutableCFOptions& ioptions) + : cf_name_(cf_name), + vstorage_(vstorage), + earliest_mem_seqno_(earliest_mem_seqno), + compaction_picker_(compaction_picker), + log_buffer_(log_buffer), + mutable_cf_options_(mutable_cf_options), + ioptions_(ioptions) {} + + // Pick and return a compaction. + Compaction* PickCompaction(); + + // Pick the initial files to compact to the next level. (or together + // in Intra-L0 compactions) + void SetupInitialFiles(); + + // If the initial files are from L0 level, pick other L0 + // files if needed. + bool SetupOtherL0FilesIfNeeded(); + + // Based on initial files, setup other files need to be compacted + // in this compaction, accordingly. + bool SetupOtherInputsIfNeeded(); + + Compaction* GetCompaction(); + + // For the specfied level, pick a file that we want to compact. + // Returns false if there is no file to compact. + // If it returns true, inputs->files.size() will be exactly one. + // If level is 0 and there is already a compaction on that level, this + // function will return false. + bool PickFileToCompact(); + + // For L0->L0, picks the longest span of files that aren't currently + // undergoing compaction for which work-per-deleted-file decreases. The span + // always starts from the newest L0 file. + // + // Intra-L0 compaction is independent of all other files, so it can be + // performed even when L0->base_level compactions are blocked. + // + // Returns true if `inputs` is populated with a span of files to be compacted; + // otherwise, returns false. 
+ bool PickIntraL0Compaction(); + + void PickExpiredTtlFiles(); + + void PickFilesMarkedForPeriodicCompaction(); + + const std::string& cf_name_; + VersionStorageInfo* vstorage_; + SequenceNumber earliest_mem_seqno_; + CompactionPicker* compaction_picker_; + LogBuffer* log_buffer_; + int start_level_ = -1; + int output_level_ = -1; + int parent_index_ = -1; + int base_index_ = -1; + double start_level_score_ = 0; + bool is_manual_ = false; + CompactionInputFiles start_level_inputs_; + std::vector compaction_inputs_; + CompactionInputFiles output_level_inputs_; + std::vector grandparents_; + CompactionReason compaction_reason_ = CompactionReason::kUnknown; + + const MutableCFOptions& mutable_cf_options_; + const ImmutableCFOptions& ioptions_; + // Pick a path ID to place a newly generated file, with its level + static uint32_t GetPathId(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + int level); + + static const int kMinFilesForIntraL0Compaction = 4; +}; + +void LevelCompactionBuilder::PickExpiredTtlFiles() { + if (vstorage_->ExpiredTtlFiles().empty()) { + return; + } + + auto continuation = [&](std::pair level_file) { + // If it's being compacted it has nothing to do here. + // If this assert() fails that means that some function marked some + // files as being_compacted, but didn't call ComputeCompactionScore() + assert(!level_file.second->being_compacted); + start_level_ = level_file.first; + output_level_ = + (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; + + if ((start_level_ == vstorage_->num_non_empty_levels() - 1) || + (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty())) { + return false; + } + + start_level_inputs_.files = {level_file.second}; + start_level_inputs_.level = start_level_; + return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_); + }; + + for (auto& level_file : vstorage_->ExpiredTtlFiles()) { + if (continuation(level_file)) { + // found the compaction! + return; + } + } + + start_level_inputs_.files.clear(); +} + +void LevelCompactionBuilder::PickFilesMarkedForPeriodicCompaction() { + if (vstorage_->FilesMarkedForPeriodicCompaction().empty()) { + return; + } + + auto continuation = [&](std::pair level_file) { + // If it's being compacted it has nothing to do here. + // If this assert() fails that means that some function marked some + // files as being_compacted, but didn't call ComputeCompactionScore() + assert(!level_file.second->being_compacted); + output_level_ = start_level_ = level_file.first; + + if (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty()) { + return false; + } + + start_level_inputs_.files = {level_file.second}; + start_level_inputs_.level = start_level_; + return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_); + }; + + for (auto& level_file : vstorage_->FilesMarkedForPeriodicCompaction()) { + if (continuation(level_file)) { + // found the compaction! + return; + } + } + + start_level_inputs_.files.clear(); +} + +void LevelCompactionBuilder::SetupInitialFiles() { + // Find the compactions by size on all levels. 
+ bool skipped_l0_to_base = false; + for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) { + start_level_score_ = vstorage_->CompactionScore(i); + start_level_ = vstorage_->CompactionScoreLevel(i); + assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1)); + if (start_level_score_ >= 1) { + if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) { + // If L0->base_level compaction is pending, don't schedule further + // compaction from base level. Otherwise L0->base_level compaction + // may starve. + continue; + } + output_level_ = + (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; + if (PickFileToCompact()) { + // found the compaction! + if (start_level_ == 0) { + // L0 score = `num L0 files` / `level0_file_num_compaction_trigger` + compaction_reason_ = CompactionReason::kLevelL0FilesNum; + } else { + // L1+ score = `Level files size` / `MaxBytesForLevel` + compaction_reason_ = CompactionReason::kLevelMaxLevelSize; + } + break; + } else { + // didn't find the compaction, clear the inputs + start_level_inputs_.clear(); + if (start_level_ == 0) { + skipped_l0_to_base = true; + // L0->base_level may be blocked due to ongoing L0->base_level + // compactions. It may also be blocked by an ongoing compaction from + // base_level downwards. + // + // In these cases, to reduce L0 file count and thus reduce likelihood + // of write stalls, we can attempt compacting a span of files within + // L0. + if (PickIntraL0Compaction()) { + output_level_ = 0; + compaction_reason_ = CompactionReason::kLevelL0FilesNum; + break; + } + } + } + } + } + + // if we didn't find a compaction, check if there are any files marked for + // compaction + if (start_level_inputs_.empty()) { + parent_index_ = base_index_ = -1; + + compaction_picker_->PickFilesMarkedForCompaction( + cf_name_, vstorage_, &start_level_, &output_level_, + &start_level_inputs_); + if (!start_level_inputs_.empty()) { + is_manual_ = true; + compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; + return; + } + } + + // Bottommost Files Compaction on deleting tombstones + if (start_level_inputs_.empty()) { + size_t i; + for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size(); + ++i) { + auto& level_and_file = vstorage_->BottommostFilesMarkedForCompaction()[i]; + assert(!level_and_file.second->being_compacted); + start_level_inputs_.level = output_level_ = start_level_ = + level_and_file.first; + start_level_inputs_.files = {level_and_file.second}; + if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_)) { + break; + } + } + if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) { + start_level_inputs_.clear(); + } else { + assert(!start_level_inputs_.empty()); + compaction_reason_ = CompactionReason::kBottommostFiles; + return; + } + } + + // TTL Compaction + if (start_level_inputs_.empty()) { + PickExpiredTtlFiles(); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kTtl; + return; + } + } + + // Periodic Compaction + if (start_level_inputs_.empty()) { + PickFilesMarkedForPeriodicCompaction(); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kPeriodicCompaction; + return; + } + } +} + +bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() { + if (start_level_ == 0 && output_level_ != 0) { + return compaction_picker_->GetOverlappingL0Files( + vstorage_, &start_level_inputs_, output_level_, &parent_index_); + } + return true; +} + +bool 
LevelCompactionBuilder::SetupOtherInputsIfNeeded() { + // Setup input files from output level. For output to L0, we only compact + // spans of files that do not interact with any pending compactions, so don't + // need to consider other levels. + if (output_level_ != 0) { + output_level_inputs_.level = output_level_; + if (!compaction_picker_->SetupOtherInputs( + cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_, + &output_level_inputs_, &parent_index_, base_index_)) { + return false; + } + + compaction_inputs_.push_back(start_level_inputs_); + if (!output_level_inputs_.empty()) { + compaction_inputs_.push_back(output_level_inputs_); + } + + // In some edge cases we could pick a compaction that will be compacting + // a key range that overlap with another running compaction, and both + // of them have the same output level. This could happen if + // (1) we are running a non-exclusive manual compaction + // (2) AddFile ingest a new file into the LSM tree + // We need to disallow this from happening. + if (compaction_picker_->FilesRangeOverlapWithCompaction(compaction_inputs_, + output_level_)) { + // This compaction output could potentially conflict with the output + // of a currently running compaction, we cannot run it. + return false; + } + compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_, + output_level_inputs_, &grandparents_); + } else { + compaction_inputs_.push_back(start_level_inputs_); + } + return true; +} + +Compaction* LevelCompactionBuilder::PickCompaction() { + // Pick up the first file to start compaction. It may have been extended + // to a clean cut. + SetupInitialFiles(); + if (start_level_inputs_.empty()) { + return nullptr; + } + assert(start_level_ >= 0 && output_level_ >= 0); + + // If it is a L0 -> base level compaction, we need to set up other L0 + // files if needed. + if (!SetupOtherL0FilesIfNeeded()) { + return nullptr; + } + + // Pick files in the output level and expand more files in the start level + // if needed. + if (!SetupOtherInputsIfNeeded()) { + return nullptr; + } + + // Form a compaction object containing the files we picked. + Compaction* c = GetCompaction(); + + TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c); + + return c; +} + +Compaction* LevelCompactionBuilder::GetCompaction() { + auto c = new Compaction( + vstorage_, ioptions_, mutable_cf_options_, std::move(compaction_inputs_), + output_level_, + MaxFileSizeForLevel(mutable_cf_options_, output_level_, + ioptions_.compaction_style, vstorage_->base_level(), + ioptions_.level_compaction_dynamic_level_bytes), + mutable_cf_options_.max_compaction_bytes, + GetPathId(ioptions_, mutable_cf_options_, output_level_), + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, + output_level_, vstorage_->base_level()), + GetCompressionOptions(ioptions_, vstorage_, output_level_), + /* max_subcompactions */ 0, std::move(grandparents_), is_manual_, + start_level_score_, false /* deletion_compaction */, compaction_reason_); + + // If it's level 0 compaction, make sure we don't execute any other level 0 + // compactions in parallel + compaction_picker_->RegisterCompaction(c); + + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). 
Since we just changed compaction score, we recalculate it + // here + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + return c; +} + +/* + * Find the optimal path to place a file + * Given a level, finds the path where levels up to it will fit in levels + * up to and including this path + */ +uint32_t LevelCompactionBuilder::GetPathId( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, int level) { + uint32_t p = 0; + assert(!ioptions.cf_paths.empty()); + + // size remaining in the most recent path + uint64_t current_path_size = ioptions.cf_paths[0].target_size; + + uint64_t level_size; + int cur_level = 0; + + // max_bytes_for_level_base denotes L1 size. + // We estimate L0 size to be the same as L1. + level_size = mutable_cf_options.max_bytes_for_level_base; + + // Last path is the fallback + while (p < ioptions.cf_paths.size() - 1) { + if (level_size <= current_path_size) { + if (cur_level == level) { + // Does desired level fit in this path? + return p; + } else { + current_path_size -= level_size; + if (cur_level > 0) { + if (ioptions.level_compaction_dynamic_level_bytes) { + // Currently, level_compaction_dynamic_level_bytes is ignored when + // multiple db paths are specified. https://github.com/facebook/ + // rocksdb/blob/master/db/column_family.cc. + // Still, adding this check to avoid accidentally using + // max_bytes_for_level_multiplier_additional + level_size = static_cast( + level_size * mutable_cf_options.max_bytes_for_level_multiplier); + } else { + level_size = static_cast( + level_size * mutable_cf_options.max_bytes_for_level_multiplier * + mutable_cf_options.MaxBytesMultiplerAdditional(cur_level)); + } + } + cur_level++; + continue; + } + } + p++; + current_path_size = ioptions.cf_paths[p].target_size; + } + return p; +} + +bool LevelCompactionBuilder::PickFileToCompact() { + // level 0 files are overlapping. So we cannot pick more + // than one concurrent compactions at this level. This + // could be made better by looking at key-ranges that are + // being compacted at level 0. + if (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty()) { + TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0"); + return false; + } + + start_level_inputs_.clear(); + + assert(start_level_ >= 0); + + // Pick the largest file in this level that is not already + // being compacted + const std::vector& file_size = + vstorage_->FilesByCompactionPri(start_level_); + const std::vector& level_files = + vstorage_->LevelFiles(start_level_); + + unsigned int cmp_idx; + for (cmp_idx = vstorage_->NextCompactionIndex(start_level_); + cmp_idx < file_size.size(); cmp_idx++) { + int index = file_size[cmp_idx]; + auto* f = level_files[index]; + + // do not pick a file to compact if it is being compacted + // from n-1 level. + if (f->being_compacted) { + continue; + } + + start_level_inputs_.files.push_back(f); + start_level_inputs_.level = start_level_; + if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_) || + compaction_picker_->FilesRangeOverlapWithCompaction( + {start_level_inputs_}, output_level_)) { + // A locked (pending compaction) input-level file was pulled in due to + // user-key overlap. + start_level_inputs_.clear(); + continue; + } + + // Now that input level is fully expanded, we check whether any output files + // are locked due to pending compaction. 
+ // + // Note we rely on ExpandInputsToCleanCut() to tell us whether any output- + // level files are locked, not just the extra ones pulled in for user-key + // overlap. + InternalKey smallest, largest; + compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest); + CompactionInputFiles output_level_inputs; + output_level_inputs.level = output_level_; + vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest, + &output_level_inputs.files); + if (!output_level_inputs.empty() && + !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &output_level_inputs)) { + start_level_inputs_.clear(); + continue; + } + base_index_ = index; + break; + } + + // store where to start the iteration in the next call to PickCompaction + vstorage_->SetNextCompactionIndex(start_level_, cmp_idx); + + return start_level_inputs_.size() > 0; +} + +bool LevelCompactionBuilder::PickIntraL0Compaction() { + start_level_inputs_.clear(); + const std::vector& level_files = + vstorage_->LevelFiles(0 /* level */); + if (level_files.size() < + static_cast( + mutable_cf_options_.level0_file_num_compaction_trigger + 2) || + level_files[0]->being_compacted) { + // If L0 isn't accumulating much files beyond the regular trigger, don't + // resort to L0->L0 compaction yet. + return false; + } + return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction, + port::kMaxUint64, + mutable_cf_options_.max_compaction_bytes, + &start_level_inputs_, earliest_mem_seqno_); +} +} // namespace + +Compaction* LevelCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber earliest_mem_seqno) { + LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this, + log_buffer, mutable_cf_options, ioptions_); + return builder.PickCompaction(); +} +} // namespace rocksdb diff --git a/db/compaction/compaction_picker_level.h b/db/compaction/compaction_picker_level.h new file mode 100644 index 00000000000..c8d905ef90f --- /dev/null +++ b/db/compaction/compaction_picker_level.h @@ -0,0 +1,32 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "db/compaction/compaction_picker.h" + +namespace rocksdb { +// Picking compactions for leveled compaction. See wiki page +// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction +// for description of Leveled compaction. 
+class LevelCompactionPicker : public CompactionPicker { + public: + LevelCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; +}; + +} // namespace rocksdb diff --git a/db/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc similarity index 83% rename from db/compaction_picker_test.cc rename to db/compaction/compaction_picker_test.cc index 31325c12893..5cb3350d648 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -3,19 +3,19 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "db/compaction_picker.h" #include #include #include -#include "db/compaction.h" -#include "db/compaction_picker_fifo.h" -#include "db/compaction_picker_universal.h" - -#include "util/logging.h" +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_picker_fifo.h" +#include "db/compaction/compaction_picker_level.h" +#include "db/compaction/compaction_picker_universal.h" + +#include "logging/logging.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -57,6 +57,8 @@ class CompactionPickerTest : public testing::Test { log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_), file_num_(1), vstorage_(nullptr) { + mutable_cf_options_.ttl = 0; + mutable_cf_options_.periodic_compaction_seconds = 0; // ioptions_.compaction_pri = kMinOverlappingRatio has its own set of // tests to cover. ioptions_.compaction_pri = kByCompensatedSize; @@ -88,15 +90,14 @@ class CompactionPickerTest : public testing::Test { SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100, size_t compensated_file_size = 0) { assert(level < vstorage_->num_levels()); - FileMetaData* f = new FileMetaData; - f->fd = FileDescriptor(file_number, path_id, file_size); - f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); - f->largest = InternalKey(largest, largest_seq, kTypeValue); - f->fd.smallest_seqno = smallest_seq; - f->fd.largest_seqno = largest_seq; + FileMetaData* f = new FileMetaData( + file_number, path_id, file_size, + InternalKey(smallest, smallest_seq, kTypeValue), + InternalKey(largest, largest_seq, kTypeValue), smallest_seq, + largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); f->compensated_file_size = (compensated_file_size != 0) ? compensated_file_size : file_size; - f->refs = 0; vstorage_->AddFile(level, f); files_.emplace_back(f); file_map_.insert({file_number, {f, level}}); @@ -501,6 +502,168 @@ TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) { ASSERT_TRUE(compaction->is_trivial_move()); } +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction1) { + // The case where universal periodic compaction can be picked + // with some newer files being compacted. 
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+  Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+  Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+  Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+  Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+  file_map_[2].first->being_compacted = true;
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[3].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(4, compaction->output_level());
+  ASSERT_EQ(0, compaction->start_level());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction2) {
+  // The case where universal periodic compaction does not pick a compaction
+  // if the levels it would compact don't cover any file marked for
+  // periodic compaction.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+  Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+  Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+  file_map_[5].first->being_compacted = true;
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+  ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) {
+  // The case where universal periodic compaction does not pick up only the
+  // last sorted run (a single L0 file) when that file isn't marked for
+  // periodic compaction.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 5U, "010", "080", kFileSize, 0, 200, 251);
+  Add(0, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+  file_map_[5].first->being_compacted = true;
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+  ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) {
+  // The case where universal periodic compaction couldn't form a compaction
+  // that includes any file marked for periodic compaction. Right now we form
+  // the compaction anyway if it covers more than one sorted run. The case is
+  // here to validate that it doesn't
+  // crash.
+ const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(2, 2U, "010", "080", kFileSize, 0, 200, 251); + Add(3, 5U, "010", "080", kFileSize, 0, 200, 251); + Add(4, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", kFileSize, 0, 101, 150); + + file_map_[2].first->being_compacted = true; + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[2].first); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(!compaction || + compaction->start_level() != compaction->output_level()); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) { + // Test single L0 file periodic compaction triggering. + const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 6U, "150", "200", kFileSize, 0, 500, 550); + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[6].first); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(4, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction6) { + // Test single sorted run non-L0 periodic compaction + const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(4, 5U, "150", "200", kFileSize, 0, 500, 550); + Add(4, 6U, "350", "400", kFileSize, 0, 500, 550); + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[6].first); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->start_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(4, compaction->output_level()); +} + TEST_F(CompactionPickerTest, NeedsCompactionFIFO) { NewVersionStorage(1, kCompactionStyleFIFO); const int kFileCount = @@ -1478,6 +1641,97 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) { ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */)); } +TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 1000000u; + NewVersionStorage(6, kCompactionStyleLevel); + + // All 5 L0 files will be picked for intra L0 compaction. The one L1 file + // spans entire L0 key range and is marked as being compacted to avoid + // L0->L1 compaction. 
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101); + Add(0, 2U, "151", "200", 200000U, 0, 102, 103); + Add(0, 3U, "201", "250", 200000U, 0, 104, 105); + Add(0, 4U, "251", "300", 200000U, 0, 106, 107); + Add(0, 5U, "301", "350", 200000U, 0, 108, 109); + Add(1, 6U, "100", "350", 200000U, 0, 110, 111); + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(5U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 999999u; + NewVersionStorage(6, kCompactionStyleLevel); + + // 4 out of 5 L0 files will be picked for intra L0 compaction due to + // max_compaction_bytes limit (the minimum number of files for triggering + // intra L0 compaction is 4). The one L1 file spans entire L0 key range and + // is marked as being compacted to avoid L0->L1 compaction. + Add(0, 1U, "100", "150", 200000U, 0, 100, 101); + Add(0, 2U, "151", "200", 200000U, 0, 102, 103); + Add(0, 3U, "201", "250", 200000U, 0, 104, 105); + Add(0, 4U, "251", "300", 200000U, 0, 106, 107); + Add(0, 5U, "301", "350", 200000U, 0, 108, 109); + Add(1, 6U, "100", "350", 200000U, 0, 109, 110); + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, IntraL0ForEarliestSeqno) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 999999u; + NewVersionStorage(6, kCompactionStyleLevel); + + // 4 out of 6 L0 files will be picked for intra L0 compaction due to + // being_compact limit. And the latest one L0 will be skipped due to earliest + // seqno. The one L1 file spans entire L0 key range and is marked as being + // compacted to avoid L0->L1 compaction. 
+ Add(1, 1U, "100", "350", 200000U, 0, 110, 111); + Add(0, 2U, "301", "350", 1U, 0, 108, 109); + Add(0, 3U, "251", "300", 1U, 0, 106, 107); + Add(0, 4U, "201", "250", 1U, 0, 104, 105); + Add(0, 5U, "151", "200", 1U, 0, 102, 103); + Add(0, 6U, "100", "150", 1U, 0, 100, 101); + Add(0, 7U, "100", "100", 1U, 0, 99, 100); + vstorage_->LevelFiles(0)[5]->being_compacted = true; + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_, 107)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc similarity index 57% rename from db/compaction_picker_universal.cc rename to db/compaction/compaction_picker_universal.cc index 9291178585a..473a480cbc2 100644 --- a/db/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -7,28 +7,129 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/compaction_picker_universal.h" +#include "db/compaction/compaction_picker_universal.h" #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include #include #include "db/column_family.h" +#include "file/filename.h" +#include "logging/log_buffer.h" #include "monitoring/statistics.h" -#include "util/filename.h" -#include "util/log_buffer.h" +#include "test_util/sync_point.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { namespace { +// A helper class that form universal compactions. The class is used by +// UniversalCompactionPicker::PickCompaction(). +// The usage is to create the class, and get the compaction object by calling +// PickCompaction(). +class UniversalCompactionBuilder { + public: + UniversalCompactionBuilder(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp, + const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, + UniversalCompactionPicker* picker, + LogBuffer* log_buffer) + : ioptions_(ioptions), + icmp_(icmp), + cf_name_(cf_name), + mutable_cf_options_(mutable_cf_options), + vstorage_(vstorage), + picker_(picker), + log_buffer_(log_buffer) {} + + // Form and return the compaction object. The caller owns return object. + Compaction* PickCompaction(); + + private: + struct SortedRun { + SortedRun(int _level, FileMetaData* _file, uint64_t _size, + uint64_t _compensated_file_size, bool _being_compacted) + : level(_level), + file(_file), + size(_size), + compensated_file_size(_compensated_file_size), + being_compacted(_being_compacted) { + assert(compensated_file_size > 0); + assert(level != 0 || file != nullptr); + } + + void Dump(char* out_buf, size_t out_buf_size, + bool print_path = false) const; + + // sorted_run_count is added into the string to print + void DumpSizeInfo(char* out_buf, size_t out_buf_size, + size_t sorted_run_count) const; + + int level; + // `file` Will be null for level > 0. 
For level = 0, the sorted run is + // for this file. + FileMetaData* file; + // For level > 0, `size` and `compensated_file_size` are sum of sizes all + // files in the level. `being_compacted` should be the same for all files + // in a non-zero level. Use the value here. + uint64_t size; + uint64_t compensated_file_size; + bool being_compacted; + }; + + // Pick Universal compaction to limit read amplification + Compaction* PickCompactionToReduceSortedRuns( + unsigned int ratio, unsigned int max_number_of_files_to_compact); + + // Pick Universal compaction to limit space amplification. + Compaction* PickCompactionToReduceSizeAmp(); + + Compaction* PickDeleteTriggeredCompaction(); + + // Form a compaction from the sorted run indicated by start_index to the + // oldest sorted run. + // The caller is responsible for making sure that those files are not in + // compaction. + Compaction* PickCompactionToOldest(size_t start_index, + CompactionReason compaction_reason); + + // Try to pick periodic compaction. The caller should only call it + // if there is at least one file marked for periodic compaction. + // null will be returned if no such a compaction can be formed + // because some files are being compacted. + Compaction* PickPeriodicCompaction(); + + // Used in universal compaction when the enabled_trivial_move + // option is set. Checks whether there are any overlapping files + // in the input. Returns true if the input files are non + // overlapping. + bool IsInputFilesNonOverlapping(Compaction* c); + + const ImmutableCFOptions& ioptions_; + const InternalKeyComparator* icmp_; + double score_; + std::vector sorted_runs_; + const std::string& cf_name_; + const MutableCFOptions& mutable_cf_options_; + VersionStorageInfo* vstorage_; + UniversalCompactionPicker* picker_; + LogBuffer* log_buffer_; + + static std::vector CalculateSortedRuns( + const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options); + + // Pick a path ID to place a newly generated file, with its estimated file + // size. + static uint32_t GetPathId(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + uint64_t file_size); +}; + // Used in universal compaction when trivial move is enabled. 
// This structure is used for the construction of min heap // that contains the file meta data, the level of the file @@ -117,7 +218,7 @@ void GetSmallestLargestSeqno(const std::vector& files, // Algorithm that checks to see if there are any overlapping // files in the input -bool UniversalCompactionPicker::IsInputFilesNonOverlapping(Compaction* c) { +bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) { auto comparator = icmp_->user_comparator(); int first_iter = 1; @@ -165,15 +266,28 @@ bool UniversalCompactionPicker::NeedsCompaction( if (vstorage->CompactionScore(kLevel0) >= 1) { return true; } + if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) { + return true; + } if (!vstorage->FilesMarkedForCompaction().empty()) { return true; } return false; } -void UniversalCompactionPicker::SortedRun::Dump(char* out_buf, - size_t out_buf_size, - bool print_path) const { +Compaction* UniversalCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber /* earliest_memtable_seqno */) { + UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name, + mutable_cf_options, vstorage, this, + log_buffer); + return builder.PickCompaction(); +} + +void UniversalCompactionBuilder::SortedRun::Dump(char* out_buf, + size_t out_buf_size, + bool print_path) const { if (level == 0) { assert(file != nullptr); if (file->fd.GetPathId() == 0 || !print_path) { @@ -189,7 +303,7 @@ void UniversalCompactionPicker::SortedRun::Dump(char* out_buf, } } -void UniversalCompactionPicker::SortedRun::DumpSizeInfo( +void UniversalCompactionBuilder::SortedRun::DumpSizeInfo( char* out_buf, size_t out_buf_size, size_t sorted_run_count) const { if (level == 0) { assert(file != nullptr); @@ -208,11 +322,11 @@ void UniversalCompactionPicker::SortedRun::DumpSizeInfo( } } -std::vector -UniversalCompactionPicker::CalculateSortedRuns( +std::vector +UniversalCompactionBuilder::CalculateSortedRuns( const VersionStorageInfo& vstorage, const ImmutableCFOptions& /*ioptions*/, const MutableCFOptions& mutable_cf_options) { - std::vector ret; + std::vector ret; for (FileMetaData* f : vstorage.LevelFiles(0)) { ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size, f->being_compacted); @@ -235,8 +349,8 @@ UniversalCompactionPicker::CalculateSortedRuns( // non-zero level, all the files should share the same being_compacted // value. // This assumption is only valid when - // mutable_cf_options.compaction_options_universal.allow_trivial_move is - // false + // mutable_cf_options.compaction_options_universal.allow_trivial_move + // is false assert(is_first || f->being_compacted == being_compacted); } if (is_first) { @@ -254,65 +368,68 @@ UniversalCompactionPicker::CalculateSortedRuns( // Universal style of compaction. Pick files that are contiguous in // time-range to compact. 
-Compaction* UniversalCompactionPicker::PickCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer) { +Compaction* UniversalCompactionBuilder::PickCompaction() { const int kLevel0 = 0; - double score = vstorage->CompactionScore(kLevel0); - std::vector sorted_runs = - CalculateSortedRuns(*vstorage, ioptions_, mutable_cf_options); - - if (sorted_runs.size() == 0 || - (vstorage->FilesMarkedForCompaction().empty() && - sorted_runs.size() < (unsigned int)mutable_cf_options - .level0_file_num_compaction_trigger)) { - ROCKS_LOG_BUFFER(log_buffer, "[%s] Universal: nothing to do\n", - cf_name.c_str()); - TEST_SYNC_POINT_CALLBACK("UniversalCompactionPicker::PickCompaction:Return", - nullptr); + score_ = vstorage_->CompactionScore(kLevel0); + sorted_runs_ = + CalculateSortedRuns(*vstorage_, ioptions_, mutable_cf_options_); + + if (sorted_runs_.size() == 0 || + (vstorage_->FilesMarkedForPeriodicCompaction().empty() && + vstorage_->FilesMarkedForCompaction().empty() && + sorted_runs_.size() < (unsigned int)mutable_cf_options_ + .level0_file_num_compaction_trigger)) { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: nothing to do\n", + cf_name_.c_str()); + TEST_SYNC_POINT_CALLBACK( + "UniversalCompactionBuilder::PickCompaction:Return", nullptr); return nullptr; } VersionStorageInfo::LevelSummaryStorage tmp; ROCKS_LOG_BUFFER_MAX_SZ( - log_buffer, 3072, + log_buffer_, 3072, "[%s] Universal: sorted runs files(%" ROCKSDB_PRIszt "): %s\n", - cf_name.c_str(), sorted_runs.size(), vstorage->LevelSummary(&tmp)); + cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp)); - // Check for size amplification first. Compaction* c = nullptr; - if (sorted_runs.size() >= - static_cast( - mutable_cf_options.level0_file_num_compaction_trigger)) { - if ((c = PickCompactionToReduceSizeAmp(cf_name, mutable_cf_options, - vstorage, score, sorted_runs, - log_buffer)) != nullptr) { - ROCKS_LOG_BUFFER(log_buffer, "[%s] Universal: compacting for size amp\n", - cf_name.c_str()); + // Periodic compaction has higher priority than other type of compaction + // because it's a hard requirement. + if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) { + // Always need to do a full compaction for periodic compaction. + c = PickPeriodicCompaction(); + } + + // Check for size amplification. + if (c == nullptr && + sorted_runs_.size() >= + static_cast( + mutable_cf_options_.level0_file_num_compaction_trigger)) { + if ((c = PickCompactionToReduceSizeAmp()) != nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n", + cf_name_.c_str()); } else { // Size amplification is within limits. Try reducing read // amplification while maintaining file size ratios. unsigned int ratio = - mutable_cf_options.compaction_options_universal.size_ratio; + mutable_cf_options_.compaction_options_universal.size_ratio; - if ((c = PickCompactionToReduceSortedRuns( - cf_name, mutable_cf_options, vstorage, score, ratio, UINT_MAX, - sorted_runs, log_buffer)) != nullptr) { - ROCKS_LOG_BUFFER(log_buffer, + if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size ratio\n", - cf_name.c_str()); + cf_name_.c_str()); } else { // Size amplification and file size ratios are within configured limits. 
// If max read amplification is exceeding configured limits, then force // compaction without looking at filesize ratios and try to reduce // the number of files to fewer than level0_file_num_compaction_trigger. // This is guaranteed by NeedsCompaction() - assert(sorted_runs.size() >= + assert(sorted_runs_.size() >= static_cast( - mutable_cf_options.level0_file_num_compaction_trigger)); + mutable_cf_options_.level0_file_num_compaction_trigger)); // Get the total number of sorted runs that are not being compacted int num_sr_not_compacted = 0; - for (size_t i = 0; i < sorted_runs.size(); i++) { - if (sorted_runs[i].being_compacted == false) { + for (size_t i = 0; i < sorted_runs_.size(); i++) { + if (sorted_runs_[i].being_compacted == false) { num_sr_not_compacted++; } } @@ -320,16 +437,15 @@ Compaction* UniversalCompactionPicker::PickCompaction( // The number of sorted runs that are not being compacted is greater // than the maximum allowed number of sorted runs if (num_sr_not_compacted > - mutable_cf_options.level0_file_num_compaction_trigger) { + mutable_cf_options_.level0_file_num_compaction_trigger) { unsigned int num_files = num_sr_not_compacted - - mutable_cf_options.level0_file_num_compaction_trigger + 1; - if ((c = PickCompactionToReduceSortedRuns( - cf_name, mutable_cf_options, vstorage, score, UINT_MAX, - num_files, sorted_runs, log_buffer)) != nullptr) { - ROCKS_LOG_BUFFER(log_buffer, + mutable_cf_options_.level0_file_num_compaction_trigger + 1; + if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) != + nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for file num -- %u\n", - cf_name.c_str(), num_files); + cf_name_.c_str(), num_files); } } } @@ -337,23 +453,22 @@ Compaction* UniversalCompactionPicker::PickCompaction( } if (c == nullptr) { - if ((c = PickDeleteTriggeredCompaction(cf_name, mutable_cf_options, - vstorage, score, sorted_runs, - log_buffer)) != nullptr) { - ROCKS_LOG_BUFFER(log_buffer, + if ((c = PickDeleteTriggeredCompaction()) != nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: delete triggered compaction\n", - cf_name.c_str()); + cf_name_.c_str()); } } if (c == nullptr) { - TEST_SYNC_POINT_CALLBACK("UniversalCompactionPicker::PickCompaction:Return", - nullptr); + TEST_SYNC_POINT_CALLBACK( + "UniversalCompactionBuilder::PickCompaction:Return", nullptr); return nullptr; } - if (mutable_cf_options.compaction_options_universal.allow_trivial_move == - true) { + if (mutable_cf_options_.compaction_options_universal.allow_trivial_move == + true && + c->compaction_reason() != CompactionReason::kPeriodicCompaction) { c->set_is_trivial_move(IsInputFilesNonOverlapping(c)); } @@ -398,15 +513,15 @@ Compaction* UniversalCompactionPicker::PickCompaction( RecordInHistogram(ioptions_.statistics, NUM_FILES_IN_SINGLE_COMPACTION, c->inputs(0)->size()); - RegisterCompaction(c); - vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options); + picker_->RegisterCompaction(c); + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); - TEST_SYNC_POINT_CALLBACK("UniversalCompactionPicker::PickCompaction:Return", + TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return", c); return c; } -uint32_t UniversalCompactionPicker::GetPathId( +uint32_t UniversalCompactionBuilder::GetPathId( const ImmutableCFOptions& ioptions, const MutableCFOptions& mutable_cf_options, uint64_t file_size) { // Two conditions need to be satisfied: @@ -444,15 +559,12 @@ uint32_t UniversalCompactionPicker::GetPathId( // Consider compaction 
files based on their size differences with // the next file in time order. // -Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, double score, unsigned int ratio, - unsigned int max_number_of_files_to_compact, - const std::vector& sorted_runs, LogBuffer* log_buffer) { +Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns( + unsigned int ratio, unsigned int max_number_of_files_to_compact) { unsigned int min_merge_width = - mutable_cf_options.compaction_options_universal.min_merge_width; + mutable_cf_options_.compaction_options_universal.min_merge_width; unsigned int max_merge_width = - mutable_cf_options.compaction_options_universal.max_merge_width; + mutable_cf_options_.compaction_options_universal.max_merge_width; const SortedRun* sr = nullptr; bool done = false; @@ -466,16 +578,16 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( // Caller checks the size before executing this function. This invariant is // important because otherwise we may have a possible integer underflow when // dealing with unsigned types. - assert(sorted_runs.size() > 0); + assert(sorted_runs_.size() > 0); // Considers a candidate file only if it is smaller than the // total size accumulated so far. - for (size_t loop = 0; loop < sorted_runs.size(); loop++) { + for (size_t loop = 0; loop < sorted_runs_.size(); loop++) { candidate_count = 0; // Skip files that are already being compacted - for (sr = nullptr; loop < sorted_runs.size(); loop++) { - sr = &sorted_runs[loop]; + for (sr = nullptr; loop < sorted_runs_.size(); loop++) { + sr = &sorted_runs_[loop]; if (!sr->being_compacted) { candidate_count = 1; @@ -483,10 +595,10 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( } char file_num_buf[kFormatFileNumberBufSize]; sr->Dump(file_num_buf, sizeof(file_num_buf)); - ROCKS_LOG_BUFFER(log_buffer, + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s" "[%d] being compacted, skipping", - cf_name.c_str(), file_num_buf, loop); + cf_name_.c_str(), file_num_buf, loop); sr = nullptr; } @@ -497,15 +609,16 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( if (sr != nullptr) { char file_num_buf[kFormatFileNumberBufSize]; sr->Dump(file_num_buf, sizeof(file_num_buf), true); - ROCKS_LOG_BUFFER(log_buffer, "[%s] Universal: Possible candidate %s[%d].", - cf_name.c_str(), file_num_buf, loop); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: Possible candidate %s[%d].", + cf_name_.c_str(), file_num_buf, loop); } // Check if the succeeding files need compaction. for (size_t i = loop + 1; - candidate_count < max_files_to_compact && i < sorted_runs.size(); + candidate_count < max_files_to_compact && i < sorted_runs_.size(); i++) { - const SortedRun* succeeding_sr = &sorted_runs[i]; + const SortedRun* succeeding_sr = &sorted_runs_[i]; if (succeeding_sr->being_compacted) { break; } @@ -519,7 +632,7 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( if (sz < static_cast(succeeding_sr->size)) { break; } - if (mutable_cf_options.compaction_options_universal.stop_style == + if (mutable_cf_options_.compaction_options_universal.stop_style == kCompactionStopStyleSimilarSize) { // Similar-size stopping rule: also check the last picked file isn't // far larger than the next candidate file. 
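(Editor's note: the hunks above only re-home PickCompactionToReduceSortedRuns onto the builder's member fields; the size-ratio grouping rule itself is unchanged. The standalone sketch below is not part of the patch — the helper name CountRunsToCompact and the run sizes are made up — but it illustrates the rule the surrounding comments describe: starting from a candidate sorted run, keep absorbing the next run while the accumulated size, inflated by size_ratio percent, still covers it. The real loop additionally honors min/max_merge_width, the similar-size stop style, and skips runs that are being compacted.)

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Starting at run `start` (runs ordered newest to oldest), keep absorbing the
// next run while the accumulated candidate size, inflated by `ratio` percent,
// is still at least the next run's size. Returns how many runs get grouped.
std::size_t CountRunsToCompact(const std::vector<std::uint64_t>& run_sizes,
                               std::size_t start, unsigned int ratio) {
  std::uint64_t candidate_size = run_sizes[start];
  std::size_t count = 1;
  for (std::size_t i = start + 1; i < run_sizes.size(); i++) {
    std::uint64_t sz = (candidate_size * (100 + ratio)) / 100;
    if (sz < run_sizes[i]) {
      break;  // the next run is too large relative to what we accumulated
    }
    candidate_size += run_sizes[i];
    count++;
  }
  return count;
}

int main() {
  // Newest run first: 1, 1, 1, 8 (arbitrary units). With ratio = 1 the three
  // small runs are grouped; the 8-unit run is left out since 3 * 1.01 < 8.
  std::vector<std::uint64_t> runs = {1, 1, 1, 8};
  std::cout << CountRunsToCompact(runs, 0, /*ratio=*/1) << "\n";  // prints 3
  return 0;
}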
@@ -545,12 +658,12 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( break; } else { for (size_t i = loop; - i < loop + candidate_count && i < sorted_runs.size(); i++) { - const SortedRun* skipping_sr = &sorted_runs[i]; + i < loop + candidate_count && i < sorted_runs_.size(); i++) { + const SortedRun* skipping_sr = &sorted_runs_[i]; char file_num_buf[256]; skipping_sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop); - ROCKS_LOG_BUFFER(log_buffer, "[%s] Universal: Skipping %s", - cf_name.c_str(), file_num_buf); + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Skipping %s", + cf_name_.c_str(), file_num_buf); } } } @@ -562,16 +675,16 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( // size ratio of compression. bool enable_compression = true; int ratio_to_compress = - mutable_cf_options.compaction_options_universal.compression_size_percent; + mutable_cf_options_.compaction_options_universal.compression_size_percent; if (ratio_to_compress >= 0) { uint64_t total_size = 0; - for (auto& sorted_run : sorted_runs) { + for (auto& sorted_run : sorted_runs_) { total_size += sorted_run.compensated_file_size; } uint64_t older_file_size = 0; - for (size_t i = sorted_runs.size() - 1; i >= first_index_after; i--) { - older_file_size += sorted_runs[i].size; + for (size_t i = sorted_runs_.size() - 1; i >= first_index_after; i--) { + older_file_size += sorted_runs_[i].size; if (older_file_size * 100L >= total_size * (long)ratio_to_compress) { enable_compression = false; break; @@ -581,46 +694,46 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( uint64_t estimated_total_size = 0; for (unsigned int i = 0; i < first_index_after; i++) { - estimated_total_size += sorted_runs[i].size; + estimated_total_size += sorted_runs_[i].size; } uint32_t path_id = - GetPathId(ioptions_, mutable_cf_options, estimated_total_size); - int start_level = sorted_runs[start_index].level; + GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); + int start_level = sorted_runs_[start_index].level; int output_level; - if (first_index_after == sorted_runs.size()) { - output_level = vstorage->num_levels() - 1; - } else if (sorted_runs[first_index_after].level == 0) { + if (first_index_after == sorted_runs_.size()) { + output_level = vstorage_->num_levels() - 1; + } else if (sorted_runs_[first_index_after].level == 0) { output_level = 0; } else { - output_level = sorted_runs[first_index_after].level - 1; + output_level = sorted_runs_[first_index_after].level - 1; } // last level is reserved for the files ingested behind if (ioptions_.allow_ingest_behind && - (output_level == vstorage->num_levels() - 1)) { + (output_level == vstorage_->num_levels() - 1)) { assert(output_level > 1); output_level--; } - std::vector inputs(vstorage->num_levels()); + std::vector inputs(vstorage_->num_levels()); for (size_t i = 0; i < inputs.size(); ++i) { inputs[i].level = start_level + static_cast(i); } for (size_t i = start_index; i < first_index_after; i++) { - auto& picking_sr = sorted_runs[i]; + auto& picking_sr = sorted_runs_[i]; if (picking_sr.level == 0) { FileMetaData* picking_file = picking_sr.file; inputs[0].files.push_back(picking_file); } else { auto& files = inputs[picking_sr.level - start_level].files; - for (auto* f : vstorage->LevelFiles(picking_sr.level)) { + for (auto* f : vstorage_->LevelFiles(picking_sr.level)) { files.push_back(f); } } char file_num_buf[256]; picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), i); - 
ROCKS_LOG_BUFFER(log_buffer, "[%s] Universal: Picking %s", cf_name.c_str(), - file_num_buf); + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Picking %s", + cf_name_.c_str(), file_num_buf); } CompactionReason compaction_reason; @@ -630,16 +743,17 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( compaction_reason = CompactionReason::kUniversalSortedRunNum; } return new Compaction( - vstorage, ioptions_, mutable_cf_options, std::move(inputs), output_level, - MaxFileSizeForLevel(mutable_cf_options, output_level, + vstorage_, ioptions_, mutable_cf_options_, std::move(inputs), + output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, kCompactionStyleUniversal), LLONG_MAX, path_id, - GetCompressionType(ioptions_, vstorage, mutable_cf_options, start_level, + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level, 1, enable_compression), - GetCompressionOptions(ioptions_, vstorage, start_level, + GetCompressionOptions(ioptions_, vstorage_, start_level, enable_compression), /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, - score, false /* deletion_compaction */, compaction_reason); + score_, false /* deletion_compaction */, compaction_reason); } // Look at overall size amplification. If size amplification @@ -648,12 +762,9 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( // base file (overrides configured values of file-size ratios, // min_merge_width and max_merge_width). // -Compaction* UniversalCompactionPicker::PickCompactionToReduceSizeAmp( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, double score, - const std::vector& sorted_runs, LogBuffer* log_buffer) { +Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() { // percentage flexibility while reducing size amplification - uint64_t ratio = mutable_cf_options.compaction_options_universal + uint64_t ratio = mutable_cf_options_.compaction_options_universal .max_size_amplification_percent; unsigned int candidate_count = 0; @@ -661,21 +772,23 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSizeAmp( size_t start_index = 0; const SortedRun* sr = nullptr; - if (sorted_runs.back().being_compacted) { + assert(!sorted_runs_.empty()); + if (sorted_runs_.back().being_compacted) { return nullptr; } // Skip files that are already being compacted - for (size_t loop = 0; loop < sorted_runs.size() - 1; loop++) { - sr = &sorted_runs[loop]; + for (size_t loop = 0; loop < sorted_runs_.size() - 1; loop++) { + sr = &sorted_runs_[loop]; if (!sr->being_compacted) { start_index = loop; // Consider this as the first candidate. 
break; } char file_num_buf[kFormatFileNumberBufSize]; sr->Dump(file_num_buf, sizeof(file_num_buf), true); - ROCKS_LOG_BUFFER(log_buffer, "[%s] Universal: skipping %s[%d] compacted %s", - cf_name.c_str(), file_num_buf, loop, + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: skipping %s[%d] compacted %s", + cf_name_.c_str(), file_num_buf, loop, " cannot be a candidate to reduce size amp.\n"); sr = nullptr; } @@ -687,20 +800,20 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSizeAmp( char file_num_buf[kFormatFileNumberBufSize]; sr->Dump(file_num_buf, sizeof(file_num_buf), true); ROCKS_LOG_BUFFER( - log_buffer, + log_buffer_, "[%s] Universal: First candidate %s[%" ROCKSDB_PRIszt "] %s", - cf_name.c_str(), file_num_buf, start_index, " to reduce size amp.\n"); + cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n"); } // keep adding up all the remaining files - for (size_t loop = start_index; loop < sorted_runs.size() - 1; loop++) { - sr = &sorted_runs[loop]; + for (size_t loop = start_index; loop < sorted_runs_.size() - 1; loop++) { + sr = &sorted_runs_[loop]; if (sr->being_compacted) { char file_num_buf[kFormatFileNumberBufSize]; sr->Dump(file_num_buf, sizeof(file_num_buf), true); ROCKS_LOG_BUFFER( - log_buffer, "[%s] Universal: Possible candidate %s[%d] %s", - cf_name.c_str(), file_num_buf, start_index, + log_buffer_, "[%s] Universal: Possible candidate %s[%d] %s", + cf_name_.c_str(), file_num_buf, start_index, " is already being compacted. No size amp reduction possible.\n"); return nullptr; } @@ -712,88 +825,35 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSizeAmp( } // size of earliest file - uint64_t earliest_file_size = sorted_runs.back().size; + uint64_t earliest_file_size = sorted_runs_.back().size; // size amplification = percentage of additional size if (candidate_size * 100 < ratio * earliest_file_size) { ROCKS_LOG_BUFFER( - log_buffer, + log_buffer_, "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64 " earliest-file-size %" PRIu64, - cf_name.c_str(), candidate_size, earliest_file_size); + cf_name_.c_str(), candidate_size, earliest_file_size); return nullptr; } else { ROCKS_LOG_BUFFER( - log_buffer, + log_buffer_, "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64 " earliest-file-size %" PRIu64, - cf_name.c_str(), candidate_size, earliest_file_size); - } - assert(start_index < sorted_runs.size() - 1); - - // Estimate total file size - uint64_t estimated_total_size = 0; - for (size_t loop = start_index; loop < sorted_runs.size(); loop++) { - estimated_total_size += sorted_runs[loop].size; - } - uint32_t path_id = - GetPathId(ioptions_, mutable_cf_options, estimated_total_size); - int start_level = sorted_runs[start_index].level; - - std::vector inputs(vstorage->num_levels()); - for (size_t i = 0; i < inputs.size(); ++i) { - inputs[i].level = start_level + static_cast(i); + cf_name_.c_str(), candidate_size, earliest_file_size); } - // We always compact all the files, so always compress. 
- for (size_t loop = start_index; loop < sorted_runs.size(); loop++) { - auto& picking_sr = sorted_runs[loop]; - if (picking_sr.level == 0) { - FileMetaData* f = picking_sr.file; - inputs[0].files.push_back(f); - } else { - auto& files = inputs[picking_sr.level - start_level].files; - for (auto* f : vstorage->LevelFiles(picking_sr.level)) { - files.push_back(f); - } - } - char file_num_buf[256]; - picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop); - ROCKS_LOG_BUFFER(log_buffer, "[%s] Universal: size amp picking %s", - cf_name.c_str(), file_num_buf); - } - - // output files at the bottom most level, unless it's reserved - int output_level = vstorage->num_levels() - 1; - // last level is reserved for the files ingested behind - if (ioptions_.allow_ingest_behind) { - assert(output_level > 1); - output_level--; - } - - return new Compaction( - vstorage, ioptions_, mutable_cf_options, std::move(inputs), output_level, - MaxFileSizeForLevel(mutable_cf_options, output_level, - kCompactionStyleUniversal), - /* max_grandparent_overlap_bytes */ LLONG_MAX, path_id, - GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level, - 1), - GetCompressionOptions(ioptions_, vstorage, output_level), - /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, - score, false /* deletion_compaction */, - CompactionReason::kUniversalSizeAmplification); + return PickCompactionToOldest(start_index, + CompactionReason::kUniversalSizeAmplification); } // Pick files marked for compaction. Typically, files are marked by // CompactOnDeleteCollector due to the presence of tombstones. -Compaction* UniversalCompactionPicker::PickDeleteTriggeredCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, double score, - const std::vector& /*sorted_runs*/, LogBuffer* /*log_buffer*/) { +Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() { CompactionInputFiles start_level_inputs; int output_level; std::vector inputs; - if (vstorage->num_levels() == 1) { + if (vstorage_->num_levels() == 1) { // This is single level universal. Since we're basically trying to reclaim // space by processing files marked for compaction due to high tombstone // density, let's do the same thing as compaction to reduce size amp which @@ -803,7 +863,7 @@ Compaction* UniversalCompactionPicker::PickDeleteTriggeredCompaction( start_level_inputs.level = 0; start_level_inputs.files.clear(); output_level = 0; - for (FileMetaData* f : vstorage->LevelFiles(0)) { + for (FileMetaData* f : vstorage_->LevelFiles(0)) { if (f->marked_for_compaction) { compact = true; } @@ -822,24 +882,24 @@ Compaction* UniversalCompactionPicker::PickDeleteTriggeredCompaction( // For multi-level universal, the strategy is to make this look more like // leveled. We pick one of the files marked for compaction and compact with // overlapping files in the adjacent level. 
- PickFilesMarkedForCompaction(cf_name, vstorage, &start_level, &output_level, - &start_level_inputs); + picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level, + &output_level, &start_level_inputs); if (start_level_inputs.empty()) { return nullptr; } // Pick the first non-empty level after the start_level - for (output_level = start_level + 1; output_level < vstorage->num_levels(); + for (output_level = start_level + 1; output_level < vstorage_->num_levels(); output_level++) { - if (vstorage->NumLevelFiles(output_level) != 0) { + if (vstorage_->NumLevelFiles(output_level) != 0) { break; } } // If all higher levels are empty, pick the highest level as output level - if (output_level == vstorage->num_levels()) { + if (output_level == vstorage_->num_levels()) { if (start_level == 0) { - output_level = vstorage->num_levels() - 1; + output_level = vstorage_->num_levels() - 1; } else { // If start level is non-zero and all higher levels are empty, this // compaction will translate into a trivial move. Since the idea is @@ -849,15 +909,15 @@ Compaction* UniversalCompactionPicker::PickDeleteTriggeredCompaction( } } if (ioptions_.allow_ingest_behind && - output_level == vstorage->num_levels() - 1) { + output_level == vstorage_->num_levels() - 1) { assert(output_level > 1); output_level--; } if (output_level != 0) { if (start_level == 0) { - if (!GetOverlappingL0Files(vstorage, &start_level_inputs, output_level, - nullptr)) { + if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs, + output_level, nullptr)) { return nullptr; } } @@ -866,16 +926,16 @@ Compaction* UniversalCompactionPicker::PickDeleteTriggeredCompaction( int parent_index = -1; output_level_inputs.level = output_level; - if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, - &start_level_inputs, &output_level_inputs, - &parent_index, -1)) { + if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_, + &start_level_inputs, &output_level_inputs, + &parent_index, -1)) { return nullptr; } inputs.push_back(start_level_inputs); if (!output_level_inputs.empty()) { inputs.push_back(output_level_inputs); } - if (FilesRangeOverlapWithCompaction(inputs, output_level)) { + if (picker_->FilesRangeOverlapWithCompaction(inputs, output_level)) { return nullptr; } } else { @@ -885,23 +945,160 @@ Compaction* UniversalCompactionPicker::PickDeleteTriggeredCompaction( uint64_t estimated_total_size = 0; // Use size of the output level as estimated file size - for (FileMetaData* f : vstorage->LevelFiles(output_level)) { + for (FileMetaData* f : vstorage_->LevelFiles(output_level)) { estimated_total_size += f->fd.GetFileSize(); } uint32_t path_id = - GetPathId(ioptions_, mutable_cf_options, estimated_total_size); + GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); return new Compaction( - vstorage, ioptions_, mutable_cf_options, std::move(inputs), output_level, - MaxFileSizeForLevel(mutable_cf_options, output_level, + vstorage_, ioptions_, mutable_cf_options_, std::move(inputs), + output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, kCompactionStyleUniversal), /* max_grandparent_overlap_bytes */ LLONG_MAX, path_id, - GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level, - 1), - GetCompressionOptions(ioptions_, vstorage, output_level), + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, + output_level, 1), + GetCompressionOptions(ioptions_, vstorage_, output_level), /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ true, - 
score, false /* deletion_compaction */, + score_, false /* deletion_compaction */, CompactionReason::kFilesMarkedForCompaction); } + +Compaction* UniversalCompactionBuilder::PickCompactionToOldest( + size_t start_index, CompactionReason compaction_reason) { + assert(start_index < sorted_runs_.size()); + + // Estimate total file size + uint64_t estimated_total_size = 0; + for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) { + estimated_total_size += sorted_runs_[loop].size; + } + uint32_t path_id = + GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); + int start_level = sorted_runs_[start_index].level; + + std::vector inputs(vstorage_->num_levels()); + for (size_t i = 0; i < inputs.size(); ++i) { + inputs[i].level = start_level + static_cast(i); + } + for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) { + auto& picking_sr = sorted_runs_[loop]; + if (picking_sr.level == 0) { + FileMetaData* f = picking_sr.file; + inputs[0].files.push_back(f); + } else { + auto& files = inputs[picking_sr.level - start_level].files; + for (auto* f : vstorage_->LevelFiles(picking_sr.level)) { + files.push_back(f); + } + } + std::string comp_reason_print_string; + if (compaction_reason == CompactionReason::kPeriodicCompaction) { + comp_reason_print_string = "periodic compaction"; + } else if (compaction_reason == + CompactionReason::kUniversalSizeAmplification) { + comp_reason_print_string = "size amp"; + } else { + assert(false); + } + + char file_num_buf[256]; + picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop); + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s picking %s", + cf_name_.c_str(), comp_reason_print_string.c_str(), + file_num_buf); + } + + // output files at the bottom most level, unless it's reserved + int output_level = vstorage_->num_levels() - 1; + // last level is reserved for the files ingested behind + if (ioptions_.allow_ingest_behind) { + assert(output_level > 1); + output_level--; + } + + // We never check size for + // compaction_options_universal.compression_size_percent, + // because we always compact all the files, so always compress. + return new Compaction( + vstorage_, ioptions_, mutable_cf_options_, std::move(inputs), + output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, + kCompactionStyleUniversal), + LLONG_MAX, path_id, + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level, + 1, true /* enable_compression */), + GetCompressionOptions(ioptions_, vstorage_, start_level, + true /* enable_compression */), + /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, + score_, false /* deletion_compaction */, compaction_reason); +} + +Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Periodic Compaction", + cf_name_.c_str()); + + // In universal compaction, sorted runs contain older data are almost always + // generated earlier too. To simplify the problem, we just try to trigger + // a full compaction. We start from the oldest sorted run and include + // all sorted runs, until we hit a sorted already being compacted. + // Since usually the largest (which is usually the oldest) sorted run is + // included anyway, doing a full compaction won't increase write + // amplification much. + + // Get some information from marked files to check whether a file is + // included in the compaction. 
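For the periodic-compaction hunk that follows, the start index is found by scanning from the oldest sorted run toward newer ones for as long as runs are free. A self-contained sketch of that scan (the helper name is invented for the example and is not part of the patch):

#include <cstddef>
#include <vector>

// Sorted runs are ordered newest (index 0) to oldest (back). Walk backwards
// from the oldest run and keep extending the pick while runs are not being
// compacted. A return value equal to runs.size() means the oldest run itself
// is busy, which the picker below treats as "nothing to do".
size_t OldestFreeRunIndex(const std::vector<bool>& being_compacted) {
  size_t start_index = being_compacted.size();
  while (start_index > 0 && !being_compacted[start_index - 1]) {
    --start_index;
  }
  return start_index;
}
// e.g. {false, false, true, false, false, false} -> 3: runs 3..5, which
// include the oldest run, are handed to PickCompactionToOldest().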
+ + size_t start_index = sorted_runs_.size(); + while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) { + start_index--; + } + if (start_index == sorted_runs_.size()) { + return nullptr; + } + + // There is a rare corner case where we can't pick up all the files + // because some files are being compacted and we end up with picking files + // but none of them need periodic compaction. Unless we simply recompact + // the last sorted run (either the last level or last L0 file), we would just + // execute the compaction, in order to simplify the logic. + if (start_index == sorted_runs_.size() - 1) { + bool included_file_marked = false; + int start_level = sorted_runs_[start_index].level; + FileMetaData* start_file = sorted_runs_[start_index].file; + for (const std::pair& level_file_pair : + vstorage_->FilesMarkedForPeriodicCompaction()) { + if (start_level != 0) { + // Last sorted run is a level + if (start_level == level_file_pair.first) { + included_file_marked = true; + break; + } + } else { + // Last sorted run is a L0 file. + if (start_file == level_file_pair.second) { + included_file_marked = true; + break; + } + } + } + if (!included_file_marked) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: Cannot form a compaction covering file " + "marked for periodic compaction", + cf_name_.c_str()); + return nullptr; + } + } + + Compaction* c = PickCompactionToOldest(start_index, + CompactionReason::kPeriodicCompaction); + + TEST_SYNC_POINT_CALLBACK( + "UniversalCompactionPicker::PickPeriodicCompaction:Return", c); + + return c; +} } // namespace rocksdb #endif // !ROCKSDB_LITE diff --git a/db/compaction/compaction_picker_universal.h b/db/compaction/compaction_picker_universal.h new file mode 100644 index 00000000000..150b6bd79c1 --- /dev/null +++ b/db/compaction/compaction_picker_universal.h @@ -0,0 +1,31 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#ifndef ROCKSDB_LITE + +#include "db/compaction/compaction_picker.h" + +namespace rocksdb { +class UniversalCompactionPicker : public CompactionPicker { + public: + UniversalCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + virtual int MaxOutputLevel() const override { return NumberLevels() - 1; } + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; +}; +} // namespace rocksdb +#endif // !ROCKSDB_LITE diff --git a/db/compaction_picker_universal.h b/db/compaction_picker_universal.h deleted file mode 100644 index 375e5998e25..00000000000 --- a/db/compaction_picker_universal.h +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
-// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#pragma once -#ifndef ROCKSDB_LITE - -#include "db/compaction_picker.h" - -namespace rocksdb { -class UniversalCompactionPicker : public CompactionPicker { - public: - UniversalCompactionPicker(const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icmp) - : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, - LogBuffer* log_buffer) override; - - virtual int MaxOutputLevel() const override { return NumberLevels() - 1; } - - virtual bool NeedsCompaction( - const VersionStorageInfo* vstorage) const override; - - private: - struct SortedRun { - SortedRun(int _level, FileMetaData* _file, uint64_t _size, - uint64_t _compensated_file_size, bool _being_compacted) - : level(_level), - file(_file), - size(_size), - compensated_file_size(_compensated_file_size), - being_compacted(_being_compacted) { - assert(compensated_file_size > 0); - assert(level != 0 || file != nullptr); - } - - void Dump(char* out_buf, size_t out_buf_size, - bool print_path = false) const; - - // sorted_run_count is added into the string to print - void DumpSizeInfo(char* out_buf, size_t out_buf_size, - size_t sorted_run_count) const; - - int level; - // `file` Will be null for level > 0. For level = 0, the sorted run is - // for this file. - FileMetaData* file; - // For level > 0, `size` and `compensated_file_size` are sum of sizes all - // files in the level. `being_compacted` should be the same for all files - // in a non-zero level. Use the value here. - uint64_t size; - uint64_t compensated_file_size; - bool being_compacted; - }; - - // Pick Universal compaction to limit read amplification - Compaction* PickCompactionToReduceSortedRuns( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, double score, unsigned int ratio, - unsigned int num_files, const std::vector& sorted_runs, - LogBuffer* log_buffer); - - // Pick Universal compaction to limit space amplification. - Compaction* PickCompactionToReduceSizeAmp( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, double score, - const std::vector& sorted_runs, LogBuffer* log_buffer); - - Compaction* PickDeleteTriggeredCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, double score, - const std::vector& sorted_runs, LogBuffer* log_buffer); - - // Used in universal compaction when the enabled_trivial_move - // option is set. Checks whether there are any overlapping files - // in the input. Returns true if the input files are non - // overlapping. - bool IsInputFilesNonOverlapping(Compaction* c); - - static std::vector CalculateSortedRuns( - const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options); - - // Pick a path ID to place a newly generated file, with its estimated file - // size. 
- static uint32_t GetPathId(const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options, - uint64_t file_size); -}; -} // namespace rocksdb -#endif // !ROCKSDB_LITE diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index a7ff587949d..de55c706ab7 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -9,11 +9,11 @@ #include "memtable/stl_wrappers.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/hash.h" #include "util/kv_map.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" using std::unique_ptr; diff --git a/db/convenience.cc b/db/convenience.cc index 71c237f60c0..320d5c6e117 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -8,7 +8,7 @@ #include "rocksdb/convenience.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "util/cast_util.h" namespace rocksdb { @@ -35,6 +35,12 @@ Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, const std::string& file_path) { + return VerifySstFileChecksum(options, env_options, ReadOptions(), file_path); +} +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const ReadOptions& read_options, + const std::string& file_path) { std::unique_ptr file; uint64_t file_size; InternalKeyComparator internal_comparator(options.comparator); @@ -59,7 +65,8 @@ Status VerifySstFileChecksum(const Options& options, if (!s.ok()) { return s; } - s = table_reader->VerifyChecksum(); + s = table_reader->VerifyChecksum(read_options, + TableReaderCaller::kUserVerifyChecksum); return s; } diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 1ccb1aa2b09..4add9b26a2f 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -13,23 +13,24 @@ #include #include -#include #include #include -#include "db/db_impl.h" +#include +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" #include "db/log_format.h" #include "db/version_set.h" +#include "file/filename.h" #include "rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" -#include "table/block_based_table_builder.h" +#include "table/block_based/block_based_table_builder.h" #include "table/meta_blocks.h" -#include "util/filename.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -76,7 +77,11 @@ class CorruptionTest : public testing::Test { delete db_; db_ = nullptr; Options opt = (options ? *options : options_); - opt.env = &env_; + if (opt.env == Options().env) { + // If env is not overridden, replace it with ErrorEnv. + // Otherwise, the test already uses a non-default Env. + opt.env = &env_; + } opt.arena_block_size = 4096; BlockBasedTableOptions table_options; table_options.block_cache = tiny_cache_; @@ -321,6 +326,59 @@ TEST_F(CorruptionTest, TableFile) { ASSERT_NOK(dbi->VerifyChecksum()); } +TEST_F(CorruptionTest, VerifyChecksumReadahead) { + Options options; + SpecialEnv senv(Env::Default()); + options.env = &senv; + // Disable block cache as we are going to check checksum for + // the same file twice and measure number of reads. 
+ BlockBasedTableOptions table_options_no_bc; + table_options_no_bc.no_block_cache = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options_no_bc)); + + Reopen(&options); + + Build(10000); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + dbi->TEST_CompactRange(0, nullptr, nullptr); + dbi->TEST_CompactRange(1, nullptr, nullptr); + + senv.count_random_reads_ = true; + senv.random_read_counter_.Reset(); + ASSERT_OK(dbi->VerifyChecksum()); + + // Make sure the counter is enabled. + ASSERT_GT(senv.random_read_counter_.Read(), 0); + + // The SST file is about 10MB. Default readahead size is 256KB. + // Give a conservative 20 reads for metadata blocks, The number + // of random reads should be within 10 MB / 256KB + 20 = 60. + ASSERT_LT(senv.random_read_counter_.Read(), 60); + + senv.random_read_bytes_counter_ = 0; + ReadOptions ro; + ro.readahead_size = size_t{32 * 1024}; + ASSERT_OK(dbi->VerifyChecksum(ro)); + // The SST file is about 10MB. We set readahead size to 32KB. + // Give 0 to 20 reads for metadata blocks, and allow real read + // to range from 24KB to 48KB. The lower bound would be: + // 10MB / 48KB + 0 = 213 + // The higher bound is + // 10MB / 24KB + 20 = 447. + ASSERT_GE(senv.random_read_counter_.Read(), 213); + ASSERT_LE(senv.random_read_counter_.Read(), 447); + + // Test readahead shouldn't break mmap mode (where it should be + // disabled). + options.allow_mmap_reads = true; + Reopen(&options); + dbi = static_cast(db_); + ASSERT_OK(dbi->VerifyChecksum(ro)); + + CloseDb(); +} + TEST_F(CorruptionTest, TableFileIndexData) { Options options; // very big, we'll trigger flushes manually diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index 2d4487ff454..e964377cf9c 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -5,15 +5,15 @@ #ifndef ROCKSDB_LITE -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "table/cuckoo_table_factory.h" -#include "table/cuckoo_table_reader.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/cuckoo/cuckoo_table_reader.h" #include "table/meta_blocks.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -285,6 +285,9 @@ TEST_F(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) { TEST_F(CuckooTableDBTest, AdaptiveTable) { Options options = CurrentOptions(); + // Ensure options compatible with PlainTable + options.prefix_extractor.reset(NewCappedPrefixTransform(8)); + // Write some keys using cuckoo table. 
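Two illustrative notes on the checksum-verification changes above; neither is part of the patch. First, a hypothetical caller of the ReadOptions-taking VerifySstFileChecksum overload added to db/convenience.cc (the wrapper name and the 32 KB value are invented for the example, and it assumes the matching declaration in rocksdb/convenience.h that accompanies this change):

#include <string>

#include "rocksdb/convenience.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"

// Verify one SST file's checksums while limiting verification readahead,
// using the new overload that accepts ReadOptions.
rocksdb::Status VerifyWithReadahead(const std::string& sst_path) {
  rocksdb::Options options;
  rocksdb::EnvOptions env_options;
  rocksdb::ReadOptions read_options;
  read_options.readahead_size = 32 * 1024;  // 32 KB reads during the scan
  return rocksdb::VerifySstFileChecksum(options, env_options, read_options,
                                        sst_path);
}

Second, the bounds asserted in VerifyChecksumReadahead are plain division over the roughly 10 MB file: 10240 KB / 256 KB = 40 data reads plus a 20-read metadata allowance gives the "< 60" bound for the default readahead, while with readahead_size = 32 KB and effective reads assumed to fall between 24 KB and 48 KB, 10240 / 48 ≈ 213 and 10240 / 24 + 20 ≈ 447 give the lower and upper bounds.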
options.table_factory.reset(NewCuckooTableFactory()); Reopen(&options); diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 236a534657f..601033d6125 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -10,9 +10,12 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" #include "rocksdb/perf_context.h" -#include "util/fault_injection_test_env.h" +#include "rocksdb/utilities/debug.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "test_util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) -#include "util/sync_point.h" +#include "test_util/sync_point.h" #endif namespace rocksdb { @@ -68,6 +71,44 @@ TEST_F(DBBasicTest, ReadOnlyDB) { ASSERT_TRUE(db_->SyncWAL().IsNotSupported()); } +TEST_F(DBBasicTest, ReadOnlyDBWithWriteDBIdToManifestSet) { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + Close(); + + auto options = CurrentOptions(); + options.write_dbid_to_manifest = true; + assert(options.env == env_); + ASSERT_OK(ReadOnlyReopen(options)); + std::string db_id1; + db_->GetDbIdentity(db_id1); + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + Iterator* iter = db_->NewIterator(ReadOptions()); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + ++count; + } + ASSERT_EQ(count, 2); + delete iter; + Close(); + + // Reopen and flush memtable. + Reopen(options); + Flush(); + Close(); + // Now check keys in read only mode. + ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + ASSERT_TRUE(db_->SyncWAL().IsNotSupported()); + std::string db_id2; + db_->GetDbIdentity(db_id2); + ASSERT_EQ(db_id1, db_id2); +} + TEST_F(DBBasicTest, CompactedDB) { const uint64_t kFileSize = 1 << 20; Options options = CurrentOptions(); @@ -295,7 +336,7 @@ TEST_F(DBBasicTest, FlushMultipleMemtable) { writeOpt.disableWAL = true; options.max_write_buffer_number = 4; options.min_write_buffer_number_to_merge = 3; - options.max_write_buffer_number_to_maintain = -1; + options.max_write_buffer_size_to_maintain = -1; CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); ASSERT_OK(Flush(1)); @@ -325,7 +366,8 @@ TEST_F(DBBasicTest, FlushEmptyColumnFamily) { writeOpt.disableWAL = true; options.max_write_buffer_number = 2; options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 1; + options.max_write_buffer_size_to_maintain = + static_cast(options.write_buffer_size); CreateAndReopenWithCF({"pikachu"}, options); // Compaction can still go through even if no thread can flush the @@ -421,7 +463,7 @@ TEST_F(DBBasicTest, ManifestRollOver) { } while (ChangeCompactOptions()); } -TEST_F(DBBasicTest, IdentityAcrossRestarts) { +TEST_F(DBBasicTest, IdentityAcrossRestarts1) { do { std::string id1; ASSERT_OK(db_->GetDbIdentity(id1)); @@ -433,13 +475,40 @@ TEST_F(DBBasicTest, IdentityAcrossRestarts) { // id1 should match id2 because identity was not regenerated ASSERT_EQ(id1.compare(id2), 0); + std::string idfilename = IdentityFileName(dbname_); + ASSERT_OK(env_->DeleteFile(idfilename)); + Reopen(options); + std::string id3; + ASSERT_OK(db_->GetDbIdentity(id3)); + if (options.write_dbid_to_manifest) { + ASSERT_EQ(id1.compare(id3), 0); + } else { + // id1 should NOT match id3 because identity was regenerated + ASSERT_NE(id1.compare(id3), 0); + } + } while (ChangeCompactOptions()); +} + +TEST_F(DBBasicTest, 
IdentityAcrossRestarts2) { + do { + std::string id1; + ASSERT_OK(db_->GetDbIdentity(id1)); + + Options options = CurrentOptions(); + options.write_dbid_to_manifest = true; + Reopen(options); + std::string id2; + ASSERT_OK(db_->GetDbIdentity(id2)); + // id1 should match id2 because identity was not regenerated + ASSERT_EQ(id1.compare(id2), 0); + std::string idfilename = IdentityFileName(dbname_); ASSERT_OK(env_->DeleteFile(idfilename)); Reopen(options); std::string id3; ASSERT_OK(db_->GetDbIdentity(id3)); // id1 should NOT match id3 because identity was regenerated - ASSERT_NE(id1.compare(id3), 0); + ASSERT_EQ(id1, id3); } while (ChangeCompactOptions()); } @@ -803,11 +872,12 @@ TEST_F(DBBasicTest, ChecksumTest) { ASSERT_OK(Flush()); } - // verify data with each type of checksum - for (int i = 0; i <= kxxHash64; ++i) { + // with each valid checksum type setting... + for (int i = 0; i <= max_checksum; ++i) { table_options.checksum = static_cast(i); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); + // verify every type of checksum (should be regardless of that setting) for (int j = 0; j < (max_checksum + 1) * kNumPerFile; ++j) { ASSERT_EQ(Key(j), Get(Key(j))); } @@ -845,30 +915,30 @@ TEST_F(DBBasicTest, MmapAndBufferOptions) { class TestEnv : public EnvWrapper { public: - explicit TestEnv() : EnvWrapper(Env::Default()), - close_count(0) { } - - class TestLogger : public Logger { - public: - using Logger::Logv; - TestLogger(TestEnv *env_ptr) : Logger() { env = env_ptr; } - ~TestLogger() override { - if (!closed_) { - CloseHelper(); - } - } - void Logv(const char* /*format*/, va_list /*ap*/) override{}; - - protected: - Status CloseImpl() override { return CloseHelper(); } - - private: - Status CloseHelper() { - env->CloseCountInc();; - return Status::IOError(); - } - TestEnv *env; - }; + explicit TestEnv(Env* base_env) : EnvWrapper(base_env), close_count(0) {} + + class TestLogger : public Logger { + public: + using Logger::Logv; + explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; } + ~TestLogger() override { + if (!closed_) { + CloseHelper(); + } + } + void Logv(const char* /*format*/, va_list /*ap*/) override {} + + protected: + Status CloseImpl() override { return CloseHelper(); } + + private: + Status CloseHelper() { + env->CloseCountInc(); + ; + return Status::IOError(); + } + TestEnv* env; + }; void CloseCountInc() { close_count++; } @@ -890,7 +960,8 @@ TEST_F(DBBasicTest, DBClose) { ASSERT_OK(DestroyDB(dbname, options)); DB* db = nullptr; - TestEnv* env = new TestEnv(); + TestEnv* env = new TestEnv(env_); + std::unique_ptr local_env_guard(env); options.create_if_missing = true; options.env = env; Status s = DB::Open(options, dbname, &db); @@ -924,13 +995,11 @@ TEST_F(DBBasicTest, DBClose) { ASSERT_EQ(env->GetCloseCount(), 2); options.info_log.reset(); ASSERT_EQ(env->GetCloseCount(), 3); - - delete options.env; } TEST_F(DBBasicTest, DBCloseFlushError) { std::unique_ptr fault_injection_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options options = GetDefaultOptions(); options.create_if_missing = true; options.manual_wal_flush = true; @@ -950,15 +1019,27 @@ TEST_F(DBBasicTest, DBCloseFlushError) { Destroy(options); } -TEST_F(DBBasicTest, MultiGetMultiCF) { +class DBMultiGetTestWithParam : public DBBasicTest, + public testing::WithParamInterface {}; + +TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu", "ilya", "muromec", 
"dobrynia", "nikitich", "alyosha", "popovich"}, options); - - for (int i = 0; i < 8; ++i) { - ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", - "cf" + std::to_string(i) + "_val")); + // tuples + std::vector> cf_kv_vec; + static const int num_keys = 24; + cf_kv_vec.reserve(num_keys); + + for (int i = 0; i < num_keys; ++i) { + int cf = i / 3; + int cf_key = 1 % 3; + cf_kv_vec.emplace_back(std::make_tuple( + cf, "cf" + std::to_string(cf) + "_key_" + std::to_string(cf_key), + "cf" + std::to_string(cf) + "_val_" + std::to_string(cf_key))); + ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]), + std::get<2>(cf_kv_vec[i]))); } int get_sv_count = 0; @@ -968,10 +1049,14 @@ TEST_F(DBBasicTest, MultiGetMultiCF) { if (++get_sv_count == 2) { // After MultiGet refs a couple of CFs, flush all CFs so MultiGet // is forced to repeat the process - for (int i = 0; i < 8; ++i) { - ASSERT_OK(Flush(i)); - ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", - "cf" + std::to_string(i) + "_val2")); + for (int i = 0; i < num_keys; ++i) { + int cf = i / 3; + int cf_key = i % 8; + if (cf_key == 0) { + ASSERT_OK(Flush(cf)); + } + ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]), + std::get<2>(cf_kv_vec[i]) + "_2")); } } if (get_sv_count == 11) { @@ -989,26 +1074,53 @@ TEST_F(DBBasicTest, MultiGetMultiCF) { std::vector keys; std::vector values; - for (int i = 0; i < 8; ++i) { - cfs.push_back(i); - keys.push_back("cf" + std::to_string(i) + "_key"); + for (int i = 0; i < num_keys; ++i) { + cfs.push_back(std::get<0>(cf_kv_vec[i])); + keys.push_back(std::get<1>(cf_kv_vec[i])); } - values = MultiGet(cfs, keys, nullptr); - ASSERT_EQ(values.size(), 8); + values = MultiGet(cfs, keys, nullptr, GetParam()); + ASSERT_EQ(values.size(), num_keys); for (unsigned int j = 0; j < values.size(); ++j) { - ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val2"); + ASSERT_EQ(values[j], std::get<2>(cf_kv_vec[j]) + "_2"); } - for (int i = 0; i < 8; ++i) { + + keys.clear(); + cfs.clear(); + cfs.push_back(std::get<0>(cf_kv_vec[0])); + keys.push_back(std::get<1>(cf_kv_vec[0])); + cfs.push_back(std::get<0>(cf_kv_vec[3])); + keys.push_back(std::get<1>(cf_kv_vec[3])); + cfs.push_back(std::get<0>(cf_kv_vec[4])); + keys.push_back(std::get<1>(cf_kv_vec[4])); + values = MultiGet(cfs, keys, nullptr, GetParam()); + ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[0]) + "_2"); + ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[3]) + "_2"); + ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[4]) + "_2"); + + keys.clear(); + cfs.clear(); + cfs.push_back(std::get<0>(cf_kv_vec[7])); + keys.push_back(std::get<1>(cf_kv_vec[7])); + cfs.push_back(std::get<0>(cf_kv_vec[6])); + keys.push_back(std::get<1>(cf_kv_vec[6])); + cfs.push_back(std::get<0>(cf_kv_vec[1])); + keys.push_back(std::get<1>(cf_kv_vec[1])); + values = MultiGet(cfs, keys, nullptr, GetParam()); + ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[7]) + "_2"); + ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[6]) + "_2"); + ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2"); + + for (int cf = 0; cf < 8; ++cf) { auto* cfd = reinterpret_cast( - reinterpret_cast(db_)->GetColumnFamilyHandle(i)) + reinterpret_cast(db_)->GetColumnFamilyHandle(cf)) ->cfd(); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete); } } -TEST_F(DBBasicTest, MultiGetMultiCFMutex) { +TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", 
"nikitich", "alyosha", "popovich"}, @@ -1054,7 +1166,7 @@ TEST_F(DBBasicTest, MultiGetMultiCFMutex) { keys.push_back("cf" + std::to_string(i) + "_key"); } - values = MultiGet(cfs, keys, nullptr); + values = MultiGet(cfs, keys, nullptr, GetParam()); ASSERT_TRUE(last_try); ASSERT_EQ(values.size(), 8); for (unsigned int j = 0; j < values.size(); ++j) { @@ -1069,7 +1181,7 @@ TEST_F(DBBasicTest, MultiGetMultiCFMutex) { } } -TEST_F(DBBasicTest, MultiGetMultiCFSnapshot) { +TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", "alyosha", "popovich"}, @@ -1114,7 +1226,7 @@ TEST_F(DBBasicTest, MultiGetMultiCFSnapshot) { } const Snapshot* snapshot = db_->GetSnapshot(); - values = MultiGet(cfs, keys, snapshot); + values = MultiGet(cfs, keys, snapshot, GetParam()); db_->ReleaseSnapshot(snapshot); ASSERT_EQ(values.size(), 8); for (unsigned int j = 0; j < values.size(); ++j) { @@ -1128,6 +1240,9 @@ TEST_F(DBBasicTest, MultiGetMultiCFSnapshot) { } } +INSTANTIATE_TEST_CASE_P(DBMultiGetTestWithParam, DBMultiGetTestWithParam, + testing::Bool()); + TEST_F(DBBasicTest, MultiGetBatchedSimpleUnsorted) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); @@ -1284,10 +1399,712 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { } } } + +// Test class for batched MultiGet with prefix extractor +// Param bool - If true, use partitioned filters +// If false, use full filter block +class MultiGetPrefixExtractorTest : public DBBasicTest, + public ::testing::WithParamInterface { +}; + +TEST_P(MultiGetPrefixExtractorTest, Batched) { + Options options = CurrentOptions(); + options.prefix_extractor.reset(NewFixedPrefixTransform(2)); + options.memtable_prefix_bloom_size_ratio = 10; + BlockBasedTableOptions bbto; + if (GetParam()) { + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + bbto.partition_filters = true; + } + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + bbto.cache_index_and_filter_blocks = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + SetPerfLevel(kEnableCount); + get_perf_context()->Reset(); + + // First key is not in the prefix_extractor domain + ASSERT_OK(Put("k", "v0")); + ASSERT_OK(Put("kk1", "v1")); + ASSERT_OK(Put("kk2", "v2")); + ASSERT_OK(Put("kk3", "v3")); + ASSERT_OK(Put("kk4", "v4")); + std::vector mem_keys( + {"k", "kk1", "kk2", "kk3", "kk4", "rofl", "lmho"}); + std::vector inmem_values; + inmem_values = MultiGet(mem_keys, nullptr); + ASSERT_EQ(inmem_values[0], "v0"); + ASSERT_EQ(inmem_values[1], "v1"); + ASSERT_EQ(inmem_values[2], "v2"); + ASSERT_EQ(inmem_values[3], "v3"); + ASSERT_EQ(inmem_values[4], "v4"); + ASSERT_EQ(get_perf_context()->bloom_memtable_miss_count, 2); + ASSERT_EQ(get_perf_context()->bloom_memtable_hit_count, 5); + ASSERT_OK(Flush()); + + std::vector keys({"k", "kk1", "kk2", "kk3", "kk4"}); + std::vector values; + get_perf_context()->Reset(); + values = MultiGet(keys, nullptr); + ASSERT_EQ(values[0], "v0"); + ASSERT_EQ(values[1], "v1"); + ASSERT_EQ(values[2], "v2"); + ASSERT_EQ(values[3], "v3"); + ASSERT_EQ(values[4], "v4"); + // Filter hits for 4 in-domain keys + ASSERT_EQ(get_perf_context()->bloom_sst_hit_count, 4); +} + +INSTANTIATE_TEST_CASE_P(MultiGetPrefix, MultiGetPrefixExtractorTest, + ::testing::Bool()); + +#ifndef ROCKSDB_LITE +class DBMultiGetRowCacheTest : public DBBasicTest, + public ::testing::WithParamInterface {}; + 
+TEST_P(DBMultiGetRowCacheTest, MultiGetBatched) { + do { + option_config_ = kRowCache; + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + SetPerfLevel(kEnableCount); + ASSERT_OK(Put(1, "k1", "v1")); + ASSERT_OK(Put(1, "k2", "v2")); + ASSERT_OK(Put(1, "k3", "v3")); + ASSERT_OK(Put(1, "k4", "v4")); + Flush(1); + ASSERT_OK(Put(1, "k5", "v5")); + const Snapshot* snap1 = dbfull()->GetSnapshot(); + ASSERT_OK(Delete(1, "k4")); + Flush(1); + const Snapshot* snap2 = dbfull()->GetSnapshot(); + + get_perf_context()->Reset(); + + std::vector keys({"no_key", "k5", "k4", "k3", "k1"}); + std::vector values(keys.size()); + std::vector cfs(keys.size(), handles_[1]); + std::vector s(keys.size()); + + ReadOptions ro; + bool use_snapshots = GetParam(); + if (use_snapshots) { + ro.snapshot = snap2; + } + db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(), + s.data(), false); + + ASSERT_EQ(values.size(), keys.size()); + ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v1"); + ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3"); + ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5"); + // four kv pairs * two bytes per value + ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes); + + ASSERT_TRUE(s[0].IsNotFound()); + ASSERT_OK(s[1]); + ASSERT_TRUE(s[2].IsNotFound()); + ASSERT_OK(s[3]); + ASSERT_OK(s[4]); + + // Call MultiGet() again with some intersection with the previous set of + // keys. Those should already be in the row cache. + keys.assign({"no_key", "k5", "k3", "k2"}); + for (size_t i = 0; i < keys.size(); ++i) { + values[i].Reset(); + s[i] = Status::OK(); + } + get_perf_context()->Reset(); + + if (use_snapshots) { + ro.snapshot = snap1; + } + db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(), + values.data(), s.data(), false); + + ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v2"); + ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3"); + ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5"); + // four kv pairs * two bytes per value + ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes); + + ASSERT_TRUE(s[0].IsNotFound()); + ASSERT_OK(s[1]); + ASSERT_OK(s[2]); + ASSERT_OK(s[3]); + if (use_snapshots) { + // Only reads from the first SST file would have been cached, since + // snapshot seq no is > fd.largest_seqno + ASSERT_EQ(1, TestGetTickerCount(options, ROW_CACHE_HIT)); + } else { + ASSERT_EQ(2, TestGetTickerCount(options, ROW_CACHE_HIT)); + } + + SetPerfLevel(kDisable); + dbfull()->ReleaseSnapshot(snap1); + dbfull()->ReleaseSnapshot(snap2); + } while (ChangeCompactOptions()); +} + +INSTANTIATE_TEST_CASE_P(DBMultiGetRowCacheTest, DBMultiGetRowCacheTest, + testing::Values(true, false)); + +TEST_F(DBBasicTest, GetAllKeyVersions) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.disable_auto_compactions = true; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + const size_t kNumInserts = 4; + const size_t kNumDeletes = 4; + const size_t kNumUpdates = 4; + + // Check default column family + for (size_t i = 0; i != kNumInserts; ++i) { + ASSERT_OK(Put(std::to_string(i), "value")); + } + for (size_t i = 0; i != kNumUpdates; ++i) { + ASSERT_OK(Put(std::to_string(i), "value1")); + } + for (size_t i = 0; i != kNumDeletes; ++i) { + ASSERT_OK(Delete(std::to_string(i))); + } + std::vector key_versions; + 
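A worked count for the multiget_read_bytes assertions in the row-cache test above (illustrative only): of the first batch's five keys ("no_key", "k5", "k4", "k3", "k1"), "no_key" was never written and "k4" is deleted before the second flush, so three two-byte values ("v5", "v3", "v1") come back and perf_context reports 3 * 2 = 6 bytes. The second batch ("no_key", "k5", "k3", "k2") likewise returns three two-byte values, so the same 6-byte total is asserted after the reset.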
ASSERT_OK(rocksdb::GetAllKeyVersions(db_, Slice(), Slice(), + std::numeric_limits::max(), + &key_versions)); + ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size()); + ASSERT_OK(rocksdb::GetAllKeyVersions(db_, handles_[0], Slice(), Slice(), + std::numeric_limits::max(), + &key_versions)); + ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size()); + + // Check non-default column family + for (size_t i = 0; i != kNumInserts - 1; ++i) { + ASSERT_OK(Put(1, std::to_string(i), "value")); + } + for (size_t i = 0; i != kNumUpdates - 1; ++i) { + ASSERT_OK(Put(1, std::to_string(i), "value1")); + } + for (size_t i = 0; i != kNumDeletes - 1; ++i) { + ASSERT_OK(Delete(1, std::to_string(i))); + } + ASSERT_OK(rocksdb::GetAllKeyVersions(db_, handles_[1], Slice(), Slice(), + std::numeric_limits::max(), + &key_versions)); + ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size()); +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBasicTest, MultiGetIOBufferOverrun) { + Options options = CurrentOptions(); + Random rnd(301); + BlockBasedTableOptions table_options; + table_options.pin_l0_filter_and_index_blocks_in_cache = true; + table_options.block_size = 16 * 1024; + assert(table_options.block_size > + BlockBasedTable::kMultiGetReadStackBufSize); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + Reopen(options); + + std::string zero_str(128, '\0'); + for (int i = 0; i < 100; ++i) { + // Make the value compressible. A purely random string doesn't compress + // and the resultant data block will not be compressed + std::string value(RandomString(&rnd, 128) + zero_str); + assert(Put(Key(i), value) == Status::OK()); + } + Flush(); + + std::vector key_data(10); + std::vector keys; + // We cannot resize a PinnableSlice vector, so just set initial size to + // largest we think we will need + std::vector values(10); + std::vector statuses; + ReadOptions ro; + + // Warm up the cache first + key_data.emplace_back(Key(0)); + keys.emplace_back(Slice(key_data.back())); + key_data.emplace_back(Key(50)); + keys.emplace_back(Slice(key_data.back())); + statuses.resize(keys.size()); + + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); +} + +class DBBasicTestWithParallelIO + : public DBTestBase, + public testing::WithParamInterface> { + public: + DBBasicTestWithParallelIO() : DBTestBase("/db_basic_test_with_parallel_io") { + bool compressed_cache = std::get<0>(GetParam()); + bool uncompressed_cache = std::get<1>(GetParam()); + compression_enabled_ = std::get<2>(GetParam()); + fill_cache_ = std::get<3>(GetParam()); + + if (compressed_cache) { + std::shared_ptr cache = NewLRUCache(1048576); + compressed_cache_ = std::make_shared(cache); + } + if (uncompressed_cache) { + std::shared_ptr cache = NewLRUCache(1048576); + uncompressed_cache_ = std::make_shared(cache); + } + + env_->count_random_reads_ = true; + + Options options = CurrentOptions(); + Random rnd(301); + BlockBasedTableOptions table_options; + +#ifndef ROCKSDB_LITE + if (compression_enabled_) { + std::vector compression_types; + compression_types = GetSupportedCompressions(); + // Not every platform may have compression libraries available, so + // dynamically pick based on what's available + if (compression_types.size() == 0) { + compression_enabled_ = false; + } else { + options.compression = compression_types[0]; + } + } +#else + // GetSupportedCompressions() is not available in LITE build + if (!Snappy_Supported()) { + 
compression_enabled_ = false; + } +#endif //ROCKSDB_LITE + + table_options.block_cache = uncompressed_cache_; + if (table_options.block_cache == nullptr) { + table_options.no_block_cache = true; + } else { + table_options.pin_l0_filter_and_index_blocks_in_cache = true; + } + table_options.block_cache_compressed = compressed_cache_; + table_options.flush_block_policy_factory.reset( + new MyFlushBlockPolicyFactory()); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + if (!compression_enabled_) { + options.compression = kNoCompression; + } + Reopen(options); + + std::string zero_str(128, '\0'); + for (int i = 0; i < 100; ++i) { + // Make the value compressible. A purely random string doesn't compress + // and the resultant data block will not be compressed + values_.emplace_back(RandomString(&rnd, 128) + zero_str); + assert(Put(Key(i), values_[i]) == Status::OK()); + } + Flush(); + } + + bool CheckValue(int i, const std::string& value) { + if (values_[i].compare(value) == 0) { + return true; + } + return false; + } + + int num_lookups() { return uncompressed_cache_->num_lookups(); } + int num_found() { return uncompressed_cache_->num_found(); } + int num_inserts() { return uncompressed_cache_->num_inserts(); } + + int num_lookups_compressed() { return compressed_cache_->num_lookups(); } + int num_found_compressed() { return compressed_cache_->num_found(); } + int num_inserts_compressed() { return compressed_cache_->num_inserts(); } + + bool fill_cache() { return fill_cache_; } + bool compression_enabled() { return compression_enabled_; } + bool has_compressed_cache() { return compressed_cache_ != nullptr; } + bool has_uncompressed_cache() { return uncompressed_cache_ != nullptr; } + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + private: + class MyFlushBlockPolicyFactory : public FlushBlockPolicyFactory { + public: + MyFlushBlockPolicyFactory() {} + + virtual const char* Name() const override { + return "MyFlushBlockPolicyFactory"; + } + + virtual FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBasedTableOptions& /*table_options*/, + const BlockBuilder& data_block_builder) const override { + return new MyFlushBlockPolicy(data_block_builder); + } + }; + + class MyFlushBlockPolicy : public FlushBlockPolicy { + public: + explicit MyFlushBlockPolicy(const BlockBuilder& data_block_builder) + : num_keys_(0), data_block_builder_(data_block_builder) {} + + bool Update(const Slice& /*key*/, const Slice& /*value*/) override { + if (data_block_builder_.empty()) { + // First key in this block + num_keys_ = 1; + return false; + } + // Flush every 10 keys + if (num_keys_ == 10) { + num_keys_ = 1; + return true; + } + num_keys_++; + return false; + } + + private: + int num_keys_; + const BlockBuilder& data_block_builder_; + }; + + class MyBlockCache : public Cache { + public: + explicit MyBlockCache(std::shared_ptr& target) + : target_(target), num_lookups_(0), num_found_(0), num_inserts_(0) {} + + virtual const char* Name() const override { return "MyBlockCache"; } + + virtual Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr, + Priority priority = Priority::LOW) override { + num_inserts_++; + return target_->Insert(key, value, charge, deleter, handle, priority); + } + + virtual Handle* Lookup(const Slice& key, + Statistics* stats = nullptr) override { + num_lookups_++; + Handle* handle = target_->Lookup(key, stats); + if (handle != nullptr) { + num_found_++; + } + 
return handle; + } + + virtual bool Ref(Handle* handle) override { return target_->Ref(handle); } + + virtual bool Release(Handle* handle, bool force_erase = false) override { + return target_->Release(handle, force_erase); + } + + virtual void* Value(Handle* handle) override { + return target_->Value(handle); + } + + virtual void Erase(const Slice& key) override { target_->Erase(key); } + virtual uint64_t NewId() override { return target_->NewId(); } + + virtual void SetCapacity(size_t capacity) override { + target_->SetCapacity(capacity); + } + + virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override { + target_->SetStrictCapacityLimit(strict_capacity_limit); + } + + virtual bool HasStrictCapacityLimit() const override { + return target_->HasStrictCapacityLimit(); + } + + virtual size_t GetCapacity() const override { + return target_->GetCapacity(); + } + + virtual size_t GetUsage() const override { return target_->GetUsage(); } + + virtual size_t GetUsage(Handle* handle) const override { + return target_->GetUsage(handle); + } + + virtual size_t GetPinnedUsage() const override { + return target_->GetPinnedUsage(); + } + + virtual size_t GetCharge(Handle* /*handle*/) const override { return 0; } + + virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), + bool thread_safe) override { + return target_->ApplyToAllCacheEntries(callback, thread_safe); + } + + virtual void EraseUnRefEntries() override { + return target_->EraseUnRefEntries(); + } + + int num_lookups() { return num_lookups_; } + + int num_found() { return num_found_; } + + int num_inserts() { return num_inserts_; } + + private: + std::shared_ptr target_; + int num_lookups_; + int num_found_; + int num_inserts_; + }; + + std::shared_ptr compressed_cache_; + std::shared_ptr uncompressed_cache_; + bool compression_enabled_; + std::vector values_; + bool fill_cache_; +}; + +TEST_P(DBBasicTestWithParallelIO, MultiGet) { + std::vector key_data(10); + std::vector keys; + // We cannot resize a PinnableSlice vector, so just set initial size to + // largest we think we will need + std::vector values(10); + std::vector statuses; + ReadOptions ro; + ro.fill_cache = fill_cache(); + + // Warm up the cache first + key_data.emplace_back(Key(0)); + keys.emplace_back(Slice(key_data.back())); + key_data.emplace_back(Key(50)); + keys.emplace_back(Slice(key_data.back())); + statuses.resize(keys.size()); + + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(0, values[0].ToString())); + ASSERT_TRUE(CheckValue(50, values[1].ToString())); + + int random_reads = env_->random_read_counter_.Read(); + key_data[0] = Key(1); + key_data[1] = Key(51); + keys[0] = Slice(key_data[0]); + keys[1] = Slice(key_data[1]); + values[0].Reset(); + values[1].Reset(); + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(1, values[0].ToString())); + ASSERT_TRUE(CheckValue(51, values[1].ToString())); + + bool read_from_cache = false; + if (fill_cache()) { + if (has_uncompressed_cache()) { + read_from_cache = true; + } else if (has_compressed_cache() && compression_enabled()) { + read_from_cache = true; + } + } + + int expected_reads = random_reads + (read_from_cache ? 
0 : 2); + ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); + + keys.resize(10); + statuses.resize(10); + std::vector key_ints{1, 2, 15, 16, 55, 81, 82, 83, 84, 85}; + for (size_t i = 0; i < key_ints.size(); ++i) { + key_data[i] = Key(key_ints[i]); + keys[i] = Slice(key_data[i]); + statuses[i] = Status::OK(); + values[i].Reset(); + } + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + for (size_t i = 0; i < key_ints.size(); ++i) { + ASSERT_OK(statuses[i]); + ASSERT_TRUE(CheckValue(key_ints[i], values[i].ToString())); + } + expected_reads += (read_from_cache ? 2 : 4); + ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); +} + +INSTANTIATE_TEST_CASE_P( + ParallelIO, DBBasicTestWithParallelIO, + // Params are as follows - + // Param 0 - Compressed cache enabled + // Param 1 - Uncompressed cache enabled + // Param 2 - Data compression enabled + // Param 3 - ReadOptions::fill_cache + ::testing::Combine(::testing::Bool(), ::testing::Bool(), + ::testing::Bool(), ::testing::Bool())); + +class DBBasicTestWithTimestampWithParam + : public DBTestBase, + public testing::WithParamInterface { + public: + DBBasicTestWithTimestampWithParam() + : DBTestBase("/db_basic_test_with_timestamp") {} + + protected: + class TestComparator : public Comparator { + private: + const Comparator* cmp_without_ts_; + + public: + explicit TestComparator(size_t ts_sz) + : Comparator(ts_sz), cmp_without_ts_(nullptr) { + cmp_without_ts_ = BytewiseComparator(); + } + + const char* Name() const override { return "TestComparator"; } + + void FindShortSuccessor(std::string*) const override {} + + void FindShortestSeparator(std::string*, const Slice&) const override {} + + int Compare(const Slice& a, const Slice& b) const override { + int r = CompareWithoutTimestamp(a, b); + if (r != 0 || 0 == timestamp_size()) { + return r; + } + return CompareTimestamp( + Slice(a.data() + a.size() - timestamp_size(), timestamp_size()), + Slice(b.data() + b.size() - timestamp_size(), timestamp_size())); + } + + int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override { + assert(a.size() >= timestamp_size()); + assert(b.size() >= timestamp_size()); + Slice k1 = StripTimestampFromUserKey(a, timestamp_size()); + Slice k2 = StripTimestampFromUserKey(b, timestamp_size()); + + return cmp_without_ts_->Compare(k1, k2); + } + + int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override { + if (!ts1.data() && !ts2.data()) { + return 0; + } else if (ts1.data() && !ts2.data()) { + return 1; + } else if (!ts1.data() && ts2.data()) { + return -1; + } + assert(ts1.size() == ts2.size()); + uint64_t low1 = 0; + uint64_t low2 = 0; + uint64_t high1 = 0; + uint64_t high2 = 0; + auto* ptr1 = const_cast(&ts1); + auto* ptr2 = const_cast(&ts2); + if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) || + !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) { + assert(false); + } + if (high1 < high2) { + return 1; + } else if (high1 > high2) { + return -1; + } + if (low1 < low2) { + return 1; + } else if (low1 > low2) { + return -1; + } + return 0; + } + }; + + Slice EncodeTimestamp(uint64_t low, uint64_t high, std::string* ts) { + assert(nullptr != ts); + ts->clear(); + PutFixed64(ts, low); + PutFixed64(ts, high); + assert(ts->size() == sizeof(low) + sizeof(high)); + return Slice(*ts); + } +}; + +TEST_P(DBBasicTestWithTimestampWithParam, PutAndGet) { + const int kNumKeysPerFile = 8192; + const size_t kNumTimestamps = 6; + bool 
memtable_only = GetParam(); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.env = env_; + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + std::string tmp; + size_t ts_sz = EncodeTimestamp(0, 0, &tmp).size(); + TestComparator test_cmp(ts_sz); + options.comparator = &test_cmp; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy( + 10 /*bits_per_key*/, false /*use_block_based_builder*/)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + std::vector write_ts_strs(kNumTimestamps); + std::vector read_ts_strs(kNumTimestamps); + std::vector write_ts_list; + std::vector read_ts_list; + + for (size_t i = 0; i != kNumTimestamps; ++i) { + write_ts_list.emplace_back(EncodeTimestamp(i * 2, 0, &write_ts_strs[i])); + read_ts_list.emplace_back(EncodeTimestamp(1 + i * 2, 0, &read_ts_strs[i])); + const Slice& write_ts = write_ts_list.back(); + WriteOptions wopts; + wopts.timestamp = &write_ts; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) { + ASSERT_OK(Put(cf, "key" + std::to_string(j), + "value_" + std::to_string(j) + "_" + std::to_string(i), + wopts)); + } + if (!memtable_only) { + ASSERT_OK(Flush(cf)); + } + } + } + const auto& verify_db_func = [&]() { + for (size_t i = 0; i != kNumTimestamps; ++i) { + ReadOptions ropts; + ropts.timestamp = &read_ts_list[i]; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + ColumnFamilyHandle* cfh = handles_[cf]; + for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) { + std::string value; + ASSERT_OK(db_->Get(ropts, cfh, "key" + std::to_string(j), &value)); + ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), + value); + } + } + } + }; + verify_db_func(); +} + +INSTANTIATE_TEST_CASE_P(Timestamp, DBBasicTestWithTimestampWithParam, + ::testing::Bool()); + } // namespace rocksdb +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + int main(int argc, char** argv) { rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff --git a/db/db_blob_index_test.cc b/db/db_blob_index_test.cc index 005a23d63b7..30e44e5bac0 100644 --- a/db/db_blob_index_test.cc +++ b/db/db_blob_index_test.cc @@ -12,6 +12,7 @@ #include #include +#include "db/arena_wrapped_db_iter.h" #include "db/column_family.h" #include "db/db_iter.h" #include "db/db_test_util.h" @@ -63,9 +64,11 @@ class DBBlobIndexTest : public DBTestBase { ReadOptions read_options; read_options.snapshot = snapshot; PinnableSlice value; - auto s = dbfull()->GetImpl(read_options, cfh(), key, &value, - nullptr /*value_found*/, nullptr /*callback*/, - is_blob_index); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cfh(); + get_impl_options.value = &value; + get_impl_options.is_blob_index = is_blob_index; + auto s = dbfull()->GetImpl(read_options, key, get_impl_options); if (s.IsNotFound()) { return "NOT_FOUND"; } @@ -395,6 +398,29 @@ TEST_F(DBBlobIndexTest, Iterate) { verify(15, Status::kOk, get_value(16, 0), 
get_value(14, 0), create_blob_iterator, check_is_blob(false)); +#ifndef ROCKSDB_LITE + // Iterator with blob support and using seek. + ASSERT_OK(dbfull()->SetOptions( + cfh(), {{"max_sequential_skip_in_iterations", "0"}})); + verify(1, Status::kOk, get_value(1, 0), get_value(1, 0), + create_blob_iterator, check_is_blob(true)); + verify(3, Status::kOk, get_value(3, 0), get_value(3, 0), + create_blob_iterator, check_is_blob(true)); + verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), + create_blob_iterator, check_is_blob(false)); + verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), + create_blob_iterator, check_is_blob(false)); + verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), + create_blob_iterator, check_is_blob(false)); + verify(11, Status::kNotSupported, "", "", create_blob_iterator); + verify(13, Status::kOk, + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + create_blob_iterator, check_is_blob(false)); + verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), + create_blob_iterator, check_is_blob(false)); +#endif // !ROCKSDB_LITE + for (auto* snapshot : snapshots) { dbfull()->ReleaseSnapshot(snapshot); } diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index f6e1aad323c..89c2dbd5d16 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -19,6 +19,9 @@ class DBBlockCacheTest : public DBTestBase { size_t hit_count_ = 0; size_t insert_count_ = 0; size_t failure_count_ = 0; + size_t compression_dict_miss_count_ = 0; + size_t compression_dict_hit_count_ = 0; + size_t compression_dict_insert_count_ = 0; size_t compressed_miss_count_ = 0; size_t compressed_hit_count_ = 0; size_t compressed_insert_count_ = 0; @@ -69,6 +72,15 @@ class DBBlockCacheTest : public DBTestBase { TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); } + void RecordCacheCountersForCompressionDict(const Options& options) { + compression_dict_miss_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS); + compression_dict_hit_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT); + compression_dict_insert_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD); + } + void CheckCacheCounters(const Options& options, size_t expected_misses, size_t expected_hits, size_t expected_inserts, size_t expected_failures) { @@ -87,6 +99,28 @@ class DBBlockCacheTest : public DBTestBase { failure_count_ = new_failure_count; } + void CheckCacheCountersForCompressionDict( + const Options& options, size_t expected_compression_dict_misses, + size_t expected_compression_dict_hits, + size_t expected_compression_dict_inserts) { + size_t new_compression_dict_miss_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS); + size_t new_compression_dict_hit_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT); + size_t new_compression_dict_insert_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD); + ASSERT_EQ(compression_dict_miss_count_ + expected_compression_dict_misses, + new_compression_dict_miss_count); + ASSERT_EQ(compression_dict_hit_count_ + expected_compression_dict_hits, + new_compression_dict_hit_count); + ASSERT_EQ( + compression_dict_insert_count_ + expected_compression_dict_inserts, + new_compression_dict_insert_count); + compression_dict_miss_count_ = new_compression_dict_miss_count; + compression_dict_hit_count_ = new_compression_dict_hit_count; + 
compression_dict_insert_count_ = new_compression_dict_insert_count; + } + void CheckCompressedCacheCounters(const Options& options, size_t expected_misses, size_t expected_hits, @@ -346,8 +380,13 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { options.statistics = rocksdb::CreateDBStatistics(); BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; + LRUCacheOptions co; // 500 bytes are enough to hold the first two blocks - std::shared_ptr cache = NewLRUCache(500, 0, false); + co.capacity = 500; + co.num_shard_bits = 0; + co.strict_capacity_limit = false; + co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr cache = NewLRUCache(co); table_options.block_cache = cache; table_options.filter_policy.reset(NewBloomFilterPolicy(20, true)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); @@ -365,8 +404,11 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert); // set the cache capacity to the current usage cache->SetCapacity(index_bytes_insert + filter_bytes_insert); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0); + // The index and filter eviction statistics were broken by the refactoring + // that moved the readers out of the block cache. Disabling these until we can + // bring the stats back. + // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0); + // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0); // Note that the second key needs to be no longer than the first one. // Otherwise the second index block may not fit in cache. ASSERT_OK(Put(1, "key", "val")); @@ -377,10 +419,13 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { index_bytes_insert); ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT), filter_bytes_insert); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), - index_bytes_insert); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), - filter_bytes_insert); + // The index and filter eviction statistics were broken by the refactoring + // that moved the readers out of the block cache. Disabling these until we can + // bring the stats back. + // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), + // index_bytes_insert); + // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), + // filter_bytes_insert); } namespace { @@ -444,11 +489,11 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksCachePriority) { TestGetTickerCount(options, BLOCK_CACHE_ADD)); ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS)); if (priority == Cache::Priority::LOW) { - ASSERT_EQ(0, MockCache::high_pri_insert_count); - ASSERT_EQ(2, MockCache::low_pri_insert_count); + ASSERT_EQ(0u, MockCache::high_pri_insert_count); + ASSERT_EQ(2u, MockCache::low_pri_insert_count); } else { - ASSERT_EQ(2, MockCache::high_pri_insert_count); - ASSERT_EQ(0, MockCache::low_pri_insert_count); + ASSERT_EQ(2u, MockCache::high_pri_insert_count); + ASSERT_EQ(0u, MockCache::low_pri_insert_count); } // Access data block. @@ -462,11 +507,11 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksCachePriority) { // Data block should be inserted with low priority. 
if (priority == Cache::Priority::LOW) { - ASSERT_EQ(0, MockCache::high_pri_insert_count); - ASSERT_EQ(3, MockCache::low_pri_insert_count); + ASSERT_EQ(0u, MockCache::high_pri_insert_count); + ASSERT_EQ(3u, MockCache::low_pri_insert_count); } else { - ASSERT_EQ(2, MockCache::high_pri_insert_count); - ASSERT_EQ(1, MockCache::low_pri_insert_count); + ASSERT_EQ(2u, MockCache::high_pri_insert_count); + ASSERT_EQ(1u, MockCache::low_pri_insert_count); } } } @@ -665,6 +710,8 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) { options.table_factory.reset(new BlockBasedTableFactory(table_options)); DestroyAndReopen(options); + RecordCacheCountersForCompressionDict(options); + for (int i = 0; i < kNumFiles; ++i) { ASSERT_EQ(i, NumTableFilesAtLevel(0, 0)); for (int j = 0; j < kNumEntriesPerFile; ++j) { @@ -677,27 +724,26 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) { ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1)); + // Compression dictionary blocks are preloaded. + CheckCacheCountersForCompressionDict( + options, kNumFiles /* expected_compression_dict_misses */, + 0 /* expected_compression_dict_hits */, + kNumFiles /* expected_compression_dict_inserts */); + // Seek to a key in a file. It should cause the SST's dictionary meta-block // to be read. RecordCacheCounters(options); - ASSERT_EQ(0, - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); - ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD)); - ASSERT_EQ( - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), - 0); + RecordCacheCountersForCompressionDict(options); ReadOptions read_options; ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1))); - // Two blocks missed/added: dictionary and data block - // One block hit: index since it's prefetched - CheckCacheCounters(options, 2 /* expected_misses */, 1 /* expected_hits */, - 2 /* expected_inserts */, 0 /* expected_failures */); - ASSERT_EQ(1, - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD)); - ASSERT_GT( - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), - 0); + // Two block hits: index and dictionary since they are prefetched + // One block missed/added: data block + CheckCacheCounters(options, 1 /* expected_misses */, 2 /* expected_hits */, + 1 /* expected_inserts */, 0 /* expected_failures */); + CheckCacheCountersForCompressionDict( + options, 0 /* expected_compression_dict_misses */, + 1 /* expected_compression_dict_hits */, + 0 /* expected_compression_dict_inserts */); } } diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index a2a01d6b4cf..b31c935e85c 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -10,9 +10,20 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" #include "rocksdb/perf_context.h" +#include "table/block_based/filter_policy_internal.h" namespace rocksdb { +namespace { +using BFP = BloomFilterPolicy; + +namespace BFP2 { +// Extends BFP::Mode with option to use Plain table +using PseudoMode = int; +static constexpr PseudoMode kPlainTable = -1; +} // namespace BFP2 +} // namespace + // DB tests related to bloom filter. 
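// [Editor's note: illustrative sketch, not part of this change.] The tests
// below construct BloomFilterPolicy directly via the internal
// filter_policy_internal.h header so they can pin a specific Mode
// (kDeprecatedBlock, kLegacyBloom, kFastLocalBloom, ...). Application code
// would normally go through the public factory only; a minimal sketch using
// APIs that already appear in this file (the helper name is made up):
Options BloomEnabledOptionsSketch() {
  BlockBasedTableOptions bbto;
  // false => full (per-SST) filter; true => the deprecated block-based filter.
  bbto.filter_policy.reset(NewBloomFilterPolicy(10 /* bits_per_key */, false));
  // Newer format_version values (5 is used later in this file) let new SST
  // files pick up the newer full-filter Bloom implementation.
  bbto.format_version = 5;
  Options options;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  return options;
}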
class DBBloomFilterTest : public DBTestBase { @@ -20,12 +31,12 @@ class DBBloomFilterTest : public DBTestBase { DBBloomFilterTest() : DBTestBase("/db_bloom_filter_test") {} }; -class DBBloomFilterTestWithParam - : public DBTestBase, - public testing::WithParamInterface> { +class DBBloomFilterTestWithParam : public DBTestBase, + public testing::WithParamInterface< + std::tuple> { // public testing::WithParamInterface { protected: - bool use_block_based_filter_; + BFP::Mode bfp_impl_; bool partition_filters_; uint32_t format_version_; @@ -35,7 +46,7 @@ class DBBloomFilterTestWithParam ~DBBloomFilterTestWithParam() override {} void SetUp() override { - use_block_based_filter_ = std::get<0>(GetParam()); + bfp_impl_ = std::get<0>(GetParam()); partition_filters_ = std::get<1>(GetParam()); format_version_ = std::get<2>(GetParam()); } @@ -71,8 +82,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { ReadOptions ropts; std::string value; anon::OptionsOverride options_override; - options_override.filter_policy.reset( - NewBloomFilterPolicy(20, use_block_based_filter_)); + options_override.filter_policy.reset(new BFP(20, bfp_impl_)); options_override.partition_filters = partition_filters_; options_override.metadata_block_size = 32; Options options = CurrentOptions(options_override); @@ -432,8 +442,7 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { // trigger reset of table_factory BlockBasedTableOptions table_options; table_options.no_block_cache = true; - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_filter_)); + table_options.filter_policy.reset(new BFP(10, bfp_impl_)); table_options.partition_filters = partition_filters_; if (partition_filters_) { table_options.index_type = @@ -502,24 +511,27 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { #ifndef ROCKSDB_VALGRIND_RUN INSTANTIATE_TEST_CASE_P( FormatDef, DBBloomFilterTestDefFormatVersion, - ::testing::Values(std::make_tuple(true, false, test::kDefaultFormatVersion), - std::make_tuple(false, true, test::kDefaultFormatVersion), - std::make_tuple(false, false, - test::kDefaultFormatVersion))); + ::testing::Values( + std::make_tuple(BFP::kDeprecatedBlock, false, + test::kDefaultFormatVersion), + std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion), + std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( FormatDef, DBBloomFilterTestWithParam, - ::testing::Values(std::make_tuple(true, false, test::kDefaultFormatVersion), - std::make_tuple(false, true, test::kDefaultFormatVersion), - std::make_tuple(false, false, - test::kDefaultFormatVersion))); + ::testing::Values( + std::make_tuple(BFP::kDeprecatedBlock, false, + test::kDefaultFormatVersion), + std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion), + std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( FormatLatest, DBBloomFilterTestWithParam, - ::testing::Values(std::make_tuple(true, false, test::kLatestFormatVersion), - std::make_tuple(false, true, test::kLatestFormatVersion), - std::make_tuple(false, false, - test::kLatestFormatVersion))); + ::testing::Values( + std::make_tuple(BFP::kDeprecatedBlock, false, + test::kLatestFormatVersion), + std::make_tuple(BFP::kAuto, true, test::kLatestFormatVersion), + std::make_tuple(BFP::kAuto, false, test::kLatestFormatVersion))); #endif // ROCKSDB_VALGRIND_RUN TEST_F(DBBloomFilterTest, BloomFilterRate) { @@ -636,15 +648,17 @@ TEST_F(DBBloomFilterTest, BloomFilterReverseCompatibility) { } namespace { -// A wrapped 
bloom over default FilterPolicy -class WrappedBloom : public FilterPolicy { +// A wrapped bloom over block-based FilterPolicy +class TestingWrappedBlockBasedFilterPolicy : public FilterPolicy { public: - explicit WrappedBloom(int bits_per_key) - : filter_(NewBloomFilterPolicy(bits_per_key)), counter_(0) {} + explicit TestingWrappedBlockBasedFilterPolicy(int bits_per_key) + : filter_(NewBloomFilterPolicy(bits_per_key, true)), counter_(0) {} - ~WrappedBloom() override { delete filter_; } + ~TestingWrappedBlockBasedFilterPolicy() override { delete filter_; } - const char* Name() const override { return "WrappedRocksDbFilterPolicy"; } + const char* Name() const override { + return "TestingWrappedBlockBasedFilterPolicy"; + } void CreateFilter(const rocksdb::Slice* keys, int n, std::string* dst) const override { @@ -671,12 +685,13 @@ class WrappedBloom : public FilterPolicy { }; } // namespace -TEST_F(DBBloomFilterTest, BloomFilterWrapper) { +TEST_F(DBBloomFilterTest, WrappedBlockBasedFilterPolicy) { Options options = CurrentOptions(); options.statistics = rocksdb::CreateDBStatistics(); BlockBasedTableOptions table_options; - WrappedBloom* policy = new WrappedBloom(10); + TestingWrappedBlockBasedFilterPolicy* policy = + new TestingWrappedBlockBasedFilterPolicy(10); table_options.filter_policy.reset(policy); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -706,6 +721,166 @@ TEST_F(DBBloomFilterTest, BloomFilterWrapper) { ASSERT_EQ(2U * maxKey, policy->GetCounter()); } +namespace { +// NOTE: This class is referenced by HISTORY.md as a model for a wrapper +// FilterPolicy selecting among configurations based on context. +class LevelAndStyleCustomFilterPolicy : public FilterPolicy { + public: + explicit LevelAndStyleCustomFilterPolicy(int bpk_fifo, int bpk_l0_other, + int bpk_otherwise) + : policy_fifo_(NewBloomFilterPolicy(bpk_fifo)), + policy_l0_other_(NewBloomFilterPolicy(bpk_l0_other)), + policy_otherwise_(NewBloomFilterPolicy(bpk_otherwise)) {} + + // OK to use built-in policy name because we are deferring to a + // built-in builder. We aren't changing the serialized format. + const char* Name() const override { return policy_fifo_->Name(); } + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext& context) const override { + if (context.compaction_style == kCompactionStyleFIFO) { + return policy_fifo_->GetBuilderWithContext(context); + } else if (context.level_at_creation == 0) { + return policy_l0_other_->GetBuilderWithContext(context); + } else { + return policy_otherwise_->GetBuilderWithContext(context); + } + } + + FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override { + // OK to defer to any of them; they all can parse built-in filters + // from any settings. 
+ return policy_fifo_->GetFilterBitsReader(contents); + } + + // Defer just in case configuration uses block-based filter + void CreateFilter(const Slice* keys, int n, std::string* dst) const override { + policy_otherwise_->CreateFilter(keys, n, dst); + } + bool KeyMayMatch(const Slice& key, const Slice& filter) const override { + return policy_otherwise_->KeyMayMatch(key, filter); + } + + private: + const std::unique_ptr policy_fifo_; + const std::unique_ptr policy_l0_other_; + const std::unique_ptr policy_otherwise_; +}; + +class TestingContextCustomFilterPolicy + : public LevelAndStyleCustomFilterPolicy { + public: + explicit TestingContextCustomFilterPolicy(int bpk_fifo, int bpk_l0_other, + int bpk_otherwise) + : LevelAndStyleCustomFilterPolicy(bpk_fifo, bpk_l0_other, bpk_otherwise) { + } + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext& context) const override { + test_report_ += "cf="; + test_report_ += context.column_family_name; + test_report_ += ",cs="; + test_report_ += + OptionsHelper::compaction_style_to_string[context.compaction_style]; + test_report_ += ",lv="; + test_report_ += std::to_string(context.level_at_creation); + test_report_ += "\n"; + + return LevelAndStyleCustomFilterPolicy::GetBuilderWithContext(context); + } + + std::string DumpTestReport() { + std::string rv; + std::swap(rv, test_report_); + return rv; + } + + private: + mutable std::string test_report_; +}; +} // namespace + +TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { + for (bool fifo : {true, false}) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + options.compaction_style = + fifo ? kCompactionStyleFIFO : kCompactionStyleLevel; + + BlockBasedTableOptions table_options; + auto policy = std::make_shared(15, 8, 5); + table_options.filter_policy = policy; + table_options.format_version = 5; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + CreateAndReopenWithCF({fifo ? "abe" : "bob"}, options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey / 2; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + // Add a large key to make the file contain wide range + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + EXPECT_EQ(policy->DumpTestReport(), + fifo ? "cf=abe,cs=kCompactionStyleFIFO,lv=0\n" + : "cf=bob,cs=kCompactionStyleLevel,lv=0\n"); + + for (int i = maxKey / 2; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + Flush(1); + EXPECT_EQ(policy->DumpTestReport(), + fifo ? "cf=abe,cs=kCompactionStyleFIFO,lv=0\n" + : "cf=bob,cs=kCompactionStyleLevel,lv=0\n"); + + // Check that they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + // Since we have two tables / two filters, we might have Bloom checks on + // our queries, but no more than one "useful" per query on a found key. + EXPECT_LE(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey); + + // Check that we have two filters, each about + // fifo: 0.12% FP rate (15 bits per key) + // level: 2.3% FP rate (8 bits per key) + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); + } + { + auto useful_count = + TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL); + EXPECT_GE(useful_count, maxKey * 2 * (fifo ? 0.9980 : 0.975)); + EXPECT_LE(useful_count, maxKey * 2 * (fifo ? 
0.9995 : 0.98)); + } + + if (!fifo) { // FIFO only has L0 + // Full compaction + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr)); + EXPECT_EQ(policy->DumpTestReport(), + "cf=bob,cs=kCompactionStyleLevel,lv=1\n"); + + // Check that we now have one filter, about 9.2% FP rate (5 bits per key) + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); + } + { + auto useful_count = + TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL); + EXPECT_GE(useful_count, maxKey * 0.90); + EXPECT_LE(useful_count, maxKey * 0.91); + } + } + + // Destroy + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + dbfull()->DestroyColumnFamilyHandle(handles_[1]); + handles_[1] = nullptr; + } +} + class SliceTransformLimitedDomain : public SliceTransform { const char* Name() const override { return "SliceTransformLimitedDomain"; } @@ -858,33 +1033,32 @@ TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) { #ifndef ROCKSDB_LITE class BloomStatsTestWithParam : public DBBloomFilterTest, - public testing::WithParamInterface> { + public testing::WithParamInterface> { public: BloomStatsTestWithParam() { - use_block_table_ = std::get<0>(GetParam()); - use_block_based_builder_ = std::get<1>(GetParam()); - partition_filters_ = std::get<2>(GetParam()); + bfp_impl_ = std::get<0>(GetParam()); + partition_filters_ = std::get<1>(GetParam()); options_.create_if_missing = true; options_.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4)); options_.memtable_prefix_bloom_size_ratio = 8.0 * 1024.0 / static_cast(options_.write_buffer_size); - if (use_block_table_) { + if (bfp_impl_ == BFP2::kPlainTable) { + assert(!partition_filters_); // not supported in plain table + PlainTableOptions table_options; + options_.table_factory.reset(NewPlainTableFactory(table_options)); + } else { BlockBasedTableOptions table_options; table_options.hash_index_allow_collision = false; if (partition_filters_) { - assert(!use_block_based_builder_); + assert(bfp_impl_ != BFP::kDeprecatedBlock); table_options.partition_filters = partition_filters_; table_options.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; } table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_builder_)); + new BFP(10, static_cast(bfp_impl_))); options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); - } else { - assert(!partition_filters_); // not supported in plain table - PlainTableOptions table_options; - options_.table_factory.reset(NewPlainTableFactory(table_options)); } options_.env = env_; @@ -901,8 +1075,7 @@ class BloomStatsTestWithParam static void SetUpTestCase() {} static void TearDownTestCase() {} - bool use_block_table_; - bool use_block_based_builder_; + BFP2::PseudoMode bfp_impl_; bool partition_filters_; Options options_; }; @@ -1006,7 +1179,7 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { ASSERT_EQ(value3, iter->value().ToString()); // The seek doesn't check block-based bloom filter because last index key // starts with the same prefix we're seeking to. - uint64_t expected_hits = use_block_based_builder_ ? 1 : 2; + uint64_t expected_hits = bfp_impl_ == BFP::kDeprecatedBlock ? 
1 : 2; ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); iter->Seek(key2); @@ -1016,12 +1189,14 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); } -INSTANTIATE_TEST_CASE_P(BloomStatsTestWithParam, BloomStatsTestWithParam, - ::testing::Values(std::make_tuple(true, true, false), - std::make_tuple(true, false, false), - std::make_tuple(true, false, true), - std::make_tuple(false, false, - false))); +INSTANTIATE_TEST_CASE_P( + BloomStatsTestWithParam, BloomStatsTestWithParam, + ::testing::Values(std::make_tuple(BFP::kDeprecatedBlock, false), + std::make_tuple(BFP::kLegacyBloom, false), + std::make_tuple(BFP::kLegacyBloom, true), + std::make_tuple(BFP::kFastLocalBloom, false), + std::make_tuple(BFP::kFastLocalBloom, true), + std::make_tuple(BFP2::kPlainTable, false))); namespace { void PrefixScanInit(DBBloomFilterTest* dbtest) { @@ -1095,6 +1270,8 @@ TEST_F(DBBloomFilterTest, PrefixScan) { options.max_background_compactions = 2; options.create_if_missing = true; options.memtable_factory.reset(NewHashSkipListRepFactory(16)); + assert(!options.unordered_write); + // It is incompatible with allow_concurrent_memtable_write=false options.allow_concurrent_memtable_write = false; BlockBasedTableOptions table_options; @@ -1145,6 +1322,7 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { options.table_factory.reset(NewBlockBasedTableFactory(bbto)); options.optimize_filters_for_hits = true; options.statistics = rocksdb::CreateDBStatistics(); + get_perf_context()->Reset(); get_perf_context()->EnablePerLevelPerfContext(); CreateAndReopenWithCF({"mypikachu"}, options); @@ -1326,8 +1504,8 @@ int CountIter(std::unique_ptr& iter, const Slice& key) { // into the same string, or 2) the transformed seek key is of the same length // as the upper bound and two keys are adjacent according to the comparator. 
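// [Editor's note: illustrative sketch, not part of this change.] The test
// below exercises prefix-bloom seeks bounded by an upper bound, with the
// prefix_extractor swapped at runtime via SetOptions. Outside the test
// harness the same pattern looks roughly like this; the helper name and the
// literal keys are made up, while ReadOptions::iterate_upper_bound,
// DB::NewIterator and DB::SetOptions are standard public API:
void PrefixSeekWithUpperBoundSketch(DB* db) {
  Slice upper_bound("abcdxx00");
  ReadOptions ro;
  // With a short, prefix-compatible upper bound the seek can consult the SST
  // prefix bloom filters and skip files with no matching prefix.
  ro.iterate_upper_bound = &upper_bound;
  std::unique_ptr<Iterator> it(db->NewIterator(ro));
  for (it->Seek("abcdxx"); it->Valid(); it->Next()) {
    // ... consume keys in ["abcdxx", "abcdxx00") ...
  }
  // The extractor can be changed without reopening the DB; SSTs written
  // earlier keep the filters they were built with.
  db->SetOptions({{"prefix_extractor", "capped:4"}});
}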
TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { - int iteration = 0; - for (bool use_block_based_builder : {true, false}) { + for (auto bfp_impl : BFP::kAllFixedImpls) { + int using_full_builder = bfp_impl != BFP::kDeprecatedBlock; Options options; options.create_if_missing = true; options.prefix_extractor.reset(NewCappedPrefixTransform(4)); @@ -1336,8 +1514,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { // Enable prefix bloom for SST files BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_builder)); + table_options.filter_policy.reset(new BFP(10, bfp_impl)); table_options.index_shortening = BlockBasedTableOptions:: IndexShorteningMode::kShortenSeparatorsAndSuccessor; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -1380,7 +1557,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { ASSERT_EQ(CountIter(iter, "abcdxx00"), 4); // should check bloom filter since upper bound meets requirement ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration); + 2 + using_full_builder); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } { @@ -1394,7 +1571,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { ASSERT_EQ(CountIter(iter, "abcdxx01"), 4); // should skip bloom filter since upper bound is too long ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration); + 2 + using_full_builder); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } { @@ -1408,7 +1585,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { // should check bloom filter since upper bound matches transformed seek // key ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration * 2); + 2 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } { @@ -1422,7 +1599,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0); // should skip bloom filter since mismatch is found ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration * 2); + 2 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}})); @@ -1436,7 +1613,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abc"), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration * 2); + 2 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:4"}})); @@ -1449,18 +1626,17 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abc"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 3 + iteration * 2); + 3 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); } - iteration++; } } // Create multiple SST files each with a different prefix_extractor config, // verify iterators can read all SST files using the latest config. 
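// [Editor's note: illustrative sketch, not part of this change.] The
// TestGetTickerCount() assertions in these tests go through the DBTestBase
// harness; with a plain DB the same counters are read directly from the
// Statistics object installed on Options. The helper name is made up; the
// ticker enum and getTickerCount() are the public statistics API:
uint64_t PrefixBloomUsefulCountSketch(const Options& options) {
  // Assumes options.statistics was set, e.g. with CreateDBStatistics(),
  // before the DB was opened.
  return options.statistics->getTickerCount(BLOOM_FILTER_PREFIX_USEFUL);
}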
TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { - int iteration = 0; - for (bool use_block_based_builder : {true, false}) { + for (auto bfp_impl : BFP::kAllFixedImpls) { + int using_full_builder = bfp_impl != BFP::kDeprecatedBlock; Options options; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); @@ -1468,8 +1644,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { options.statistics = CreateDBStatistics(); // Enable prefix bloom for SST files BlockBasedTableOptions table_options; - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_builder)); + table_options.filter_policy.reset(new BFP(10, bfp_impl)); table_options.cache_index_and_filter_blocks = true; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); @@ -1495,10 +1670,10 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "foo"), 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 1 + iteration); + 1 + using_full_builder); ASSERT_EQ(CountIter(iter, "gpk"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 1 + iteration); + 1 + using_full_builder); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); // second SST with capped:3 BF @@ -1512,13 +1687,13 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { std::unique_ptr iter_tmp(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_tmp, "foo"), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration * 2); + 2 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); // both counters are incremented because BF is "not changed" for 1 of the // 2 SST files, so filter is checked once and found no match. 
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 3 + iteration * 2); + 3 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); } @@ -1537,24 +1712,24 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { ASSERT_EQ(CountIter(iter_tmp, "foo"), 9); // the first and last BF are checked ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 4 + iteration * 3); + 4 + using_full_builder * 3); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); // only last BF is checked and not found ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 5 + iteration * 3); + 5 + using_full_builder * 3); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); } // iter_old can only see the first SST, so checked plus 1 ASSERT_EQ(CountIter(iter_old, "foo"), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 6 + iteration * 3); + 6 + using_full_builder * 3); // iter was created after the first setoptions call so only full filter // will check the filter ASSERT_EQ(CountIter(iter, "foo"), 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 6 + iteration * 4); + 6 + using_full_builder * 4); { // keys in all three SSTs are visible to iterator @@ -1563,11 +1738,11 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { std::unique_ptr iter_all(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_all, "foo"), 9); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 7 + iteration * 5); + 7 + using_full_builder * 5); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); ASSERT_EQ(CountIter(iter_all, "gpk"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 8 + iteration * 5); + 8 + using_full_builder * 5); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); @@ -1579,15 +1754,14 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { // all three SST are checked because the current options has the same as // the remaining SST (capped:3) ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 9 + iteration * 7); + 9 + using_full_builder * 7); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); ASSERT_EQ(CountIter(iter_all, "gpk"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 10 + iteration * 7); + 10 + using_full_builder * 7); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4); } // TODO(Zhongyi): Maybe also need to add Get calls to test point look up? 
- iteration++; } } @@ -1596,7 +1770,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { // as expected TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { int iteration = 0; - for (bool use_block_based_builder : {true, false}) { + for (auto bfp_impl : BFP::kAllFixedImpls) { Options options = CurrentOptions(); options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); @@ -1605,8 +1779,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { // Enable prefix bloom for SST files BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_builder)); + table_options.filter_policy.reset(new BFP(10, bfp_impl)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu" + std::to_string(iteration)}, options); ReadOptions read_options; @@ -1655,8 +1828,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { // Verify it's possible to change prefix_extractor at runtime and iterators // behaves as expected TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { - int iteration = 0; - for (bool use_block_based_builder : {true, false}) { + for (auto bfp_impl : BFP::kAllFixedImpls) { Options options; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); @@ -1665,8 +1837,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { // Enable prefix bloom for SST files BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_builder)); + table_options.filter_policy.reset(new BFP(10, bfp_impl)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); @@ -1717,7 +1888,6 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { ASSERT_EQ(CountIter(iter_old, "abc"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); - iteration++; } } diff --git a/db/db_compaction_filter_test.cc b/db/db_compaction_filter_test.cc index 37e80048e6d..90fa1e88337 100644 --- a/db/db_compaction_filter_test.cc +++ b/db/db_compaction_filter_test.cc @@ -720,7 +720,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) { cfilter_count = 0; ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // The filter should delete 40 records. 
- ASSERT_EQ(40U, cfilter_count); + ASSERT_EQ(40, cfilter_count); { // Scan the entire database as of the snapshot to ensure diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index b5033b66f0c..84f9f55dd7b 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -12,10 +12,11 @@ #include "port/stack_trace.h" #include "rocksdb/concurrent_task_limiter.h" #include "rocksdb/experimental.h" +#include "rocksdb/sst_file_writer.h" #include "rocksdb/utilities/convenience.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" #include "util/concurrent_task_limiter_impl.h" -#include "util/fault_injection_test_env.h" -#include "util/sync_point.h" namespace rocksdb { @@ -141,7 +142,7 @@ Options DeletionTriggerOptions(Options options) { options.compression = kNoCompression; options.write_buffer_size = kCDTKeysPerBuffer * (kCDTValueSize + 24); options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 0; + options.max_write_buffer_size_to_maintain = 0; options.num_levels = kCDTNumLevels; options.level0_file_num_compaction_trigger = 1; options.target_file_size_base = options.write_buffer_size * 2; @@ -497,14 +498,14 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { // Create new iterator for: // (1) 1 for verifying flush results - // (2) 3 for compaction input files - // (3) 1 for verifying compaction results. - ASSERT_EQ(num_new_table_reader, 5); + // (2) 1 for verifying compaction results. + // (3) New TableReaders will not be created for compaction inputs + ASSERT_EQ(num_new_table_reader, 2); num_table_cache_lookup = 0; num_new_table_reader = 0; ASSERT_EQ(Key(1), Get(Key(1))); - ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3); + ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 5); ASSERT_EQ(num_new_table_reader, 0); num_table_cache_lookup = 0; @@ -519,13 +520,14 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { // May preload table cache too. ASSERT_GE(num_table_cache_lookup, 1); old_num_table_cache_lookup2 = num_table_cache_lookup; - // One for compaction input, one for verifying compaction results. - ASSERT_EQ(num_new_table_reader, 2); + // One for verifying compaction results. + // No new iterator created for compaction. + ASSERT_EQ(num_new_table_reader, 1); num_table_cache_lookup = 0; num_new_table_reader = 0; ASSERT_EQ(Key(1), Get(Key(1))); - ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 2); + ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3); ASSERT_EQ(num_new_table_reader, 0); rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -731,7 +733,7 @@ TEST_F(DBCompactionTest, BGCompactionsAllowed) { // Now all column families qualify compaction but only one should be // scheduled, because no column family hits speed up condition. - ASSERT_EQ(1, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + ASSERT_EQ(1u, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); // Create two more files for one column family, which triggers speed up // condition, three compactions will be scheduled. @@ -745,7 +747,7 @@ TEST_F(DBCompactionTest, BGCompactionsAllowed) { ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, NumTableFilesAtLevel(0, 2)); } - ASSERT_EQ(3, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + ASSERT_EQ(3U, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); // Unblock all threads to unblock all compactions. 
for (size_t i = 0; i < kTotalTasks; i++) { @@ -776,7 +778,7 @@ TEST_F(DBCompactionTest, BGCompactionsAllowed) { // Now all column families qualify compaction but only one should be // scheduled, because no column family hits speed up condition. - ASSERT_EQ(1, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); for (size_t i = 0; i < kTotalTasks; i++) { sleeping_tasks[i].WakeUp(); @@ -3520,76 +3522,238 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } -TEST_F(DBCompactionTest, LevelPeriodicCompaction) { - const int kNumKeysPerFile = 32; - const int kNumLevelFiles = 2; +TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { const int kValueSize = 100; - Options options = CurrentOptions(); - options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days - options.max_open_files = -1; // needed for ttl compaction - env_->time_elapse_only_sleep_ = false; - options.env = env_; + for (bool if_restart : {false, true}) { + for (bool if_open_all_files : {false, true}) { + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.ttl = 24 * 60 * 60; // 24 hours + if (if_open_all_files) { + options.max_open_files = -1; + } else { + options.max_open_files = 20; + } + // RocksDB sanitize max open files to at least 20. Modify it back. + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = static_cast(arg); + *max_open_files = 2; + }); + // In the case where all files are opened and doing DB restart + // forcing the oldest ancester time in manifest file to be 0 to + // simulate the case of reading from an old version. + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:VarintOldestAncesterTime", [&](void* arg) { + if (if_restart && if_open_all_files) { + std::string* encoded_fieled = static_cast(arg); + *encoded_fieled = ""; + PutVarint64(encoded_fieled, 0); + } + }); + + env_->time_elapse_only_sleep_ = false; + options.env = env_; + + env_->addon_time_.store(0); + DestroyAndReopen(options); - env_->addon_time_.store(0); - DestroyAndReopen(options); + int ttl_compactions = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kTtl) { + ttl_compactions++; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + // Add two L6 files with key ranges: [1 .. 100], [101 .. 200]. + Random rnd(301); + for (int i = 1; i <= 100; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + } + Flush(); + for (int i = 101; i <= 200; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + } + Flush(); + MoveFilesToLevel(6); + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); - int periodic_compactions = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); - auto compaction_reason = compaction->compaction_reason(); - if (compaction_reason == CompactionReason::kPeriodicCompaction) { - periodic_compactions++; - } - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // Add two L4 files with key ranges: [1 .. 50], [51 .. 150]. 
+ for (int i = 1; i <= 50; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + } + Flush(); + for (int i = 51; i <= 150; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + } + Flush(); + MoveFilesToLevel(4); + ASSERT_EQ("0,0,0,0,2,0,2", FilesPerLevel()); - Random rnd(301); - for (int i = 0; i < kNumLevelFiles; ++i) { - for (int j = 0; j < kNumKeysPerFile; ++j) { - ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + // Add one L1 file with key range: [26, 75]. + for (int i = 26; i <= 75; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + } + Flush(); + dbfull()->TEST_WaitForCompact(); + MoveFilesToLevel(1); + ASSERT_EQ("0,1,0,0,2,0,2", FilesPerLevel()); + + // LSM tree: + // L1: [26 .. 75] + // L4: [1 .. 50][51 ..... 150] + // L6: [1 ........ 100][101 .... 200] + // + // On TTL expiry, TTL compaction should be initiated on L1 file, and the + // compactions should keep going on until the key range hits bottom level. + // In other words: the compaction on this data range "cascasdes" until + // reaching the bottom level. + // + // Order of events on TTL expiry: + // 1. L1 file falls to L3 via 2 trivial moves which are initiated by the + // ttl + // compaction. + // 2. A TTL compaction happens between L3 and L4 files. Output file in L4. + // 3. The new output file from L4 falls to L5 via 1 trival move initiated + // by the ttl compaction. + // 4. A TTL compaction happens between L5 and L6 files. Ouptut in L6. + + // Add 25 hours and do a write + env_->addon_time_.fetch_add(25 * 60 * 60); + + ASSERT_OK(Put(Key(1), "1")); + if (if_restart) { + Reopen(options); + } else { + Flush(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(5, ttl_compactions); + + env_->addon_time_.fetch_add(25 * 60 * 60); + ASSERT_OK(Put(Key(2), "1")); + if (if_restart) { + Reopen(options); + } else { + Flush(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_GE(ttl_compactions, 6); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } - Flush(); } - dbfull()->TEST_WaitForCompact(); +} - ASSERT_EQ("2", FilesPerLevel()); - ASSERT_EQ(0, periodic_compactions); +TEST_F(DBCompactionTest, LevelPeriodicCompaction) { + const int kNumKeysPerFile = 32; + const int kNumLevelFiles = 2; + const int kValueSize = 100; - // Add 50 hours and do a write - env_->addon_time_.fetch_add(50 * 60 * 60); - ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); - // Assert that the files stay in the same level - ASSERT_EQ("3", FilesPerLevel()); - // The two old files go through the periodic compaction process - ASSERT_EQ(2, periodic_compactions); + for (bool if_restart : {false, true}) { + for (bool if_open_all_files : {false, true}) { + Options options = CurrentOptions(); + options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days + if (if_open_all_files) { + options.max_open_files = -1; // needed for ttl compaction + } else { + options.max_open_files = 20; + } + // RocksDB sanitize max open files to at least 20. Modify it back. + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = static_cast(arg); + *max_open_files = 0; + }); + // In the case where all files are opened and doing DB restart + // forcing the file creation time in manifest file to be 0 to + // simulate the case of reading from an old version. 
+ rocksdb::SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:VarintFileCreationTime", [&](void* arg) { + if (if_restart && if_open_all_files) { + std::string* encoded_fieled = static_cast(arg); + *encoded_fieled = ""; + PutVarint64(encoded_fieled, 0); + } + }); + + env_->time_elapse_only_sleep_ = false; + options.env = env_; + + env_->addon_time_.store(0); + DestroyAndReopen(options); - MoveFilesToLevel(1); - ASSERT_EQ("0,3", FilesPerLevel()); + int periodic_compactions = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kPeriodicCompaction) { + periodic_compactions++; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), + RandomString(&rnd, kValueSize))); + } + Flush(); + } + dbfull()->TEST_WaitForCompact(); - // Add another 50 hours and do another write - env_->addon_time_.fetch_add(50 * 60 * 60); - ASSERT_OK(Put("b", "2")); - Flush(); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ("1,3", FilesPerLevel()); - // The three old files now go through the periodic compaction process. 2 + 3. - ASSERT_EQ(5, periodic_compactions); + ASSERT_EQ("2", FilesPerLevel()); + ASSERT_EQ(0, periodic_compactions); - // Add another 50 hours and do another write - env_->addon_time_.fetch_add(50 * 60 * 60); - ASSERT_OK(Put("c", "3")); - Flush(); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ("2,3", FilesPerLevel()); - // The four old files now go through the periodic compaction process. 5 + 4. - ASSERT_EQ(9, periodic_compactions); + // Add 50 hours and do a write + env_->addon_time_.fetch_add(50 * 60 * 60); + ASSERT_OK(Put("a", "1")); + Flush(); + dbfull()->TEST_WaitForCompact(); + // Assert that the files stay in the same level + ASSERT_EQ("3", FilesPerLevel()); + // The two old files go through the periodic compaction process + ASSERT_EQ(2, periodic_compactions); + + MoveFilesToLevel(1); + ASSERT_EQ("0,3", FilesPerLevel()); + + // Add another 50 hours and do another write + env_->addon_time_.fetch_add(50 * 60 * 60); + ASSERT_OK(Put("b", "2")); + if (if_restart) { + Reopen(options); + } else { + Flush(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("1,3", FilesPerLevel()); + // The three old files now go through the periodic compaction process. 2 + // + 3. + ASSERT_EQ(5, periodic_compactions); + + // Add another 50 hours and do another write + env_->addon_time_.fetch_add(50 * 60 * 60); + ASSERT_OK(Put("c", "3")); + Flush(); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("2,3", FilesPerLevel()); + // The four old files now go through the periodic compaction process. 5 + // + 4. 
+ ASSERT_EQ(9, periodic_compactions); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } + } } TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) { @@ -3602,7 +3766,6 @@ TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) { const int kValueSize = 100; Options options = CurrentOptions(); - options.max_open_files = -1; // needed for ttl compaction env_->time_elapse_only_sleep_ = false; options.env = env_; @@ -3747,6 +3910,91 @@ TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) { + class TestCompactionFilter : public CompactionFilter { + const char* Name() const override { return "TestCompactionFilter"; } + }; + class TestCompactionFilterFactory : public CompactionFilterFactory { + const char* Name() const override { return "TestCompactionFilterFactory"; } + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr(new TestCompactionFilter()); + } + }; + + const int kNumKeysPerFile = 32; + const int kNumLevelFiles = 2; + const int kValueSize = 100; + + Random rnd(301); + + Options options = CurrentOptions(); + TestCompactionFilter test_compaction_filter; + env_->time_elapse_only_sleep_ = false; + options.env = env_; + env_->addon_time_.store(0); + + enum CompactionFilterType { + kUseCompactionFilter, + kUseCompactionFilterFactory + }; + + for (CompactionFilterType comp_filter_type : + {kUseCompactionFilter, kUseCompactionFilterFactory}) { + // Assert that periodic compactions are not enabled. + ASSERT_EQ(port::kMaxUint64 - 1, options.periodic_compaction_seconds); + + if (comp_filter_type == kUseCompactionFilter) { + options.compaction_filter = &test_compaction_filter; + options.compaction_filter_factory.reset(); + } else if (comp_filter_type == kUseCompactionFilterFactory) { + options.compaction_filter = nullptr; + options.compaction_filter_factory.reset( + new TestCompactionFilterFactory()); + } + DestroyAndReopen(options); + + // periodic_compaction_seconds should be set to the sanitized value when + // a compaction filter or a compaction filter factory is used. 
+ ASSERT_EQ(30 * 24 * 60 * 60, + dbfull()->GetOptions().periodic_compaction_seconds); + + int periodic_compactions = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kPeriodicCompaction) { + periodic_compactions++; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + } + Flush(); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ("2", FilesPerLevel()); + ASSERT_EQ(0, periodic_compactions); + + // Add 31 days and do a write + env_->addon_time_.fetch_add(31 * 24 * 60 * 60); + ASSERT_OK(Put("a", "1")); + Flush(); + dbfull()->TEST_WaitForCompact(); + // Assert that the files stay in the same level + ASSERT_EQ("3", FilesPerLevel()); + // The two old files go through the periodic compaction process + ASSERT_EQ(2, periodic_compactions); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } +} TEST_F(DBCompactionTest, CompactRangeDelayedByL0FileCount) { // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual @@ -3890,11 +4138,17 @@ TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) { } Flush(1); } - auto manual_compaction_thread = port::Thread([this]() { + auto manual_compaction_thread = port::Thread([this, i]() { CompactRangeOptions cro; cro.allow_write_stall = false; - ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr) - .IsShutdownInProgress()); + Status s = db_->CompactRange(cro, handles_[1], nullptr, nullptr); + if (i == 0) { + ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr) + .IsColumnFamilyDropped()); + } else { + ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr) + .IsShutdownInProgress()); + } }); TEST_SYNC_POINT( @@ -4158,7 +4412,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { const char* cf_names[] = {"default", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f" }; - const int cf_count = sizeof cf_names / sizeof cf_names[0]; + const unsigned int cf_count = sizeof cf_names / sizeof cf_names[0]; std::unordered_map cf_to_limiter; @@ -4177,7 +4431,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { std::vector option_vector; option_vector.reserve(cf_count); - for (int cf = 0; cf < cf_count; cf++) { + for (unsigned int cf = 0; cf < cf_count; cf++) { ColumnFamilyOptions cf_opt(options); if (cf == 0) { // "Default" CF does't use compaction limiter @@ -4195,7 +4449,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { option_vector.emplace_back(DBOptions(options), cf_opt); } - for (int cf = 1; cf < cf_count; cf++) { + for (unsigned int cf = 1; cf < cf_count; cf++) { CreateColumnFamilies({cf_names[cf]}, option_vector[cf]); } @@ -4247,7 +4501,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { int keyIndex = 0; for (int n = 0; n < options.level0_file_num_compaction_trigger; n++) { - for (int cf = 0; cf < cf_count; cf++) { + for (unsigned int cf = 0; cf < cf_count; cf++) { for (int i = 0; i < kNumKeysPerFile; i++) { ASSERT_OK(Put(cf, Key(keyIndex++), "")); } @@ -4255,13 +4509,13 @@ TEST_F(DBCompactionTest, CompactionLimiter) { ASSERT_OK(Put(cf, "", "")); } - for (int cf = 0; cf < cf_count; cf++) { + for (unsigned int cf = 0; cf < cf_count; cf++) { 
dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); } } // Enough L0 files to trigger compaction - for (int cf = 0; cf < cf_count; cf++) { + for (unsigned int cf = 0; cf < cf_count; cf++) { ASSERT_EQ(NumTableFilesAtLevel(0, cf), options.level0_file_num_compaction_trigger); } @@ -4288,7 +4542,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { sleeping_compact_tasks[i].WaitUntilDone(); } - for (int cf = 0; cf < cf_count; cf++) { + for (unsigned int cf = 0; cf < cf_count; cf++) { dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); } @@ -4333,12 +4587,6 @@ TEST_P(DBCompactionDirectIOTest, DirectIO) { options.env = new MockEnv(Env::Default()); Reopen(options); bool readahead = false; - SyncPoint::GetInstance()->SetCallBack( - "TableCache::NewIterator:for_compaction", [&](void* arg) { - bool* use_direct_reads = static_cast(arg); - ASSERT_EQ(*use_direct_reads, - options.use_direct_reads); - }); SyncPoint::GetInstance()->SetCallBack( "CompactionJob::OpenCompactionOutputFile", [&](void* arg) { bool* use_direct_writes = static_cast(arg); @@ -4551,6 +4799,36 @@ TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) { ASSERT_EQ(num, 0); } +TEST_F(DBCompactionTest, CompactionDuringShutdown) { + Options opts = CurrentOptions(); + opts.level0_file_num_compaction_trigger = 2; + opts.disable_auto_compactions = true; + DestroyAndReopen(opts); + ColumnFamilyHandleImpl* cfh = + static_cast(dbfull()->DefaultColumnFamily()); + ColumnFamilyData* cfd = cfh->cfd(); + InternalStats* internal_stats_ptr = cfd->internal_stats(); + ASSERT_NE(internal_stats_ptr, nullptr); + + Random rnd(301); + for (auto i = 0; i < 2; ++i) { + for (auto j = 0; j < 10; ++j) { + ASSERT_OK( + Put("foo" + std::to_string(i * 10 + j), RandomString(&rnd, 1024))); + } + Flush(); + } + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", + [&](void* /*arg*/) { + dbfull()->shutting_down_.store(true); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->error_handler_.GetBGError()); +} + // FixFileIngestionCompactionDeadlock tests and verifies that compaction and // file ingestion do not cause deadlock in the event of write stall triggered // by number of L0 files reaching level0_stop_writes_trigger. 
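// [Editor's note: illustrative sketch, not part of this change.] Several of
// the compaction tests in this file choreograph background work with the
// sync-point facility. The general pattern, using only calls that appear
// verbatim in this diff ("Test:ReadyForCompaction" is a made-up point name;
// the other two are real points used by these tests):
void SyncPointPatternSketch() {
  // "CompactionJob::Run():Start" will block until the test code executes
  // TEST_SYNC_POINT("Test:ReadyForCompaction").
  rocksdb::SyncPoint::GetInstance()->LoadDependency(
      {{"Test:ReadyForCompaction", "CompactionJob::Run():Start"}});
  // Run a callback (count, inspect, or mutate the argument) every time
  // execution passes the named point.
  rocksdb::SyncPoint::GetInstance()->SetCallBack(
      "FindIntraL0Compaction", [](void* /*arg*/) { /* e.g. bump a counter */ });
  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
  // ... run the workload, hit TEST_SYNC_POINT("Test:ReadyForCompaction"),
  // wait for compaction, then tear down:
  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
}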
@@ -4628,6 +4906,201 @@ TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) { Close(); } +TEST_F(DBCompactionTest, ConsistencyFailTest) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "VersionBuilder::CheckConsistency", [&](void* arg) { + auto p = + reinterpret_cast*>(arg); + // just swap the two FileMetaData so that we hit error + // in CheckConsistency funcion + FileMetaData* temp = *(p->first); + *(p->first) = *(p->second); + *(p->second) = temp; + }); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + for (int k = 0; k < 2; ++k) { + ASSERT_OK(Put("foo", "bar")); + Flush(); + } + + ASSERT_NOK(Put("foo", "bar")); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +void IngestOneKeyValue(DBImpl* db, const std::string& key, + const std::string& value, const Options& options) { + ExternalSstFileInfo info; + std::string f = test::PerThreadDBPath("sst_file" + key); + EnvOptions env; + rocksdb::SstFileWriter writer(env, options); + auto s = writer.Open(f); + ASSERT_OK(s); + // ASSERT_OK(writer.Put(Key(), "")); + ASSERT_OK(writer.Put(key, value)); + + ASSERT_OK(writer.Finish(&info)); + IngestExternalFileOptions ingest_opt; + + ASSERT_OK(db->IngestExternalFile({info.file_path}, ingest_opt)); +} + +TEST_P(DBCompactionTestWithParam, + FlushAfterIntraL0CompactionCheckConsistencyFail) { + Options options = CurrentOptions(); + options.force_consistency_checks = true; + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 5; + options.max_background_compactions = 2; + options.max_subcompactions = max_subcompactions_; + DestroyAndReopen(options); + + const size_t kValueSize = 1 << 20; + Random rnd(301); + std::atomic pick_intra_l0_count(0); + std::string value(RandomString(&rnd, kValueSize)); + + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBCompactionTestWithParam::FlushAfterIntraL0:1", + "CompactionJob::Run():Start"}}); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "FindIntraL0Compaction", + [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); }); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + // prevents trivial move + for (int i = 0; i < 10; ++i) { + ASSERT_OK(Put(Key(i), "")); // prevents trivial move + } + ASSERT_OK(Flush()); + Compact("", Key(99)); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + + // Flush 5 L0 sst. + for (int i = 0; i < 5; ++i) { + ASSERT_OK(Put(Key(i + 1), value)); + ASSERT_OK(Flush()); + } + ASSERT_EQ(5, NumTableFilesAtLevel(0)); + + // Put one key, to make smallest log sequence number in this memtable is less + // than sst which would be ingested in next step. + ASSERT_OK(Put(Key(0), "a")); + + ASSERT_EQ(5, NumTableFilesAtLevel(0)); + + // Ingest 5 L0 sst. And this files would trigger PickIntraL0Compaction. + for (int i = 5; i < 10; i++) { + IngestOneKeyValue(dbfull(), Key(i), value, options); + ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); + } + + TEST_SYNC_POINT("DBCompactionTestWithParam::FlushAfterIntraL0:1"); + // Put one key, to make biggest log sequence number in this memtable is bigger + // than sst which would be ingested in next step. 
+ ASSERT_OK(Put(Key(2), "b")); + ASSERT_EQ(10, NumTableFilesAtLevel(0)); + dbfull()->TEST_WaitForCompact(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + ASSERT_GT(level_to_files[0].size(), 0); + ASSERT_GT(pick_intra_l0_count.load(), 0); + + ASSERT_OK(Flush()); +} + +TEST_P(DBCompactionTestWithParam, + IntraL0CompactionAfterFlushCheckConsistencyFail) { + Options options = CurrentOptions(); + options.force_consistency_checks = true; + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 5; + options.max_background_compactions = 2; + options.max_subcompactions = max_subcompactions_; + options.write_buffer_size = 2 << 20; + options.max_write_buffer_number = 6; + DestroyAndReopen(options); + + const size_t kValueSize = 1 << 20; + Random rnd(301); + std::string value(RandomString(&rnd, kValueSize)); + std::string value2(RandomString(&rnd, kValueSize)); + std::string bigvalue = value + value; + + // prevents trivial move + for (int i = 0; i < 10; ++i) { + ASSERT_OK(Put(Key(i), "")); // prevents trivial move + } + ASSERT_OK(Flush()); + Compact("", Key(99)); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + + std::atomic pick_intra_l0_count(0); + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBCompactionTestWithParam::IntraL0CompactionAfterFlush:1", + "CompactionJob::Run():Start"}}); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "FindIntraL0Compaction", + [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // Make 6 L0 sst. + for (int i = 0; i < 6; ++i) { + if (i % 2 == 0) { + IngestOneKeyValue(dbfull(), Key(i), value, options); + } else { + ASSERT_OK(Put(Key(i), value)); + ASSERT_OK(Flush()); + } + } + + ASSERT_EQ(6, NumTableFilesAtLevel(0)); + + // Stop run flush job + env_->SetBackgroundThreads(1, Env::HIGH); + test::SleepingBackgroundTask sleeping_tasks; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_tasks, + Env::Priority::HIGH); + sleeping_tasks.WaitUntilSleeping(); + + // Put many keys to make memtable request to flush + for (int i = 0; i < 6; ++i) { + ASSERT_OK(Put(Key(i), bigvalue)); + } + + ASSERT_EQ(6, NumTableFilesAtLevel(0)); + // ingest file to trigger IntraL0Compaction + for (int i = 6; i < 10; ++i) { + ASSERT_EQ(i, NumTableFilesAtLevel(0)); + IngestOneKeyValue(dbfull(), Key(i), value2, options); + } + ASSERT_EQ(10, NumTableFilesAtLevel(0)); + + // Wake up flush job + sleeping_tasks.WakeUp(); + sleeping_tasks.WaitUntilDone(); + TEST_SYNC_POINT("DBCompactionTestWithParam::IntraL0CompactionAfterFlush:1"); + dbfull()->TEST_WaitForCompact(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + uint64_t error_count = 0; + db_->GetIntProperty("rocksdb.background-errors", &error_count); + ASSERT_EQ(error_count, 0); + ASSERT_GT(pick_intra_l0_count.load(), 0); + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(bigvalue, Get(Key(i))); + } + for (int i = 6; i < 10; ++i) { + ASSERT_EQ(value2, Get(Key(i))); + } +} + #endif // !defined(ROCKSDB_LITE) } // namespace rocksdb diff --git a/db/db_encryption_test.cc b/db/db_encryption_test.cc index 46ba411b6fd..bc72744656e 100644 --- a/db/db_encryption_test.cc +++ b/db/db_encryption_test.cc @@ -7,7 +7,7 @@ #include "port/stack_trace.h" #include "rocksdb/perf_context.h" #if !defined(ROCKSDB_LITE) -#include "util/sync_point.h" +#include "test_util/sync_point.h" #endif #include #include @@ -83,6 
+83,34 @@ TEST_F(DBEncryptionTest, CheckEncrypted) { } } +TEST_F(DBEncryptionTest, ReadEmptyFile) { + auto defaultEnv = Env::Default(); + + // create empty file for reading it back in later + auto envOptions = EnvOptions(CurrentOptions()); + auto filePath = dbname_ + "/empty.empty"; + + Status status; + { + std::unique_ptr writableFile; + status = defaultEnv->NewWritableFile(filePath, &writableFile, envOptions); + ASSERT_OK(status); + } + + std::unique_ptr seqFile; + status = defaultEnv->NewSequentialFile(filePath, &seqFile, envOptions); + ASSERT_OK(status); + + std::string scratch; + Slice data; + // reading back 16 bytes from the empty file shouldn't trigger an assertion. + // it should just work and return an empty string + status = seqFile->Read(16, &data, (char*)scratch.data()); + ASSERT_OK(status); + + ASSERT_TRUE(data.empty()); +} + #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index ace0befb6d5..dd5f8f67f09 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -6,24 +6,20 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include #include +#include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/job_context.h" #include "db/version_set.h" +#include "file/file_util.h" +#include "file/filename.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "util/file_util.h" -#include "util/filename.h" +#include "test_util/sync_point.h" #include "util/mutexlock.h" -#include "util/sync_point.h" namespace rocksdb { @@ -61,7 +57,9 @@ Status DBImpl::EnableFileDeletions(bool force) { } if (file_deletion_enabled) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled"); - PurgeObsoleteFiles(job_context); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, "File Deletions Enable, but not really enabled. Counter: %d", @@ -165,6 +163,15 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { return wal_manager_.GetSortedWalFiles(files); } +Status DBImpl::GetCurrentWalFile(std::unique_ptr* current_log_file) { + uint64_t current_logfile_number; + { + InstrumentedMutexLock l(&mutex_); + current_logfile_number = logfile_number_; + } + + return wal_manager_.GetLiveWalFile(current_logfile_number, current_log_file); +} } #endif // ROCKSDB_LITE diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index 09c461f8da4..08a1d8d1be1 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -7,10 +7,16 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
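Editor's note: the db_filesnapshot.cc hunk above adds DBImpl::GetCurrentWalFile(), which reads logfile_number_ under the DB mutex and then asks the WalManager for the matching live WAL. A minimal usage sketch, assuming the matching declaration on the public DB interface and the LogFile accessors from rocksdb/transaction_log.h:

    // Sketch: report the WAL file the DB is currently appending to.
    #include <cstdio>
    #include <memory>
    #include "rocksdb/db.h"
    #include "rocksdb/transaction_log.h"

    void PrintCurrentWal(rocksdb::DB* db) {
      std::unique_ptr<rocksdb::LogFile> wal;
      rocksdb::Status s = db->GetCurrentWalFile(&wal);
      if (s.ok() && wal != nullptr) {
        std::printf("live WAL %s (log #%llu, %llu bytes)\n",
                    wal->PathName().c_str(),
                    static_cast<unsigned long long>(wal->LogNumber()),
                    static_cast<unsigned long long>(wal->SizeFileBytes()));
      }
    }

Only the log number is read while holding the mutex; the file lookup itself happens outside the lock, which keeps the critical section short.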
+#include + +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" +#include "port/port.h" #include "port/stack_trace.h" -#include "util/fault_injection_test_env.h" -#include "util/sync_point.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/mutexlock.h" namespace rocksdb { @@ -290,6 +296,129 @@ TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) { Close(); } +TEST_F(DBFlushTest, CFDropRaceWithWaitForFlushMemTables) { + Options options = CurrentOptions(); + options.create_if_missing = true; + CreateAndReopenWithCF({"pikachu"}, options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:AfterScheduleFlush", + "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"}, + {"DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree", + "DBImpl::BackgroundCallFlush:start"}, + {"DBImpl::BackgroundCallFlush:start", + "DBImpl::FlushMemTable:BeforeWaitForBgFlush"}}); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(Put(1, "key", "value")); + auto* cfd = static_cast(handles_[1])->cfd(); + port::Thread drop_cf_thr([&]() { + TEST_SYNC_POINT( + "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"); + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); + handles_.resize(1); + TEST_SYNC_POINT( + "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree"); + }); + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + ASSERT_NOK(dbfull()->TEST_FlushMemTable(cfd, flush_opts)); + drop_cf_thr.join(); + Close(); + SyncPoint::GetInstance()->DisableProcessing(); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBFlushTest, FireOnFlushCompletedAfterCommittedResult) { + class TestListener : public EventListener { + public: + void OnFlushCompleted(DB* db, const FlushJobInfo& info) override { + // There's only one key in each flush. + ASSERT_EQ(info.smallest_seqno, info.largest_seqno); + ASSERT_NE(0, info.smallest_seqno); + if (info.smallest_seqno == seq1) { + // First flush completed + ASSERT_FALSE(completed1); + completed1 = true; + CheckFlushResultCommitted(db, seq1); + } else { + // Second flush completed + ASSERT_FALSE(completed2); + completed2 = true; + ASSERT_EQ(info.smallest_seqno, seq2); + CheckFlushResultCommitted(db, seq2); + } + } + + void CheckFlushResultCommitted(DB* db, SequenceNumber seq) { + DBImpl* db_impl = static_cast_with_check(db); + InstrumentedMutex* mutex = db_impl->mutex(); + mutex->Lock(); + auto* cfd = + reinterpret_cast(db->DefaultColumnFamily()) + ->cfd(); + ASSERT_LT(seq, cfd->imm()->current()->GetEarliestSequenceNumber()); + mutex->Unlock(); + } + + std::atomic seq1{0}; + std::atomic seq2{0}; + std::atomic completed1{false}; + std::atomic completed2{false}; + }; + std::shared_ptr listener = std::make_shared(); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:start", + "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst"}, + {"DBImpl::FlushMemTableToOutputFile:Finish", + "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitSecond"}}); + SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table", [&listener](void* arg) { + // Wait for the second flush finished, out of mutex. 
+ auto* mems = reinterpret_cast*>(arg); + if (mems->front()->GetEarliestSequenceNumber() == listener->seq1 - 1) { + TEST_SYNC_POINT( + "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:" + "WaitSecond"); + } + }); + + Options options = CurrentOptions(); + options.create_if_missing = true; + options.listeners.push_back(listener); + // Setting max_flush_jobs = max_background_jobs / 4 = 2. + options.max_background_jobs = 8; + // Allow 2 immutable memtables. + options.max_write_buffer_number = 3; + Reopen(options); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put("foo", "v")); + listener->seq1 = db_->GetLatestSequenceNumber(); + // t1 will wait for the second flush complete before committing flush result. + auto t1 = port::Thread([&]() { + // flush_opts.wait = true + ASSERT_OK(db_->Flush(FlushOptions())); + }); + // Wait for first flush started. + TEST_SYNC_POINT( + "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst"); + // The second flush will exit early without commit its result. The work + // is delegated to the first flush. + ASSERT_OK(Put("bar", "v")); + listener->seq2 = db_->GetLatestSequenceNumber(); + FlushOptions flush_opts; + flush_opts.wait = false; + ASSERT_OK(db_->Flush(flush_opts)); + t1.join(); + ASSERT_TRUE(listener->completed1); + ASSERT_TRUE(listener->completed2); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} +#endif // !ROCKSDB_LITE + TEST_P(DBAtomicFlushTest, ManualAtomicFlush) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -431,7 +560,7 @@ TEST_P(DBAtomicFlushTest, FlushMultipleCFs_DropSomeBeforeRequestFlush) { cf_ids.push_back(cf_id); } ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); - ASSERT_TRUE(Flush(cf_ids).IsShutdownInProgress()); + ASSERT_TRUE(Flush(cf_ids).IsColumnFamilyDropped()); Destroy(options); } @@ -514,6 +643,80 @@ TEST_P(DBAtomicFlushTest, TriggerFlushAndClose) { ASSERT_EQ("value", Get(0, "key")); } +TEST_P(DBAtomicFlushTest, PickMemtablesRaceWithBackgroundFlush) { + bool atomic_flush = GetParam(); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + options.max_write_buffer_number = 4; + // Set min_write_buffer_number_to_merge to be greater than 1, so that + // a column family with one memtable in the imm will not cause IsFlushPending + // to return true when flush_requested_ is false. + options.min_write_buffer_number_to_merge = 2; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(dbfull()->PauseBackgroundWork()); + ASSERT_OK(Put(0, "key00", "value00")); + ASSERT_OK(Put(1, "key10", "value10")); + FlushOptions flush_opts; + flush_opts.wait = false; + ASSERT_OK(dbfull()->Flush(flush_opts, handles_)); + ASSERT_OK(Put(0, "key01", "value01")); + // Since max_write_buffer_number is 4, the following flush won't cause write + // stall. 
+ ASSERT_OK(dbfull()->Flush(flush_opts)); + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); + handles_[1] = nullptr; + ASSERT_OK(dbfull()->ContinueBackgroundWork()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); + delete handles_[0]; + handles_.clear(); +} + +TEST_P(DBAtomicFlushTest, CFDropRaceWithWaitForFlushMemTables) { + bool atomic_flush = GetParam(); + if (!atomic_flush) { + return; + } + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + CreateAndReopenWithCF({"pikachu"}, options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush", + "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"}, + {"DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree", + "DBImpl::BackgroundCallFlush:start"}, + {"DBImpl::BackgroundCallFlush:start", + "DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush"}}); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(Put(0, "key", "value")); + ASSERT_OK(Put(1, "key", "value")); + auto* cfd_default = + static_cast(dbfull()->DefaultColumnFamily()) + ->cfd(); + auto* cfd_pikachu = static_cast(handles_[1])->cfd(); + port::Thread drop_cf_thr([&]() { + TEST_SYNC_POINT( + "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"); + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + delete handles_[1]; + handles_.resize(1); + TEST_SYNC_POINT( + "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree"); + }); + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + ASSERT_OK(dbfull()->TEST_AtomicFlushMemTables({cfd_default, cfd_pikachu}, + flush_opts)); + drop_cf_thr.join(); + Close(); + SyncPoint::GetInstance()->DisableProcessing(); +} + INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest, testing::Bool()); diff --git a/db/db_impl.cc b/db/db_impl/db_impl.cc similarity index 78% rename from db/db_impl.cc rename to db/db_impl/db_impl.cc index c6268d0cb80..ee73cc3fd75 100644 --- a/db/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -6,17 +6,15 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
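Editor's note: the flush and column-family race tests above all lean on the same sync-point machinery, so a condensed sketch of the ordering contract may help. LoadDependency takes {predecessor, successor} pairs, and a thread hitting the successor point blocks until some thread has passed the predecessor. The point names below are invented for illustration; the facility lives in test_util/sync_point.h and compiles to no-ops in NDEBUG builds.

    // Sketch: force "second" to run only after "first", across threads.
    #include "port/port.h"
    #include "test_util/sync_point.h"

    void OrderingSketch() {
      rocksdb::SyncPoint::GetInstance()->LoadDependency(
          {{"Sketch:First", "Sketch:Second"}});
      rocksdb::SyncPoint::GetInstance()->EnableProcessing();

      rocksdb::port::Thread waiter([&]() {
        // Blocks here until the main thread reaches "Sketch:First".
        TEST_SYNC_POINT("Sketch:Second");
        // ... the work that must happen second ...
      });

      // ... the work that must happen first ...
      TEST_SYNC_POINT("Sketch:First");

      waiter.join();
      rocksdb::SyncPoint::GetInstance()->DisableProcessing();
    }

That is exactly the shape of CFDropRaceWithWaitForFlushMemTables above: the drop thread is held back until the flush has been scheduled, and the background flush is held back until the handle has been freed, which is what makes the race deterministic.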
-#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif #include #ifdef OS_SOLARIS #include #endif #include +#include #include #include #include @@ -27,8 +25,9 @@ #include #include +#include "db/arena_wrapped_db_iter.h" #include "db/builder.h" -#include "db/compaction_job.h" +#include "db/compaction/compaction_job.h" #include "db/db_info_dumper.h" #include "db/db_iter.h" #include "db/dbformat.h" @@ -37,7 +36,7 @@ #include "db/external_sst_file_ingestion_job.h" #include "db/flush_job.h" #include "db/forward_iterator.h" -#include "db/in_memory_stats_history.h" +#include "db/import_column_family_job.h" #include "db/job_context.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -53,10 +52,19 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "db/write_callback.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "file/random_access_file_reader.h" +#include "file/sst_file_manager_impl.h" +#include "logging/auto_roll_logger.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "memtable/hash_skiplist_rep.h" +#include "monitoring/in_memory_stats_history.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" +#include "monitoring/persistent_stats_history.h" #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" #include "options/cf_options.h" @@ -74,33 +82,29 @@ #include "rocksdb/status.h" #include "rocksdb/table.h" #include "rocksdb/write_buffer_manager.h" -#include "table/block.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" #include "table/get_context.h" #include "table/merging_iterator.h" #include "table/multiget_context.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" +#include "test_util/sync_point.h" #include "tools/sst_dump_tool_imp.h" -#include "util/auto_roll_logger.h" #include "util/autovector.h" #include "util/build_version.h" +#include "util/cast_util.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" -#include "util/file_reader_writer.h" -#include "util/file_util.h" -#include "util/filename.h" -#include "util/log_buffer.h" -#include "util/logging.h" #include "util/mutexlock.h" -#include "util/sst_file_manager_impl.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { const std::string kDefaultColumnFamilyName("default"); +const std::string kPersistentStatsColumnFamilyName( + "___rocksdb_stats_history___"); void DumpRocksDBBuildVersion(Logger* log); CompressionType GetCompressionFlush( @@ -161,10 +165,12 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, batch_per_txn_(batch_per_txn), db_lock_(nullptr), shutting_down_(false), + manual_compaction_paused_(false), bg_cv_(&mutex_), logfile_number_(0), log_dir_synced_(false), log_empty_(true), + persist_stats_cf_handle_(nullptr), log_sync_cv_(&mutex_), total_log_size_(0), is_snapshot_supported_(true), @@ -235,12 +241,15 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, const int table_cache_size = (mutable_db_options_.max_open_files == -1) ? 
TableCache::kInfiniteCapacity : mutable_db_options_.max_open_files - 10; - table_cache_ = NewLRUCache(table_cache_size, - immutable_db_options_.table_cache_numshardbits); + LRUCacheOptions co; + co.capacity = table_cache_size; + co.num_shard_bits = immutable_db_options_.table_cache_numshardbits; + co.metadata_charge_policy = kDontChargeCacheMetadata; + table_cache_ = NewLRUCache(co); versions_.reset(new VersionSet(dbname_, &immutable_db_options_, env_options_, table_cache_.get(), write_buffer_manager_, - &write_controller_)); + &write_controller_, &block_cache_tracer_)); column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); @@ -468,6 +477,7 @@ Status DBImpl::CloseHelper() { &files_grabbed_for_purge_); EraseThreadStatusDbInfo(); flush_scheduler_.Clear(); + trim_history_scheduler_.Clear(); while (!flush_queue_.empty()) { const FlushRequest& flush_req = PopFirstFromFlushQueue(); @@ -485,10 +495,17 @@ Status DBImpl::CloseHelper() { } } - if (default_cf_handle_ != nullptr) { + if (default_cf_handle_ != nullptr || persist_stats_cf_handle_ != nullptr) { // we need to delete handle outside of lock because it does its own locking mutex_.Unlock(); - delete default_cf_handle_; + if (default_cf_handle_) { + delete default_cf_handle_; + default_cf_handle_ = nullptr; + } + if (persist_stats_cf_handle_) { + delete persist_stats_cf_handle_; + persist_stats_cf_handle_ = nullptr; + } mutex_.Lock(); } @@ -582,6 +599,12 @@ Status DBImpl::CloseHelper() { ret = s; } } + if (ret.IsAborted()) { + // Reserve IsAborted() error for those where users didn't release + // certain resource and they can release them and come back and + // retry. In this case, we wrap this exception to something else. + return Status::Incomplete(ret.ToString()); + } return ret; } @@ -631,7 +654,7 @@ void DBImpl::StartTimedTasks() { if (!thread_dump_stats_) { thread_dump_stats_.reset(new rocksdb::RepeatableThread( [this]() { DBImpl::DumpStats(); }, "dump_st", env_, - stats_dump_period_sec * 1000000)); + static_cast(stats_dump_period_sec) * kMicrosInSecond)); } } stats_persist_period_sec = mutable_db_options_.stats_persist_period_sec; @@ -639,14 +662,14 @@ void DBImpl::StartTimedTasks() { if (!thread_persist_stats_) { thread_persist_stats_.reset(new rocksdb::RepeatableThread( [this]() { DBImpl::PersistStats(); }, "pst_st", env_, - stats_persist_period_sec * 1000000)); + static_cast(stats_persist_period_sec) * kMicrosInSecond)); } } } } // esitmate the total size of stats_history_ -size_t DBImpl::EstiamteStatsHistorySize() const { +size_t DBImpl::EstimateInMemoryStatsHistorySize() const { size_t size_total = sizeof(std::map>); if (stats_history_.size() == 0) return size_total; @@ -668,7 +691,7 @@ void DBImpl::PersistStats() { if (shutdown_initiated_) { return; } - uint64_t now_micros = env_->NowMicros(); + uint64_t now_seconds = env_->NowMicros() / kMicrosInSecond; Statistics* statistics = immutable_db_options_.statistics.get(); if (!statistics) { return; @@ -679,12 +702,50 @@ void DBImpl::PersistStats() { stats_history_size_limit = mutable_db_options_.stats_history_buffer_size; } - // TODO(Zhongyi): also persist immutable_db_options_.statistics - { - std::map stats_map; - if (!statistics->getTickerMap(&stats_map)) { - return; + std::map stats_map; + if (!statistics->getTickerMap(&stats_map)) { + return; + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "------- PERSISTING STATS -------"); + + if (immutable_db_options_.persist_stats_to_disk) { + WriteBatch batch; + if (stats_slice_initialized_) 
{ + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Reading %" ROCKSDB_PRIszt " stats from statistics\n", + stats_slice_.size()); + for (const auto& stat : stats_map) { + char key[100]; + int length = + EncodePersistentStatsKey(now_seconds, stat.first, 100, key); + // calculate the delta from last time + if (stats_slice_.find(stat.first) != stats_slice_.end()) { + uint64_t delta = stat.second - stats_slice_[stat.first]; + batch.Put(persist_stats_cf_handle_, Slice(key, std::min(100, length)), + ToString(delta)); + } + } } + stats_slice_initialized_ = true; + std::swap(stats_slice_, stats_map); + WriteOptions wo; + wo.low_pri = true; + wo.no_slowdown = true; + wo.sync = false; + Status s = Write(wo, &batch); + if (!s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Writing to persistent stats CF failed -- %s", + s.ToString().c_str()); + } else { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Writing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64 + " to persistent stats CF succeeded", + stats_slice_.size(), now_seconds); + } + // TODO(Zhongyi): add purging for persisted data + } else { InstrumentedMutexLock l(&stats_history_mutex_); // calculate the delta from last time if (stats_slice_initialized_) { @@ -694,20 +755,33 @@ void DBImpl::PersistStats() { stats_delta[stat.first] = stat.second - stats_slice_[stat.first]; } } - stats_history_[now_micros] = stats_delta; + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Storing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64 + " to in-memory stats history", + stats_slice_.size(), now_seconds); + stats_history_[now_seconds] = stats_delta; } stats_slice_initialized_ = true; std::swap(stats_slice_, stats_map); TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied"); // delete older stats snapshots to control memory consumption - bool purge_needed = EstiamteStatsHistorySize() > stats_history_size_limit; + size_t stats_history_size = EstimateInMemoryStatsHistorySize(); + bool purge_needed = stats_history_size > stats_history_size_limit; + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[Pre-GC] In-memory stats history size: %" ROCKSDB_PRIszt + " bytes, slice count: %" ROCKSDB_PRIszt, + stats_history_size, stats_history_.size()); while (purge_needed && !stats_history_.empty()) { stats_history_.erase(stats_history_.begin()); - purge_needed = EstiamteStatsHistorySize() > stats_history_size_limit; + purge_needed = + EstimateInMemoryStatsHistorySize() > stats_history_size_limit; } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[Post-GC] In-memory stats history size: %" ROCKSDB_PRIszt + " bytes, slice count: %" ROCKSDB_PRIszt, + stats_history_size, stats_history_.size()); } - // TODO: persist stats to disk #endif // !ROCKSDB_LITE } @@ -738,8 +812,13 @@ Status DBImpl::GetStatsHistory( if (!stats_iterator) { return Status::InvalidArgument("stats_iterator not preallocated."); } - stats_iterator->reset( - new InMemoryStatsHistoryIterator(start_time, end_time, this)); + if (immutable_db_options_.persist_stats_to_disk) { + stats_iterator->reset( + new PersistentStatsHistoryIterator(start_time, end_time, this)); + } else { + stats_iterator->reset( + new InMemoryStatsHistoryIterator(start_time, end_time, this)); + } return (*stats_iterator)->status(); } @@ -792,6 +871,24 @@ void DBImpl::DumpStats() { PrintStatistics(); } +Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family, + int max_entries_to_print, + std::string* out_str) { + auto* cfh = + static_cast_with_check( + column_family); + ColumnFamilyData* cfd = 
cfh->cfd(); + + SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); + Version* version = super_version->current; + + Status s = + version->TablesRangeTombstoneSummary(max_entries_to_print, out_str); + + CleanupSuperVersion(super_version); + return s; +} + void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) { if (!job_context->logs_to_free.empty()) { for (auto l : job_context->logs_to_free) { @@ -811,16 +908,6 @@ Directory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const { return ret_dir; } -Directory* DBImpl::Directories::GetDataDir(size_t path_id) const { - assert(path_id < data_dirs_.size()); - Directory* ret_dir = data_dirs_[path_id].get(); - if (ret_dir == nullptr) { - // Should use db_dir_ - return db_dir_.get(); - } - return ret_dir; -} - Status DBImpl::SetOptions( ColumnFamilyHandle* column_family, const std::unordered_map& options_map) { @@ -842,8 +929,9 @@ Status DBImpl::SetOptions( Status persist_options_status; SuperVersionContext sv_context(/* create_superversion */ true); { + auto db_options = GetDBOptions(); InstrumentedMutexLock l(&mutex_); - s = cfd->SetOptions(options_map); + s = cfd->SetOptions(db_options, options_map); if (s.ok()) { new_options = *cfd->GetLatestMutableCFOptions(); // Append new version to recompute compaction score. @@ -906,6 +994,25 @@ Status DBImpl::SetDBOptions( InstrumentedMutexLock l(&mutex_); s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map, &new_options); + if (new_options.bytes_per_sync == 0) { + new_options.bytes_per_sync = 1024 * 1024; + } + DBOptions new_db_options = + BuildDBOptions(immutable_db_options_, new_options); + if (s.ok()) { + s = ValidateOptions(new_db_options); + } + if (s.ok()) { + for (auto c : *versions_->GetColumnFamilySet()) { + if (!c->IsDropped()) { + auto cf_options = c->GetLatestCFOptions(); + s = ColumnFamilyData::ValidateOptions(new_db_options, cf_options); + if (!s.ok()) { + break; + } + } + } + } if (s.ok()) { if (new_options.max_background_compactions > mutable_db_options_.max_background_compactions) { @@ -923,7 +1030,8 @@ Status DBImpl::SetDBOptions( if (new_options.stats_dump_period_sec > 0) { thread_dump_stats_.reset(new rocksdb::RepeatableThread( [this]() { DBImpl::DumpStats(); }, "dump_st", env_, - new_options.stats_dump_period_sec * 1000000)); + static_cast(new_options.stats_dump_period_sec) * + kMicrosInSecond)); } else { thread_dump_stats_.reset(); } @@ -938,7 +1046,8 @@ Status DBImpl::SetDBOptions( if (new_options.stats_persist_period_sec > 0) { thread_persist_stats_.reset(new rocksdb::RepeatableThread( [this]() { DBImpl::PersistStats(); }, "pst_st", env_, - new_options.stats_persist_period_sec * 1000000)); + static_cast(new_options.stats_persist_period_sec) * + kMicrosInSecond)); } else { thread_persist_stats_.reset(); } @@ -950,15 +1059,12 @@ Status DBImpl::SetDBOptions( : new_options.max_open_files - 10); wal_changed = mutable_db_options_.wal_bytes_per_sync != new_options.wal_bytes_per_sync; - if (new_options.bytes_per_sync == 0) { - new_options.bytes_per_sync = 1024 * 1024; - } mutable_db_options_ = new_options; - env_options_for_compaction_ = EnvOptions( - BuildDBOptions(immutable_db_options_, mutable_db_options_)); + env_options_for_compaction_ = EnvOptions(new_db_options); env_options_for_compaction_ = env_->OptimizeForCompactionTableWrite( env_options_for_compaction_, immutable_db_options_); versions_->ChangeEnvOptions(mutable_db_options_); + //TODO(xiez): clarify why apply optimize for read to write options env_options_for_compaction_ 
= env_->OptimizeForCompactionTableRead( env_options_for_compaction_, immutable_db_options_); env_options_for_compaction_.compaction_readahead_size = @@ -1026,10 +1132,13 @@ int DBImpl::FindMinimumEmptyLevelFitting( Status DBImpl::FlushWAL(bool sync) { if (manual_wal_flush_) { - // We need to lock log_write_mutex_ since logs_ might change concurrently - InstrumentedMutexLock wl(&log_write_mutex_); - log::Writer* cur_log_writer = logs_.back().writer; - auto s = cur_log_writer->WriteBuffer(); + Status s; + { + // We need to lock log_write_mutex_ since logs_ might change concurrently + InstrumentedMutexLock wl(&log_write_mutex_); + log::Writer* cur_log_writer = logs_.back().writer; + s = cur_log_writer->WriteBuffer(); + } if (!s.ok()) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s", s.ToString().c_str()); @@ -1205,32 +1314,28 @@ void DBImpl::SchedulePurge() { void DBImpl::BackgroundCallPurge() { mutex_.Lock(); - // We use one single loop to clear both queues so that after existing the loop - // both queues are empty. This is stricter than what is needed, but can make - // it easier for us to reason the correctness. - while (!purge_queue_.empty() || !logs_to_free_queue_.empty()) { - // Check logs_to_free_queue_ first and close log writers. - if (!logs_to_free_queue_.empty()) { - assert(!logs_to_free_queue_.empty()); - log::Writer* log_writer = *(logs_to_free_queue_.begin()); - logs_to_free_queue_.pop_front(); - mutex_.Unlock(); - delete log_writer; - mutex_.Lock(); - } else { - auto purge_file = purge_queue_.begin(); - auto fname = purge_file->fname; - auto dir_to_sync = purge_file->dir_to_sync; - auto type = purge_file->type; - auto number = purge_file->number; - auto job_id = purge_file->job_id; - purge_queue_.pop_front(); + while (!logs_to_free_queue_.empty()) { + assert(!logs_to_free_queue_.empty()); + log::Writer* log_writer = *(logs_to_free_queue_.begin()); + logs_to_free_queue_.pop_front(); + mutex_.Unlock(); + delete log_writer; + mutex_.Lock(); + } + for (const auto& file : purge_files_) { + const PurgeFileInfo& purge_file = file.second; + const std::string& fname = purge_file.fname; + const std::string& dir_to_sync = purge_file.dir_to_sync; + FileType type = purge_file.type; + uint64_t number = purge_file.number; + int job_id = purge_file.job_id; - mutex_.Unlock(); - DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number); - mutex_.Lock(); - } + mutex_.Unlock(); + DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number); + mutex_.Lock(); } + purge_files_.clear(); + bg_purge_scheduled_--; bg_cv_.SignalAll(); @@ -1350,22 +1455,29 @@ ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { return default_cf_handle_; } +ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const { + return persist_stats_cf_handle_; +} + Status DBImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { - return GetImpl(read_options, column_family, key, value); + GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = value; + return GetImpl(read_options, key, get_impl_options); } -Status DBImpl::GetImpl(const ReadOptions& read_options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* pinnable_val, bool* value_found, - ReadCallback* callback, bool* is_blob_index) { - assert(pinnable_val != nullptr); +Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, + GetImplOptions get_impl_options) { + 
assert(get_impl_options.value != nullptr || + get_impl_options.merge_operands != nullptr); PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); StopWatch sw(env_, stats_, DB_GET); PERF_TIMER_GUARD(get_snapshot_time); - auto cfh = reinterpret_cast(column_family); + auto cfh = + reinterpret_cast(get_impl_options.column_family); auto cfd = cfh->cfd(); if (tracer_) { @@ -1373,7 +1485,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, // tracing is enabled. InstrumentedMutexLock lock(&trace_mutex_); if (tracer_) { - tracer_->Get(column_family, key); + tracer_->Get(get_impl_options.column_family, key); } } @@ -1385,9 +1497,9 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, SequenceNumber snapshot; if (read_options.snapshot != nullptr) { - if (callback) { + if (get_impl_options.callback) { // Already calculated based on read_options.snapshot - snapshot = callback->max_visible_seq(); + snapshot = get_impl_options.callback->max_visible_seq(); } else { snapshot = reinterpret_cast(read_options.snapshot)->number_; @@ -1401,8 +1513,23 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, snapshot = last_seq_same_as_publish_seq_ ? versions_->LastSequence() : versions_->LastPublishedSequence(); - if (callback) { - callback->Refresh(snapshot); + if (get_impl_options.callback) { + // The unprep_seqs are not published for write unprepared, so it could be + // that max_visible_seq is larger. Seek to the std::max of the two. + // However, we still want our callback to contain the actual snapshot so + // that it can do the correct visibility filtering. + get_impl_options.callback->Refresh(snapshot); + + // Internally, WriteUnpreparedTxnReadCallback::Refresh would set + // max_visible_seq = max(max_visible_seq, snapshot) + // + // Currently, the commented out assert is broken by + // InvalidSnapshotReadCallback, but if write unprepared recovery followed + // the regular transaction flow, then this special read callback would not + // be needed. + // + // assert(callback->max_visible_seq() >= snapshot); + snapshot = get_impl_options.callback->max_visible_seq(); } } TEST_SYNC_POINT("DBImpl::GetImpl:3"); @@ -1416,26 +1543,46 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, // First look in the memtable, then in the immutable memtable (if any). // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. 
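Editor's note: GetImpl now receives a GetImplOptions bundle rather than a growing list of optional arguments, and besides the ordinary value lookup, the branches added just below can return the raw, unmerged operands for a key. A hedged sketch of the public call this plumbing serves, as wired up by this patch (the key and the operand-count hint are illustrative):

    // Sketch: read back the merge operands for a key without merging them.
    #include <vector>
    #include "rocksdb/db.h"

    rocksdb::Status DumpOperands(rocksdb::DB* db, const rocksdb::Slice& key) {
      rocksdb::GetMergeOperandsOptions opts;
      opts.expected_max_number_of_operands = 16;  // capacity hint, illustrative
      std::vector<rocksdb::PinnableSlice> operands(
          opts.expected_max_number_of_operands);
      int found = 0;
      rocksdb::Status s = db->GetMergeOperands(
          rocksdb::ReadOptions(), db->DefaultColumnFamily(), key,
          operands.data(), &opts, &found);
      // On success operands[0..found) hold the raw operands; if the key has
      // more operands than the hint allows, Status::Incomplete comes back,
      // matching the KMergeOperandsInsufficientCapacity subcode set below.
      return s;
    }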
- LookupKey lkey(key, snapshot); + LookupKey lkey(key, snapshot, read_options.timestamp); PERF_TIMER_STOP(get_snapshot_time); bool skip_memtable = (read_options.read_tier == kPersistedTier && has_unpersisted_data_.load(std::memory_order_relaxed)); bool done = false; if (!skip_memtable) { - if (sv->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, - &max_covering_tombstone_seq, read_options, callback, - is_blob_index)) { - done = true; - pinnable_val->PinSelf(); - RecordTick(stats_, MEMTABLE_HIT); - } else if ((s.ok() || s.IsMergeInProgress()) && - sv->imm->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, - &max_covering_tombstone_seq, read_options, callback, - is_blob_index)) { - done = true; - pinnable_val->PinSelf(); - RecordTick(stats_, MEMTABLE_HIT); + // Get value associated with key + if (get_impl_options.get_value) { + if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), &s, + &merge_context, &max_covering_tombstone_seq, + read_options, get_impl_options.callback, + get_impl_options.is_blob_index)) { + done = true; + get_impl_options.value->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + sv->imm->Get(lkey, get_impl_options.value->GetSelf(), &s, + &merge_context, &max_covering_tombstone_seq, + read_options, get_impl_options.callback, + get_impl_options.is_blob_index)) { + done = true; + get_impl_options.value->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } + } else { + // Get Merge Operands associated with key, Merge Operands should not be + // merged and raw values should be returned to the user. + if (sv->mem->Get(lkey, nullptr, &s, &merge_context, + &max_covering_tombstone_seq, read_options, nullptr, + nullptr, false)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + sv->imm->GetMergeOperands(lkey, &s, &merge_context, + &max_covering_tombstone_seq, + read_options)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } } if (!done && !s.ok() && !s.IsMergeInProgress()) { ReturnAndCleanupSuperVersion(cfd, sv); @@ -1444,9 +1591,14 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, } if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); - sv->current->Get(read_options, lkey, pinnable_val, &s, &merge_context, - &max_covering_tombstone_seq, value_found, nullptr, nullptr, - callback, is_blob_index); + sv->current->Get( + read_options, lkey, get_impl_options.value, &s, &merge_context, + &max_covering_tombstone_seq, + get_impl_options.get_value ? get_impl_options.value_found : nullptr, + nullptr, nullptr, + get_impl_options.get_value ? get_impl_options.callback : nullptr, + get_impl_options.get_value ? 
get_impl_options.is_blob_index : nullptr, + get_impl_options.get_value); RecordTick(stats_, MEMTABLE_MISS); } @@ -1458,7 +1610,25 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, RecordTick(stats_, NUMBER_KEYS_READ); size_t size = 0; if (s.ok()) { - size = pinnable_val->size(); + if (get_impl_options.get_value) { + size = get_impl_options.value->size(); + } else { + // Return all merge operands for get_impl_options.key + *get_impl_options.number_of_operands = + static_cast(merge_context.GetNumOperands()); + if (*get_impl_options.number_of_operands > + get_impl_options.get_merge_operands_options + ->expected_max_number_of_operands) { + s = Status::Incomplete( + Status::SubCode::KMergeOperandsInsufficientCapacity); + } else { + for (const Slice& sl : merge_context.GetOperands()) { + size += sl.size(); + get_impl_options.merge_operands->PinSelf(sl); + get_impl_options.merge_operands++; + } + } + } RecordTick(stats_, BYTES_READ, size); PERF_COUNTER_ADD(get_read_bytes, size); } @@ -1475,14 +1645,9 @@ std::vector DBImpl::MultiGet( StopWatch sw(env_, stats_, DB_MULTIGET); PERF_TIMER_GUARD(get_snapshot_time); - SequenceNumber snapshot; + SequenceNumber consistent_seqnum; + ; - struct MultiGetColumnFamilyData { - ColumnFamilyData* cfd; - SuperVersion* super_version; - MultiGetColumnFamilyData(ColumnFamilyData* cf, SuperVersion* sv) - : cfd(cf), super_version(sv) {} - }; std::unordered_map multiget_cf_data( column_family.size()); for (auto cf : column_family) { @@ -1490,86 +1655,20 @@ std::vector DBImpl::MultiGet( auto cfd = cfh->cfd(); if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) { multiget_cf_data.emplace(cfd->GetID(), - MultiGetColumnFamilyData(cfd, nullptr)); + MultiGetColumnFamilyData(cfh, nullptr)); } } - bool last_try = false; - { - // If we end up with the same issue of memtable geting sealed during 2 - // consecutive retries, it means the write rate is very high. In that case - // its probably ok to take the mutex on the 3rd try so we can succeed for - // sure - static const int num_retries = 3; - for (auto i = 0; i < num_retries; ++i) { - last_try = (i == num_retries - 1); - bool retry = false; + std::function::iterator&)> + iter_deref_lambda = + [](std::unordered_map::iterator& + cf_iter) { return &cf_iter->second; }; - if (i > 0) { - for (auto mgd_iter = multiget_cf_data.begin(); - mgd_iter != multiget_cf_data.end(); ++mgd_iter) { - auto super_version = mgd_iter->second.super_version; - auto cfd = mgd_iter->second.cfd; - if (super_version != nullptr) { - ReturnAndCleanupSuperVersion(cfd, super_version); - } - mgd_iter->second.super_version = nullptr; - } - } - - if (read_options.snapshot == nullptr) { - if (last_try) { - TEST_SYNC_POINT("DBImpl::MultiGet::LastTry"); - // We're close to max number of retries. For the last retry, - // acquire the lock so we're sure to succeed - mutex_.Lock(); - } - snapshot = last_seq_same_as_publish_seq_ - ? 
versions_->LastSequence() - : versions_->LastPublishedSequence(); - } else { - snapshot = reinterpret_cast(read_options.snapshot) - ->number_; - } - - for (auto mgd_iter = multiget_cf_data.begin(); - mgd_iter != multiget_cf_data.end(); ++mgd_iter) { - if (!last_try) { - mgd_iter->second.super_version = - GetAndRefSuperVersion(mgd_iter->second.cfd); - } else { - mgd_iter->second.super_version = - mgd_iter->second.cfd->GetSuperVersion()->Ref(); - } - TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV"); - if (read_options.snapshot != nullptr || last_try) { - // If user passed a snapshot, then we don't care if a memtable is - // sealed or compaction happens because the snapshot would ensure - // that older key versions are kept around. If this is the last - // retry, then we have the lock so nothing bad can happen - continue; - } - // We could get the earliest sequence number for the whole list of - // memtables, which will include immutable memtables as well, but that - // might be tricky to maintain in case we decide, in future, to do - // memtable compaction. - if (!last_try) { - auto seq = - mgd_iter->second.super_version->mem->GetEarliestSequenceNumber(); - if (seq > snapshot) { - retry = true; - break; - } - } - } - if (!retry) { - if (last_try) { - mutex_.Unlock(); - } - break; - } - } - } + bool unref_only = + MultiCFSnapshot>( + read_options, nullptr, iter_deref_lambda, &multiget_cf_data, + &consistent_seqnum); // Contain a list of merge operations if merge occurs. MergeContext merge_context; @@ -1593,7 +1692,7 @@ std::vector DBImpl::MultiGet( Status& s = stat_list[i]; std::string* value = &(*values)[i]; - LookupKey lkey(keys[i], snapshot); + LookupKey lkey(keys[i], consistent_seqnum); auto cfh = reinterpret_cast(column_family[i]); SequenceNumber max_covering_tombstone_seq = 0; auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID()); @@ -1637,7 +1736,7 @@ std::vector DBImpl::MultiGet( for (auto mgd_iter : multiget_cf_data) { auto mgd = mgd_iter.second; - if (!last_try) { + if (!unref_only) { ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version); } else { mgd.cfd->GetSuperVersion()->Unref(); @@ -1654,107 +1753,333 @@ std::vector DBImpl::MultiGet( return stat_list; } +template +bool DBImpl::MultiCFSnapshot( + const ReadOptions& read_options, ReadCallback* callback, + std::function& + iter_deref_func, + T* cf_list, SequenceNumber* snapshot) { + PERF_TIMER_GUARD(get_snapshot_time); + + bool last_try = false; + if (cf_list->size() == 1) { + // Fast path for a single column family. We can simply get the thread loca + // super version + auto cf_iter = cf_list->begin(); + auto node = iter_deref_func(cf_iter); + node->super_version = GetAndRefSuperVersion(node->cfd); + if (read_options.snapshot != nullptr) { + // Note: In WritePrepared txns this is not necessary but not harmful + // either. Because prep_seq > snapshot => commit_seq > snapshot so if + // a snapshot is specified we should be fine with skipping seq numbers + // that are greater than that. + // + // In WriteUnprepared, we cannot set snapshot in the lookup key because we + // may skip uncommitted data that should be visible to the transaction for + // reading own writes. 
+ *snapshot = + static_cast(read_options.snapshot)->number_; + if (callback) { + *snapshot = std::max(*snapshot, callback->max_visible_seq()); + } + } else { + // Since we get and reference the super version before getting + // the snapshot number, without a mutex protection, it is possible + // that a memtable switch happened in the middle and not all the + // data for this snapshot is available. But it will contain all + // the data available in the super version we have, which is also + // a valid snapshot to read from. + // We shouldn't get snapshot before finding and referencing the super + // version because a flush happening in between may compact away data for + // the snapshot, but the snapshot is earlier than the data overwriting it, + // so users may see wrong results. + *snapshot = last_seq_same_as_publish_seq_ + ? versions_->LastSequence() + : versions_->LastPublishedSequence(); + } + } else { + // If we end up with the same issue of memtable geting sealed during 2 + // consecutive retries, it means the write rate is very high. In that case + // its probably ok to take the mutex on the 3rd try so we can succeed for + // sure + static const int num_retries = 3; + for (int i = 0; i < num_retries; ++i) { + last_try = (i == num_retries - 1); + bool retry = false; + + if (i > 0) { + for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end(); + ++cf_iter) { + auto node = iter_deref_func(cf_iter); + SuperVersion* super_version = node->super_version; + ColumnFamilyData* cfd = node->cfd; + if (super_version != nullptr) { + ReturnAndCleanupSuperVersion(cfd, super_version); + } + node->super_version = nullptr; + } + } + if (read_options.snapshot == nullptr) { + if (last_try) { + TEST_SYNC_POINT("DBImpl::MultiGet::LastTry"); + // We're close to max number of retries. For the last retry, + // acquire the lock so we're sure to succeed + mutex_.Lock(); + } + *snapshot = last_seq_same_as_publish_seq_ + ? versions_->LastSequence() + : versions_->LastPublishedSequence(); + } else { + *snapshot = reinterpret_cast(read_options.snapshot) + ->number_; + } + for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end(); + ++cf_iter) { + auto node = iter_deref_func(cf_iter); + if (!last_try) { + node->super_version = GetAndRefSuperVersion(node->cfd); + } else { + node->super_version = node->cfd->GetSuperVersion()->Ref(); + } + TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV"); + if (read_options.snapshot != nullptr || last_try) { + // If user passed a snapshot, then we don't care if a memtable is + // sealed or compaction happens because the snapshot would ensure + // that older key versions are kept around. If this is the last + // retry, then we have the lock so nothing bad can happen + continue; + } + // We could get the earliest sequence number for the whole list of + // memtables, which will include immutable memtables as well, but that + // might be tricky to maintain in case we decide, in future, to do + // memtable compaction. 
+ if (!last_try) { + SequenceNumber seq = + node->super_version->mem->GetEarliestSequenceNumber(); + if (seq > *snapshot) { + retry = true; + break; + } + } + } + if (!retry) { + if (last_try) { + mutex_.Unlock(); + } + break; + } + } + } + + // Keep track of bytes that we read for statistics-recording later + PERF_TIMER_STOP(get_snapshot_time); + + return last_try; +} + +void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input) { + if (num_keys == 0) { + return; + } + autovector key_context; + autovector sorted_keys; + sorted_keys.resize(num_keys); + for (size_t i = 0; i < num_keys; ++i) { + key_context.emplace_back(column_families[i], keys[i], &values[i], + &statuses[i]); + } + for (size_t i = 0; i < num_keys; ++i) { + sorted_keys[i] = &key_context[i]; + } + PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys); + + autovector + multiget_cf_data; + size_t cf_start = 0; + ColumnFamilyHandle* cf = sorted_keys[0]->column_family; + for (size_t i = 0; i < num_keys; ++i) { + KeyContext* key_ctx = sorted_keys[i]; + if (key_ctx->column_family != cf) { + multiget_cf_data.emplace_back( + MultiGetColumnFamilyData(cf, cf_start, i - cf_start, nullptr)); + cf_start = i; + cf = key_ctx->column_family; + } + } + { + // multiget_cf_data.emplace_back( + // MultiGetColumnFamilyData(cf, cf_start, num_keys - cf_start, nullptr)); + multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr); + } + std::function::iterator&)> + iter_deref_lambda = + [](autovector::iterator& cf_iter) { + return &(*cf_iter); + }; + + SequenceNumber consistent_seqnum; + bool unref_only = MultiCFSnapshot< + autovector>( + read_options, nullptr, iter_deref_lambda, &multiget_cf_data, + &consistent_seqnum); + + for (auto cf_iter = multiget_cf_data.begin(); + cf_iter != multiget_cf_data.end(); ++cf_iter) { + MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys, &sorted_keys, + cf_iter->super_version, consistent_seqnum, nullptr, nullptr); + if (!unref_only) { + ReturnAndCleanupSuperVersion(cf_iter->cfd, cf_iter->super_version); + } else { + cf_iter->cfd->GetSuperVersion()->Unref(); + } + } +} + +namespace { // Order keys by CF ID, followed by key contents struct CompareKeyContext { inline bool operator()(const KeyContext* lhs, const KeyContext* rhs) { - const Comparator* comparator = cfd->user_comparator(); + ColumnFamilyHandleImpl* cfh = + static_cast(lhs->column_family); + uint32_t cfd_id1 = cfh->cfd()->GetID(); + const Comparator* comparator = cfh->cfd()->user_comparator(); + cfh = static_cast(lhs->column_family); + uint32_t cfd_id2 = cfh->cfd()->GetID(); + + if (cfd_id1 < cfd_id2) { + return true; + } else if (cfd_id1 > cfd_id2) { + return false; + } + + // Both keys are from the same column family int cmp = comparator->Compare(*(lhs->key), *(rhs->key)); if (cmp < 0) { return true; } return false; } - const ColumnFamilyData* cfd; }; +} // anonymous namespace + +void DBImpl::PrepareMultiGetKeys( + size_t num_keys, bool sorted_input, + autovector* sorted_keys) { +#ifndef NDEBUG + if (sorted_input) { + for (size_t index = 0; index < sorted_keys->size(); ++index) { + if (index > 0) { + KeyContext* lhs = (*sorted_keys)[index - 1]; + KeyContext* rhs = (*sorted_keys)[index]; + ColumnFamilyHandleImpl* cfh = + reinterpret_cast(lhs->column_family); + uint32_t cfd_id1 = cfh->cfd()->GetID(); + const Comparator* comparator = cfh->cfd()->user_comparator(); + cfh = 
reinterpret_cast(lhs->column_family); + uint32_t cfd_id2 = cfh->cfd()->GetID(); + + assert(cfd_id1 <= cfd_id2); + if (cfd_id1 < cfd_id2) { + continue; + } + + // Both keys are from the same column family + int cmp = comparator->Compare(*(lhs->key), *(rhs->key)); + assert(cmp <= 0); + } + index++; + } + } +#endif + if (!sorted_input) { + CompareKeyContext sort_comparator; + std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys, + sort_comparator); + } +} + void DBImpl::MultiGet(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input) { autovector key_context; + autovector sorted_keys; + sorted_keys.resize(num_keys); for (size_t i = 0; i < num_keys; ++i) { - key_context.emplace_back(keys[i], &values[i], &statuses[i]); + key_context.emplace_back(column_family, keys[i], &values[i], &statuses[i]); } - - MultiGetImpl(read_options, column_family, key_context, sorted_input, nullptr, - nullptr); + for (size_t i = 0; i < num_keys; ++i) { + sorted_keys[i] = &key_context[i]; + } + PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys); + MultiGetWithCallback(read_options, column_family, nullptr, &sorted_keys); } -void DBImpl::MultiGetImpl( +void DBImpl::MultiGetWithCallback( const ReadOptions& read_options, ColumnFamilyHandle* column_family, - autovector& key_context, - bool sorted_input, ReadCallback* callback, bool* is_blob_index) { - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, DB_MULTIGET); - size_t num_keys = key_context.size(); - - PERF_TIMER_GUARD(get_snapshot_time); - - ColumnFamilyHandleImpl* cfh = - reinterpret_cast(column_family); - ColumnFamilyData* cfd = cfh->cfd(); - - autovector sorted_keys; - sorted_keys.resize(num_keys); - { - size_t index = 0; - for (KeyContext& key : key_context) { + ReadCallback* callback, + autovector* sorted_keys) { + std::array multiget_cf_data; + multiget_cf_data[0] = MultiGetColumnFamilyData(column_family, nullptr); + std::function::iterator&)> + iter_deref_lambda = + [](std::array::iterator& cf_iter) { + return &(*cf_iter); + }; + + size_t num_keys = sorted_keys->size(); + SequenceNumber consistent_seqnum; + bool unref_only = MultiCFSnapshot>( + read_options, callback, iter_deref_lambda, &multiget_cf_data, + &consistent_seqnum); #ifndef NDEBUG - if (index > 0 && sorted_input) { - KeyContext* lhs = &key_context[index-1]; - KeyContext* rhs = &key_context[index]; - const Comparator* comparator = cfd->user_comparator(); - int cmp = comparator->Compare(*(lhs->key), *(rhs->key)); - assert(cmp <= 0); - } -#endif - - sorted_keys[index] = &key; - index++; - } - if (!sorted_input) { - CompareKeyContext sort_comparator; - sort_comparator.cfd = cfd; - std::sort(sorted_keys.begin(), sorted_keys.begin() + index, - sort_comparator); - } + assert(!unref_only); +#else + // Silence unused variable warning + (void)unref_only; +#endif // NDEBUG + + if (callback && read_options.snapshot == nullptr) { + // The unprep_seqs are not published for write unprepared, so it could be + // that max_visible_seq is larger. Seek to the std::max of the two. + // However, we still want our callback to contain the actual snapshot so + // that it can do the correct visibility filtering. 
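Editor's note: PrepareMultiGetKeys and CompareKeyContext above order the key contexts by column family ID and then by that family's user comparator, so the batched path can process each family's keys as one contiguous run; callers that already provide keys in this order can pass sorted_input = true and skip the sort. A usage sketch of the array-based overload added by this patch (keys and handles are illustrative):

    // Sketch: one batched MultiGet spanning two column families.
    #include <vector>
    #include "rocksdb/db.h"

    void BatchedLookup(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf_a,
                       rocksdb::ColumnFamilyHandle* cf_b) {
      std::vector<rocksdb::Slice> keys = {"k1", "k2", "k3"};
      std::vector<rocksdb::ColumnFamilyHandle*> cfs = {cf_a, cf_a, cf_b};
      std::vector<rocksdb::PinnableSlice> values(keys.size());
      std::vector<rocksdb::Status> statuses(keys.size());
      // sorted_input = false: PrepareMultiGetKeys sorts by CF id, then by the
      // column family's comparator, before the batched lookups run.
      db->MultiGet(rocksdb::ReadOptions(), keys.size(), cfs.data(), keys.data(),
                   values.data(), statuses.data(), /*sorted_input=*/false);
    }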
+ callback->Refresh(consistent_seqnum); + + // Internally, WriteUnpreparedTxnReadCallback::Refresh would set + // max_visible_seq = max(max_visible_seq, snapshot) + // + // Currently, the commented out assert is broken by + // InvalidSnapshotReadCallback, but if write unprepared recovery followed + // the regular transaction flow, then this special read callback would not + // be needed. + // + // assert(callback->max_visible_seq() >= snapshot); + consistent_seqnum = callback->max_visible_seq(); } - // Keep track of bytes that we read for statistics-recording later - PERF_TIMER_STOP(get_snapshot_time); + MultiGetImpl(read_options, 0, num_keys, sorted_keys, + multiget_cf_data[0].super_version, consistent_seqnum, nullptr, + nullptr); + ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd, + multiget_cf_data[0].super_version); +} - // Acquire SuperVersion - SuperVersion* super_version = GetAndRefSuperVersion(cfd); - SequenceNumber snapshot; - if (read_options.snapshot != nullptr) { - // Note: In WritePrepared txns this is not necessary but not harmful - // either. Because prep_seq > snapshot => commit_seq > snapshot so if - // a snapshot is specified we should be fine with skipping seq numbers - // that are greater than that. - // - // In WriteUnprepared, we cannot set snapshot in the lookup key because we - // may skip uncommitted data that should be visible to the transaction for - // reading own writes. - snapshot = - reinterpret_cast(read_options.snapshot)->number_; - if (callback) { - snapshot = std::max(snapshot, callback->max_visible_seq()); - } - } else { - // Since we get and reference the super version before getting - // the snapshot number, without a mutex protection, it is possible - // that a memtable switch happened in the middle and not all the - // data for this snapshot is available. But it will contain all - // the data available in the super version we have, which is also - // a valid snapshot to read from. - // We shouldn't get snapshot before finding and referencing the super - // version because a flush happening in between may compact away data for - // the snapshot, but the snapshot is earlier than the data overwriting it, - // so users may see wrong results. - snapshot = last_seq_same_as_publish_seq_ - ? versions_->LastSequence() - : versions_->LastPublishedSequence(); - } +void DBImpl::MultiGetImpl( + const ReadOptions& read_options, size_t start_key, size_t num_keys, + autovector* sorted_keys, + SuperVersion* super_version, SequenceNumber snapshot, + ReadCallback* callback, bool* is_blob_index) { + PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); + StopWatch sw(env_, stats_, DB_MULTIGET); // For each of the given keys, apply the entire "get" process as follows: // First look in the memtable, then in the immutable memtable (if any). @@ -1765,49 +2090,34 @@ void DBImpl::MultiGetImpl( size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE) ? 
MultiGetContext::MAX_BATCH_SIZE : keys_left; - MultiGetContext ctx(&sorted_keys[num_keys - keys_left], batch_size, - snapshot); + MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left, + batch_size, snapshot); MultiGetRange range = ctx.GetMultiGetRange(); bool lookup_current = false; keys_left -= batch_size; for (auto mget_iter = range.begin(); mget_iter != range.end(); ++mget_iter) { - MergeContext& merge_context = mget_iter->merge_context; - merge_context.Clear(); - Status& s = *mget_iter->s; - PinnableSlice* value = mget_iter->value; - s = Status::OK(); + mget_iter->merge_context.Clear(); + *mget_iter->s = Status::OK(); + } - bool skip_memtable = - (read_options.read_tier == kPersistedTier && - has_unpersisted_data_.load(std::memory_order_relaxed)); - bool done = false; - if (!skip_memtable) { - if (super_version->mem->Get(*(mget_iter->lkey), value->GetSelf(), &s, - &merge_context, - &mget_iter->max_covering_tombstone_seq, - read_options, callback, is_blob_index)) { - done = true; - value->PinSelf(); - RecordTick(stats_, MEMTABLE_HIT); - } else if (super_version->imm->Get( - *(mget_iter->lkey), value->GetSelf(), &s, &merge_context, - &mget_iter->max_covering_tombstone_seq, read_options, - callback, is_blob_index)) { - done = true; - value->PinSelf(); - RecordTick(stats_, MEMTABLE_HIT); - } + bool skip_memtable = + (read_options.read_tier == kPersistedTier && + has_unpersisted_data_.load(std::memory_order_relaxed)); + if (!skip_memtable) { + super_version->mem->MultiGet(read_options, &range, callback, + is_blob_index); + if (!range.empty()) { + super_version->imm->MultiGet(read_options, &range, callback, + is_blob_index); } - if (done) { - range.MarkKeyDone(mget_iter); - } else { - RecordTick(stats_, MEMTABLE_MISS); + if (!range.empty()) { lookup_current = true; + uint64_t left = range.KeysLeft(); + RecordTick(stats_, MEMTABLE_MISS, left); } } - if (lookup_current) { PERF_TIMER_GUARD(get_from_output_files_time); super_version->current->MultiGet(read_options, &range, callback, @@ -1819,15 +2129,14 @@ void DBImpl::MultiGetImpl( PERF_TIMER_GUARD(get_post_process_time); size_t num_found = 0; uint64_t bytes_read = 0; - for (KeyContext& key : key_context) { - if (key.s->ok()) { - bytes_read += key.value->size(); + for (size_t i = start_key; i < start_key + num_keys; ++i) { + KeyContext* key = (*sorted_keys)[i]; + if (key->s->ok()) { + bytes_read += key->value->size(); num_found++; } } - ReturnAndCleanupSuperVersion(cfd, super_version); - RecordTick(stats_, NUMBER_MULTIGET_CALLS); RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys); RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found); @@ -1912,13 +2221,9 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, Status persist_options_status; *handle = nullptr; - s = CheckCompressionSupported(cf_options); - if (s.ok() && immutable_db_options_.allow_concurrent_memtable_write) { - s = CheckConcurrentWritesSupported(cf_options); - } - if (s.ok()) { - s = CheckCFPathsSupported(initial_db_options_, cf_options); - } + DBOptions db_options = + BuildDBOptions(immutable_db_options_, mutable_db_options_); + s = ColumnFamilyData::ValidateOptions(db_options, cf_options); if (s.ok()) { for (auto& cf_path : cf_options.cf_paths) { s = env_->CreateDirIfMissing(cf_path.path); @@ -2105,7 +2410,11 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options, ReadOptions roptions = read_options; roptions.read_tier = kBlockCacheTier; // read from block cache only PinnableSlice pinnable_val; - auto s = GetImpl(roptions, 
column_family, key, &pinnable_val, value_found); + GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = value_found; + auto s = GetImpl(roptions, key, get_impl_options); value->assign(pinnable_val.data(), pinnable_val.size()); // If block_cache is enabled and the index block of the table didn't @@ -2671,11 +2980,13 @@ void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family, ReturnAndCleanupSuperVersion(cfd, sv); } -void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, - const Range* range, int n, uint64_t* sizes, - uint8_t include_flags) { - assert(include_flags & DB::SizeApproximationFlags::INCLUDE_FILES || - include_flags & DB::SizeApproximationFlags::INCLUDE_MEMTABLES); +Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* range, int n, uint64_t* sizes) { + if (!options.include_memtabtles && !options.include_files) { + return Status::InvalidArgument("Invalid options"); + } + Version* v; auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); @@ -2687,16 +2998,19 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); sizes[i] = 0; - if (include_flags & DB::SizeApproximationFlags::INCLUDE_FILES) { - sizes[i] += versions_->ApproximateSize(v, k1.Encode(), k2.Encode()); + if (options.include_files) { + sizes[i] += versions_->ApproximateSize( + options, v, k1.Encode(), k2.Encode(), /*start_level=*/0, + /*end_level=*/-1, TableReaderCaller::kUserApproximateSize); } - if (include_flags & DB::SizeApproximationFlags::INCLUDE_MEMTABLES) { + if (options.include_memtabtles) { sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size; sizes[i] += sv->imm->ApproximateStats(k1.Encode(), k2.Encode()).size; } } ReturnAndCleanupSuperVersion(cfd, sv); + return Status::OK(); } std::list::iterator @@ -2711,8 +3025,11 @@ DBImpl::CaptureCurrentFileNumberInPendingOutputs() { } void DBImpl::ReleaseFileNumberFromPendingOutputs( - std::list::iterator v) { - pending_outputs_.erase(v); + std::unique_ptr::iterator>& v) { + if (v.get() != nullptr) { + pending_outputs_.erase(*v.get()); + v.reset(); + } } #ifndef ROCKSDB_LITE @@ -2924,7 +3241,19 @@ void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, assert(column_family); auto* cfd = reinterpret_cast(column_family)->cfd(); auto* sv = GetAndRefSuperVersion(cfd); - sv->current->GetColumnFamilyMetaData(cf_meta); + { + // Without mutex, Version::GetColumnFamilyMetaData will have data race with + // Compaction::MarkFilesBeingCompacted. One solution is to use mutex, but + // this may cause regression. An alternative is to make + // FileMetaData::being_compacted atomic, but it will make FileMetaData + // non-copy-able. Another option is to separate these variables from + // original FileMetaData struct, and this requires re-organization of data + // structures. For now, we take the easy approach. If + // DB::GetColumnFamilyMetaData is not called frequently, the regression + // should not be big. We still need to keep an eye on it. 
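Editor's note: GetApproximateSizes above becomes Status-returning and takes a SizeApproximationOptions struct instead of bit flags, rejecting the request when neither files nor memtables are included. A minimal sketch (the range endpoints are illustrative; note that include_memtabtles is the field's actual spelling in the struct):

    // Sketch: approximate on-disk plus in-memory size of one key range.
    #include <cstdint>
    #include "rocksdb/db.h"

    rocksdb::Status ApproxRangeSize(rocksdb::DB* db, uint64_t* size) {
      rocksdb::SizeApproximationOptions opts;
      opts.include_files = true;
      opts.include_memtabtles = true;  // sic, spelling as in the options struct
      rocksdb::Range r("a", "z");
      return db->GetApproximateSizes(opts, db->DefaultColumnFamily(), &r,
                                     /*n=*/1, size);
    }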
+ InstrumentedMutexLock l(&mutex_); + sv->current->GetColumnFamilyMetaData(cf_meta); + } ReturnAndCleanupSuperVersion(cfd, sv); } @@ -2934,6 +3263,7 @@ Status DBImpl::CheckConsistency() { mutex_.AssertHeld(); std::vector metadata; versions_->GetLiveFilesMetaData(&metadata); + TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData"); std::string corruption_messages; for (const auto& md : metadata) { @@ -2941,6 +3271,7 @@ Status DBImpl::CheckConsistency() { std::string file_path = md.db_path + md.name; uint64_t fsize = 0; + TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize"); Status s = env_->GetFileSize(file_path, &fsize); if (!s.ok() && env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) { @@ -2964,6 +3295,11 @@ Status DBImpl::CheckConsistency() { } Status DBImpl::GetDbIdentity(std::string& identity) const { + identity.assign(db_id_); + return Status::OK(); +} + +Status DBImpl::GetDbIdentityFromIdentityFile(std::string* identity) const { std::string idfilename = IdentityFileName(dbname_); const EnvOptions soptions; std::unique_ptr id_file_reader; @@ -2990,10 +3326,10 @@ Status DBImpl::GetDbIdentity(std::string& identity) const { if (!s.ok()) { return s; } - identity.assign(id.ToString()); + identity->assign(id.ToString()); // If last character is '\n' remove it from identity - if (identity.size() > 0 && identity.back() == '\n') { - identity.pop_back(); + if (identity->size() > 0 && identity->back() == '\n') { + identity->pop_back(); } return s; } @@ -3036,6 +3372,14 @@ DB::~DB() {} Status DBImpl::Close() { if (!closed_) { + { + InstrumentedMutexLock l(&mutex_); + // If there is unreleased snapshot, fail the close call + if (!snapshots_.empty()) { + return Status::Aborted("Cannot close DB with unreleased snapshot."); + } + } + closed_ = true; return CloseImpl(); } @@ -3055,6 +3399,7 @@ Status DestroyDB(const std::string& dbname, const Options& options, ImmutableDBOptions soptions(SanitizeOptions(dbname, options)); Env* env = soptions.env; std::vector filenames; + bool wal_in_db_path = IsWalDirSameAsDBPath(&soptions); // Reset the logger because it holds a handle to the // log file and prevents cleanup and directory removal @@ -3077,7 +3422,8 @@ Status DestroyDB(const std::string& dbname, const Options& options, if (type == kMetaDatabase) { del = DestroyDB(path_to_delete, options); } else if (type == kTableFile || type == kLogFile) { - del = DeleteDBFile(&soptions, path_to_delete, dbname); + del = DeleteDBFile(&soptions, path_to_delete, dbname, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); } else { del = env->DeleteFile(path_to_delete); } @@ -3111,7 +3457,8 @@ Status DestroyDB(const std::string& dbname, const Options& options, if (ParseFileName(fname, &number, &type) && type == kTableFile) { // Lock file will be deleted at end std::string table_path = path + "/" + fname; - Status del = DeleteDBFile(&soptions, table_path, dbname); + Status del = DeleteDBFile(&soptions, table_path, dbname, + /*force_bg=*/false, /*force_fg=*/false); if (result.ok() && !del.ok()) { result = del; } @@ -3138,7 +3485,8 @@ Status DestroyDB(const std::string& dbname, const Options& options, for (const auto& file : archiveFiles) { if (ParseFileName(file, &number, &type) && type == kLogFile) { Status del = - DeleteDBFile(&soptions, archivedir + "/" + file, archivedir); + DeleteDBFile(&soptions, archivedir + "/" + file, archivedir, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); if (result.ok() && !del.ok()) { result = del; } @@ -3153,7 +3501,8 @@ Status 
DestroyDB(const std::string& dbname, const Options& options, if (ParseFileName(file, &number, &type) && type == kLogFile) { Status del = DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number), - soptions.wal_dir); + soptions.wal_dir, /*force_bg=*/false, + /*force_fg=*/!wal_in_db_path); if (result.ok() && !del.ok()) { result = del; } @@ -3370,7 +3719,9 @@ SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv, #ifndef ROCKSDB_LITE Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, - bool cache_only, SequenceNumber* seq, + bool cache_only, + SequenceNumber lower_bound_seq, + SequenceNumber* seq, bool* found_record_for_key, bool* is_blob_index) { Status s; @@ -3403,6 +3754,13 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, return Status::OK(); } + SequenceNumber lower_bound_in_mem = sv->mem->GetEarliestSequenceNumber(); + if (lower_bound_in_mem != kMaxSequenceNumber && + lower_bound_in_mem < lower_bound_seq) { + *found_record_for_key = false; + return Status::OK(); + } + // Check if there is a record for this key in the immutable memtables sv->imm->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq, seq, read_options, nullptr /*read_callback*/, is_blob_index); @@ -3422,6 +3780,13 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, return Status::OK(); } + SequenceNumber lower_bound_in_imm = sv->imm->GetEarliestSequenceNumber(); + if (lower_bound_in_imm != kMaxSequenceNumber && + lower_bound_in_imm < lower_bound_seq) { + *found_record_for_key = false; + return Status::OK(); + } + // Check if there is a record for this key in the immutable memtables sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq, seq, read_options, @@ -3443,6 +3808,10 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, return Status::OK(); } + // We could do a sv->imm->GetEarliestSequenceNumber(/*include_history*/ true) + // check here to skip the history if possible. But currently the caller + // already does that. Maybe we should move the logic here later. + // TODO(agiardullo): possible optimization: consider checking cached // SST files if cache_only=true? 
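The lower_bound_seq early-outs added above reduce to a simple rule: if every entry a memtable could contain is older than the caller's lower bound, that memtable cannot hold a newer record for the key, so the lookup can be skipped. A stand-alone sketch of the predicate (illustrative only, not RocksDB code):

    #include <cstdint>
    #include <limits>

    // earliest_seq is the smallest sequence number the memtable may contain;
    // kUnknown means the memtable cannot be ruled out and must be searched.
    constexpr uint64_t kUnknown = std::numeric_limits<uint64_t>::max();

    bool CanSkipMemtable(uint64_t earliest_seq, uint64_t lower_bound_seq) {
      return earliest_seq != kUnknown && earliest_seq < lower_bound_seq;
    }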
if (!cache_only) { @@ -3511,7 +3880,7 @@ Status DBImpl::IngestExternalFiles( // TODO (yanqin) maybe handle the case in which column_families have // duplicates - std::list::iterator pending_output_elem; + std::unique_ptr::iterator> pending_output_elem; size_t total = 0; for (const auto& arg : args) { total += arg.external_files.size(); @@ -3519,7 +3888,7 @@ Status DBImpl::IngestExternalFiles( uint64_t next_file_number = 0; Status status = ReserveFileNumbersBeforeIngestion( static_cast(args[0].column_family)->cfd(), total, - &pending_output_elem, &next_file_number); + pending_output_elem, &next_file_number); if (!status.ok()) { InstrumentedMutexLock l(&mutex_); ReleaseFileNumberFromPendingOutputs(pending_output_elem); @@ -3529,18 +3898,18 @@ Status DBImpl::IngestExternalFiles( std::vector ingestion_jobs; for (const auto& arg : args) { auto* cfd = static_cast(arg.column_family)->cfd(); - ingestion_jobs.emplace_back(env_, versions_.get(), cfd, - immutable_db_options_, env_options_, - &snapshots_, arg.options); + ingestion_jobs.emplace_back( + env_, versions_.get(), cfd, immutable_db_options_, env_options_, + &snapshots_, arg.options, &directories_, &event_logger_); } std::vector> exec_results; for (size_t i = 0; i != num_cfs; ++i) { exec_results.emplace_back(false, Status::OK()); } // TODO(yanqin) maybe make jobs run in parallel + uint64_t start_file_number = next_file_number; for (size_t i = 1; i != num_cfs; ++i) { - uint64_t start_file_number = - next_file_number + args[i - 1].external_files.size(); + start_file_number += args[i - 1].external_files.size(); auto* cfd = static_cast(args[i].column_family)->cfd(); SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); @@ -3661,19 +4030,21 @@ Status DBImpl::IngestExternalFiles( } } if (status.ok()) { - bool should_increment_last_seqno = - ingestion_jobs[0].ShouldIncrementLastSequence(); + int consumed_seqno_count = + ingestion_jobs[0].ConsumedSequenceNumbersCount(); #ifndef NDEBUG for (size_t i = 1; i != num_cfs; ++i) { - assert(should_increment_last_seqno == - ingestion_jobs[i].ShouldIncrementLastSequence()); + assert(!!consumed_seqno_count == + !!ingestion_jobs[i].ConsumedSequenceNumbersCount()); + consumed_seqno_count += + ingestion_jobs[i].ConsumedSequenceNumbersCount(); } #endif - if (should_increment_last_seqno) { + if (consumed_seqno_count > 0) { const SequenceNumber last_seqno = versions_->LastSequence(); - versions_->SetLastAllocatedSequence(last_seqno + 1); - versions_->SetLastPublishedSequence(last_seqno + 1); - versions_->SetLastSequence(last_seqno + 1); + versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count); + versions_->SetLastPublishedSequence(last_seqno + consumed_seqno_count); + versions_->SetLastSequence(last_seqno + consumed_seqno_count); } autovector cfds_to_commit; autovector mutable_cf_options_list; @@ -3764,7 +4135,127 @@ Status DBImpl::IngestExternalFiles( return status; } -Status DBImpl::VerifyChecksum() { +Status DBImpl::CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) { + assert(handle != nullptr); + assert(*handle == nullptr); + std::string cf_comparator_name = options.comparator->Name(); + if (cf_comparator_name != metadata.db_comparator_name) { + return Status::InvalidArgument("Comparator name mismatch"); + } + + // Create column family. 
+ auto status = CreateColumnFamily(options, column_family_name, handle); + if (!status.ok()) { + return status; + } + + // Import sst files from metadata. + auto cfh = reinterpret_cast(*handle); + auto cfd = cfh->cfd(); + ImportColumnFamilyJob import_job(env_, versions_.get(), cfd, + immutable_db_options_, env_options_, + import_options, metadata.files); + + SuperVersionContext dummy_sv_ctx(/* create_superversion */ true); + VersionEdit dummy_edit; + uint64_t next_file_number = 0; + std::unique_ptr::iterator> pending_output_elem; + { + // Lock db mutex + InstrumentedMutexLock l(&mutex_); + if (error_handler_.IsDBStopped()) { + // Don't import files when there is a bg_error + status = error_handler_.GetBGError(); + } + + // Make sure that bg cleanup wont delete the files that we are importing + pending_output_elem.reset(new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); + + if (status.ok()) { + // If crash happen after a hard link established, Recover function may + // reuse the file number that has already assigned to the internal file, + // and this will overwrite the external file. To protect the external + // file, we have to make sure the file number will never being reused. + next_file_number = versions_->FetchAddFileNumber(metadata.files.size()); + auto cf_options = cfd->GetLatestMutableCFOptions(); + status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_, + directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); + } + } + } + dummy_sv_ctx.Clean(); + + if (status.ok()) { + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); + status = import_job.Prepare(next_file_number, sv); + CleanupSuperVersion(sv); + } + + if (status.ok()) { + SuperVersionContext sv_context(true /*create_superversion*/); + { + // Lock db mutex + InstrumentedMutexLock l(&mutex_); + + // Stop writes to the DB by entering both write threads + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + + num_running_ingest_file_++; + assert(!cfd->IsDropped()); + status = import_job.Run(); + + // Install job edit [Mutex will be unlocked here] + if (status.ok()) { + auto cf_options = cfd->GetLatestMutableCFOptions(); + status = versions_->LogAndApply(cfd, *cf_options, import_job.edit(), + &mutex_, directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options); + } + } + + // Resume writes to the DB + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + write_thread_.ExitUnbatched(&w); + + num_running_ingest_file_--; + if (num_running_ingest_file_ == 0) { + bg_cv_.SignalAll(); + } + } + // mutex_ is unlocked here + + sv_context.Clean(); + } + + { + InstrumentedMutexLock l(&mutex_); + ReleaseFileNumberFromPendingOutputs(pending_output_elem); + } + + import_job.Cleanup(status); + if (!status.ok()) { + DropColumnFamily(*handle); + DestroyColumnFamilyHandle(*handle); + *handle = nullptr; + } + return status; +} + +Status DBImpl::VerifyChecksum(const ReadOptions& read_options) { Status s; std::vector cfd_list; { @@ -3795,7 +4286,8 @@ Status DBImpl::VerifyChecksum() { const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd; std::string fname = TableFileName(cfd->ioptions()->cf_paths, fd.GetNumber(), fd.GetPathId()); - s = rocksdb::VerifySstFileChecksum(opts, env_options_, fname); + s = 
rocksdb::VerifySstFileChecksum(opts, env_options_, read_options, + fname); } } if (!s.ok()) { @@ -3862,6 +4354,18 @@ Status DBImpl::EndTrace() { return s; } +Status DBImpl::StartBlockCacheTrace( + const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) { + return block_cache_tracer_.StartTrace(env_, trace_options, + std::move(trace_writer)); +} + +Status DBImpl::EndBlockCacheTrace() { + block_cache_tracer_.EndTrace(); + return Status::OK(); +} + Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key) { Status s; if (tracer_) { @@ -3887,18 +4391,18 @@ Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, Status DBImpl::ReserveFileNumbersBeforeIngestion( ColumnFamilyData* cfd, uint64_t num, - std::list::iterator* pending_output_elem, + std::unique_ptr::iterator>& pending_output_elem, uint64_t* next_file_number) { Status s; SuperVersionContext dummy_sv_ctx(true /* create_superversion */); - assert(nullptr != pending_output_elem); assert(nullptr != next_file_number); InstrumentedMutexLock l(&mutex_); if (error_handler_.IsDBStopped()) { // Do not ingest files when there is a bg_error return error_handler_.GetBGError(); } - *pending_output_elem = CaptureCurrentFileNumberInPendingOutputs(); + pending_output_elem.reset(new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); *next_file_number = versions_->FetchAddFileNumber(static_cast(num)); auto cf_options = cfd->GetLatestMutableCFOptions(); VersionEdit dummy_edit; @@ -3914,6 +4418,26 @@ Status DBImpl::ReserveFileNumbersBeforeIngestion( dummy_sv_ctx.Clean(); return s; } + +Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) { + if (mutable_db_options_.max_open_files == -1) { + uint64_t oldest_time = port::kMaxUint64; + for (auto cfd : *versions_->GetColumnFamilySet()) { + uint64_t ctime; + cfd->current()->GetCreationTimeOfOldestFile(&ctime); + if (ctime < oldest_time) { + oldest_time = ctime; + } + if (oldest_time == 0) { + break; + } + } + *creation_time = oldest_time; + return Status::OK(); + } else { + return Status::NotSupported("This API only works if max_open_files = -1"); + } +} #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/db/db_impl.h b/db/db_impl/db_impl.h similarity index 81% rename from db/db_impl.h rename to db/db_impl/db_impl.h index 623f69ba6ef..fe97e08bec1 100644 --- a/db/db_impl.h +++ b/db/db_impl/db_impl.h @@ -20,26 +20,29 @@ #include #include "db/column_family.h" -#include "db/compaction_job.h" +#include "db/compaction/compaction_job.h" #include "db/dbformat.h" #include "db/error_handler.h" #include "db/event_helpers.h" #include "db/external_sst_file_ingestion_job.h" #include "db/flush_job.h" #include "db/flush_scheduler.h" +#include "db/import_column_family_job.h" #include "db/internal_stats.h" #include "db/log_writer.h" #include "db/logs_with_prep_tracker.h" +#include "db/memtable_list.h" #include "db/pre_release_callback.h" #include "db/range_del_aggregator.h" #include "db/read_callback.h" #include "db/snapshot_checker.h" #include "db/snapshot_impl.h" +#include "db/trim_history_scheduler.h" #include "db/version_edit.h" #include "db/wal_manager.h" #include "db/write_controller.h" #include "db/write_thread.h" -#include "memtable_list.h" +#include "logging/event_logger.h" #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" #include "port/port.h" @@ -51,13 +54,13 @@ #include "rocksdb/transaction_log.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" +#include 
"trace_replay/block_cache_tracer.h" +#include "trace_replay/trace_replay.h" #include "util/autovector.h" -#include "util/event_logger.h" #include "util/hash.h" #include "util/repeatable_thread.h" #include "util/stop_watch.h" #include "util/thread_local.h" -#include "util/trace_replay.h" namespace rocksdb { @@ -65,6 +68,7 @@ class Arena; class ArenaWrappedDBIter; class InMemoryStatsHistoryIterator; class MemTable; +class PersistentStatsHistoryIterator; class TableCache; class TaskLimiterToken; class Version; @@ -75,16 +79,66 @@ struct JobContext; struct ExternalSstFileInfo; struct MemTableInfo; +// Class to maintain directories for all database paths other than main one. +class Directories { + public: + Status SetDirectories(Env* env, const std::string& dbname, + const std::string& wal_dir, + const std::vector& data_paths); + + Directory* GetDataDir(size_t path_id) const { + assert(path_id < data_dirs_.size()); + Directory* ret_dir = data_dirs_[path_id].get(); + if (ret_dir == nullptr) { + // Should use db_dir_ + return db_dir_.get(); + } + return ret_dir; + } + + Directory* GetWalDir() { + if (wal_dir_) { + return wal_dir_.get(); + } + return db_dir_.get(); + } + + Directory* GetDbDir() { return db_dir_.get(); } + + private: + std::unique_ptr db_dir_; + std::vector> data_dirs_; + std::unique_ptr wal_dir_; +}; + +// While DB is the public interface of RocksDB, and DBImpl is the actual +// class implementing it. It's the entrance of the core RocksdB engine. +// All other DB implementations, e.g. TransactionDB, BlobDB, etc, wrap a +// DBImpl internally. +// Other than functions implementing the DB interface, some public +// functions are there for other internal components to call. For +// example, TransactionDB directly calls DBImpl::WriteImpl() and +// BlobDB directly calls DBImpl::GetImpl(). Some other functions +// are for sub-components to call. For example, ColumnFamilyHandleImpl +// calls DBImpl::FindObsoleteFiles(). +// +// Since it's a very large class, the definition of the functions is +// divided in several db_impl_*.cc files, besides db_impl.cc. 
class DBImpl : public DB { public: DBImpl(const DBOptions& options, const std::string& dbname, const bool seq_per_batch = false, const bool batch_per_txn = true); + // No copying allowed + DBImpl(const DBImpl&) = delete; + void operator=(const DBImpl&) = delete; + virtual ~DBImpl(); + // ---- Implementations of the DB interface ---- + using DB::Resume; virtual Status Resume() override; - // Implementations of the DB interface using DB::Put; virtual Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family, const Slice& key, @@ -110,12 +164,20 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; - // Function that Get and KeyMayExist call with no_io true or false - // Note: 'value_found' from KeyMayExist propagates here - Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* value, - bool* value_found = nullptr, ReadCallback* callback = nullptr, - bool* is_blob_index = nullptr); + using DB::GetMergeOperands; + Status GetMergeOperands(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* merge_operands, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) override { + GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.merge_operands = merge_operands; + get_impl_options.get_merge_operands_options = get_merge_operands_options; + get_impl_options.number_of_operands = number_of_operands; + get_impl_options.get_value = false; + return GetImpl(options, key, get_impl_options); + } using DB::MultiGet; virtual std::vector MultiGet( @@ -137,11 +199,15 @@ class DBImpl : public DB { PinnableSlice* values, Status* statuses, const bool sorted_input = false) override; - void MultiGetImpl( + virtual void MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input = false) override; + + virtual void MultiGetWithCallback( const ReadOptions& options, ColumnFamilyHandle* column_family, - autovector& key_context, - bool sorted_input, ReadCallback* callback = nullptr, - bool* is_blob_index = nullptr); + ReadCallback* callback, + autovector* sorted_keys); virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options, const std::string& column_family, @@ -174,12 +240,6 @@ class DBImpl : public DB { const ReadOptions& options, const std::vector& column_families, std::vector* iterators) override; - ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, - ColumnFamilyData* cfd, - SequenceNumber snapshot, - ReadCallback* read_callback, - bool allow_blob = false, - bool allow_refresh = true); virtual const Snapshot* GetSnapshot() override; virtual void ReleaseSnapshot(const Snapshot* snapshot) override; @@ -197,9 +257,10 @@ class DBImpl : public DB { virtual bool GetAggregatedIntProperty(const Slice& property, uint64_t* aggregated_value) override; using DB::GetApproximateSizes; - virtual void GetApproximateSizes( - ColumnFamilyHandle* column_family, const Range* range, int n, - uint64_t* sizes, uint8_t include_flags = INCLUDE_FILES) override; + virtual Status GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* range, int n, + uint64_t* sizes) override; using DB::GetApproximateMemTableStats; virtual void GetApproximateMemTableStats(ColumnFamilyHandle* 
column_family, const Range& range, @@ -225,6 +286,9 @@ class DBImpl : public DB { virtual Status EnableAutoCompaction( const std::vector& column_family_handles) override; + virtual void EnableManualCompaction() override; + virtual void DisableManualCompaction() override; + using DB::SetOptions; Status SetOptions( ColumnFamilyHandle* column_family, @@ -259,23 +323,23 @@ class DBImpl : public DB { virtual Status UnlockWAL() override; virtual SequenceNumber GetLatestSequenceNumber() const override; - virtual SequenceNumber GetLastPublishedSequence() const { - if (last_seq_same_as_publish_seq_) { - return versions_->LastSequence(); - } else { - return versions_->LastPublishedSequence(); - } - } - // REQUIRES: joined the main write queue if two_write_queues is disabled, and - // the second write queue otherwise. - virtual void SetLastPublishedSequence(SequenceNumber seq); - // Returns LastSequence in last_seq_same_as_publish_seq_ - // mode and LastAllocatedSequence otherwise. This is useful when visiblility - // depends also on data written to the WAL but not to the memtable. - SequenceNumber TEST_GetLastVisibleSequence() const; virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) override; + virtual Status GetDbIdentity(std::string& identity) const override; + + virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const; + + ColumnFamilyHandle* DefaultColumnFamily() const override; + + ColumnFamilyHandle* PersistentStatsColumnFamily() const; + + virtual Status Close() override; + + Status GetStatsHistory( + uint64_t start_time, uint64_t end_time, + std::unique_ptr* stats_iterator) override; + #ifndef ROCKSDB_LITE using DB::ResetStats; virtual Status ResetStats() override; @@ -287,6 +351,10 @@ class DBImpl : public DB { uint64_t* manifest_file_size, bool flush_memtable = true) override; virtual Status GetSortedWalFiles(VectorLogPtr& files) override; + virtual Status GetCurrentWalFile( + std::unique_ptr* current_log_file) override; + virtual Status GetCreationTimeOfOldestFile( + uint64_t* creation_time) override; virtual Status GetUpdatesSince( SequenceNumber seq_number, std::unique_ptr* iter, @@ -313,6 +381,105 @@ class DBImpl : public DB { Status PromoteL0(ColumnFamilyHandle* column_family, int target_level) override; + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* column_family, + const std::vector& external_files, + const IngestExternalFileOptions& ingestion_options) override; + + using DB::IngestExternalFiles; + virtual Status IngestExternalFiles( + const std::vector& args) override; + + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, + ColumnFamilyHandle** handle) override; + + using DB::VerifyChecksum; + virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override; + + using DB::StartTrace; + virtual Status StartTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndTrace; + virtual Status EndTrace() override; + + using DB::StartBlockCacheTrace; + Status StartBlockCacheTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndBlockCacheTrace; + Status EndBlockCacheTrace() override; + + using DB::GetPropertiesOfAllTables; + virtual Status GetPropertiesOfAllTables( + ColumnFamilyHandle* column_family, + 
TablePropertiesCollection* props) override; + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, std::size_t n, + TablePropertiesCollection* props) override; + +#endif // ROCKSDB_LITE + + // ---- End of implementations of the DB interface ---- + + struct GetImplOptions { + ColumnFamilyHandle* column_family = nullptr; + PinnableSlice* value = nullptr; + bool* value_found = nullptr; + ReadCallback* callback = nullptr; + bool* is_blob_index = nullptr; + // If true return value associated with key via value pointer else return + // all merge operands for key via merge_operands pointer + bool get_value = true; + // Pointer to an array of size + // get_merge_operands_options.expected_max_number_of_operands allocated by + // user + PinnableSlice* merge_operands = nullptr; + GetMergeOperandsOptions* get_merge_operands_options = nullptr; + int* number_of_operands = nullptr; + }; + + // Function that Get and KeyMayExist call with no_io true or false + // Note: 'value_found' from KeyMayExist propagates here + // This function is also called by GetMergeOperands + // If get_impl_options.get_value = true get value associated with + // get_impl_options.key via get_impl_options.value + // If get_impl_options.get_value = false get merge operands associated with + // get_impl_options.key via get_impl_options.merge_operands + Status GetImpl(const ReadOptions& options, const Slice& key, + GetImplOptions get_impl_options); + + ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, + ColumnFamilyData* cfd, + SequenceNumber snapshot, + ReadCallback* read_callback, + bool allow_blob = false, + bool allow_refresh = true); + + virtual SequenceNumber GetLastPublishedSequence() const { + if (last_seq_same_as_publish_seq_) { + return versions_->LastSequence(); + } else { + return versions_->LastPublishedSequence(); + } + } + + // REQUIRES: joined the main write queue if two_write_queues is disabled, and + // the second write queue otherwise. + virtual void SetLastPublishedSequence(SequenceNumber seq); + // Returns LastSequence in last_seq_same_as_publish_seq_ + // mode and LastAllocatedSequence otherwise. This is useful when visiblility + // depends also on data written to the WAL but not to the memtable. + SequenceNumber TEST_GetLastVisibleSequence() const; + +#ifndef ROCKSDB_LITE // Similar to Write() but will call the callback once on the single write // thread to determine whether it is safe to perform the write. virtual Status WriteWithCallback(const WriteOptions& write_options, @@ -352,33 +519,20 @@ class DBImpl : public DB { // snapshot, we know that no key could have existing after this snapshot // (since we do not compact keys that have an earlier snapshot). // + // Only records newer than or at `lower_bound_seq` are guaranteed to be + // returned. Memtables and files may not be checked if it only contains data + // older than `lower_bound_seq`. + // // Returns OK or NotFound on success, // other status on unexpected error. 
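The GetImplOptions plumbing above is internal; from the outside the merge-operand read path is exercised through the new GetMergeOperands() entry point. A minimal caller-side sketch, assuming an open rocksdb::DB* db (the key and operand count are illustrative):

    std::vector<rocksdb::PinnableSlice> operands(8);
    rocksdb::GetMergeOperandsOptions merge_opts;
    merge_opts.expected_max_number_of_operands = 8;  // size of the array above
    int num_operands = 0;
    rocksdb::Status s = db->GetMergeOperands(
        rocksdb::ReadOptions(), db->DefaultColumnFamily(), "counter_key",
        operands.data(), &merge_opts, &num_operands);
    // On success, operands[0..num_operands) hold the key's unmerged operands.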
// TODO(andrewkr): this API need to be aware of range deletion operations Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, - bool cache_only, SequenceNumber* seq, + bool cache_only, + SequenceNumber lower_bound_seq, + SequenceNumber* seq, bool* found_record_for_key, bool* is_blob_index = nullptr); - using DB::IngestExternalFile; - virtual Status IngestExternalFile( - ColumnFamilyHandle* column_family, - const std::vector& external_files, - const IngestExternalFileOptions& ingestion_options) override; - - using DB::IngestExternalFiles; - virtual Status IngestExternalFiles( - const std::vector& args) override; - - virtual Status VerifyChecksum() override; - - using DB::StartTrace; - virtual Status StartTrace( - const TraceOptions& options, - std::unique_ptr&& trace_writer) override; - - using DB::EndTrace; - virtual Status EndTrace() override; Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key); Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key); #endif // ROCKSDB_LITE @@ -393,8 +547,6 @@ class DBImpl : public DB { // match to our in-memory records virtual Status CheckConsistency(); - virtual Status GetDbIdentity(std::string& identity) const override; - // max_file_num_to_ignore allows bottom level compaction to filter out newly // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will // disable the filtering @@ -416,102 +568,6 @@ class DBImpl : public DB { return &logs_with_prep_tracker_; } -#ifndef NDEBUG - // Extra methods (for testing) that are not in the public DB interface - // Implemented in db_impl_debug.cc - - // Compact any files in the named level that overlap [*begin, *end] - Status TEST_CompactRange(int level, const Slice* begin, const Slice* end, - ColumnFamilyHandle* column_family = nullptr, - bool disallow_trivial_move = false); - - void TEST_SwitchWAL(); - - bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; } - - bool TEST_IsLogGettingFlushed() { - return alive_log_files_.begin()->getting_flushed; - } - - Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr); - - // Force current memtable contents to be flushed. - Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false, - ColumnFamilyHandle* cfh = nullptr); - - // Wait for memtable compaction - Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); - - // Wait for any compaction - // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this - // is only for the special test of CancelledCompactions - Status TEST_WaitForCompact(bool waitUnscheduled = false); - - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. - int64_t TEST_MaxNextLevelOverlappingBytes( - ColumnFamilyHandle* column_family = nullptr); - - // Return the current manifest file no. - uint64_t TEST_Current_Manifest_FileNo(); - - // Returns the number that'll be assigned to the next file that's created. - uint64_t TEST_Current_Next_FileNo(); - - // get total level0 file size. Only for testing. 
- uint64_t TEST_GetLevel0TotalSize(); - - void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family, - std::vector>* metadata); - - void TEST_LockMutex(); - - void TEST_UnlockMutex(); - - // REQUIRES: mutex locked - void* TEST_BeginWrite(); - - // REQUIRES: mutex locked - // pass the pointer that you got from TEST_BeginWrite() - void TEST_EndWrite(void* w); - - uint64_t TEST_MaxTotalInMemoryState() const { - return max_total_in_memory_state_; - } - - size_t TEST_LogsToFreeSize(); - - uint64_t TEST_LogfileNumber(); - - uint64_t TEST_total_log_size() const { return total_log_size_; } - - // Returns column family name to ImmutableCFOptions map. - Status TEST_GetAllImmutableCFOptions( - std::unordered_map* iopts_map); - - // Return the lastest MutableCFOptions of a column family - Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family, - MutableCFOptions* mutable_cf_options); - - Cache* TEST_table_cache() { return table_cache_.get(); } - - WriteController& TEST_write_controler() { return write_controller_; } - - uint64_t TEST_FindMinLogContainingOutstandingPrep(); - uint64_t TEST_FindMinPrepLogReferencedByMemTable(); - size_t TEST_PreparedSectionCompletedSize(); - size_t TEST_LogsWithPrepSize(); - - int TEST_BGCompactionsAllowed() const; - int TEST_BGFlushesAllowed() const; - size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; - void TEST_WaitForDumpStatsRun(std::function callback) const; - void TEST_WaitForPersistStatsRun(std::function callback) const; - bool TEST_IsPersistentStatsEnabled() const; - size_t TEST_EstiamteStatsHistorySize() const; - -#endif // NDEBUG - struct BGJobLimits { int max_flushes; int max_compactions; @@ -555,12 +611,15 @@ class DBImpl : public DB { void PurgeObsoleteFiles(JobContext& background_contet, bool schedule_only = false); + // Schedule a background job to actually delete obsolete files. void SchedulePurge(); - ColumnFamilyHandle* DefaultColumnFamily() const override; - const SnapshotList& snapshots() const { return snapshots_; } + // load list of snapshots to `snap_vector` that is no newer than `max_seq` + // in ascending order. + // `oldest_write_conflict_snapshot` is filled with the oldest snapshot + // which satisfies SnapshotImpl.is_write_conflict_boundary_ = true. void LoadSnapshots(std::vector* snap_vector, SequenceNumber* oldest_write_conflict_snapshot, const SequenceNumber& max_seq) const { @@ -572,6 +631,10 @@ class DBImpl : public DB { return immutable_db_options_; } + // Cancel all background jobs, including flush, compaction, background + // purging, stats dumping threads, etc. If `wait` = true, wait for the + // running jobs to abort or finish before returning. Otherwise, only + // sends the signals. void CancelAllBackgroundWork(bool wait); // Find Super version and reference it. Based on options, it might return @@ -725,7 +788,7 @@ class DBImpl : public DB { void DeleteAllRecoveredTransactions() { for (auto it = recovered_transactions_.begin(); - it != recovered_transactions_.end(); it++) { + it != recovered_transactions_.end(); ++it) { delete it->second; } recovered_transactions_.clear(); @@ -748,6 +811,8 @@ class DBImpl : public DB { InstrumentedMutex* mutex() const { return &mutex_; } + // Initialize a brand new DB. The DB directory is expected to be empty before + // calling it. Status NewDB(); // This is to be used only by internal rocksdb classes. 
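CancelAllBackgroundWork() above is usually reached through the free function of the same name declared in rocksdb/convenience.h; a minimal shutdown sketch, assuming an open rocksdb::DB* db:

    #include "rocksdb/convenience.h"

    // Stop flushes/compactions and wait for running jobs before tearing down.
    rocksdb::CancelAllBackgroundWork(db, /*wait=*/true);
    rocksdb::Status s = db->Close();  // fails if snapshots are still unreleased
    delete db;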
@@ -756,25 +821,129 @@ class DBImpl : public DB { std::vector* handles, DB** dbptr, const bool seq_per_batch, const bool batch_per_txn); - virtual Status Close() override; static Status CreateAndNewDirectory(Env* env, const std::string& dirname, std::unique_ptr* directory); - // Given a time window, return an iterator for accessing stats history - Status GetStatsHistory( - uint64_t start_time, uint64_t end_time, - std::unique_ptr* stats_iterator) override; - // find stats map from stats_history_ with smallest timestamp in // the range of [start_time, end_time) bool FindStatsByTime(uint64_t start_time, uint64_t end_time, uint64_t* new_time, std::map* stats_map); + // Print information of all tombstones of all iterators to the std::string + // This is only used by ldb. The output might be capped. Tombstones + // printed out are not guaranteed to be in any order. + Status TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family, + int max_entries_to_print, + std::string* out_str); + +#ifndef NDEBUG + // Compact any files in the named level that overlap [*begin, *end] + Status TEST_CompactRange(int level, const Slice* begin, const Slice* end, + ColumnFamilyHandle* column_family = nullptr, + bool disallow_trivial_move = false); + + void TEST_SwitchWAL(); + + bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; } + + bool TEST_IsLogGettingFlushed() { + return alive_log_files_.begin()->getting_flushed; + } + + Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr); + + // Force current memtable contents to be flushed. + Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false, + ColumnFamilyHandle* cfh = nullptr); + + Status TEST_FlushMemTable(ColumnFamilyData* cfd, + const FlushOptions& flush_opts); + + // Flush (multiple) ColumnFamilyData without using ColumnFamilyHandle. This + // is because in certain cases, we can flush column families, wait for the + // flush to complete, but delete the column family handle before the wait + // finishes. For example in CompactRange. + Status TEST_AtomicFlushMemTables(const autovector& cfds, + const FlushOptions& flush_opts); + + // Wait for memtable compaction + Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); + + // Wait for any compaction + // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this + // is only for the special test of CancelledCompactions + Status TEST_WaitForCompact(bool waitUnscheduled = false); + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t TEST_MaxNextLevelOverlappingBytes( + ColumnFamilyHandle* column_family = nullptr); + + // Return the current manifest file no. + uint64_t TEST_Current_Manifest_FileNo(); + + // Returns the number that'll be assigned to the next file that's created. + uint64_t TEST_Current_Next_FileNo(); + + // get total level0 file size. Only for testing. 
+ uint64_t TEST_GetLevel0TotalSize(); + + void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family, + std::vector>* metadata); + + void TEST_LockMutex(); + + void TEST_UnlockMutex(); + + // REQUIRES: mutex locked + void* TEST_BeginWrite(); + + // REQUIRES: mutex locked + // pass the pointer that you got from TEST_BeginWrite() + void TEST_EndWrite(void* w); + + uint64_t TEST_MaxTotalInMemoryState() const { + return max_total_in_memory_state_; + } + + size_t TEST_LogsToFreeSize(); + + uint64_t TEST_LogfileNumber(); + + uint64_t TEST_total_log_size() const { return total_log_size_; } + + // Returns column family name to ImmutableCFOptions map. + Status TEST_GetAllImmutableCFOptions( + std::unordered_map* iopts_map); + + // Return the lastest MutableCFOptions of a column family + Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family, + MutableCFOptions* mutable_cf_options); + + Cache* TEST_table_cache() { return table_cache_.get(); } + + WriteController& TEST_write_controler() { return write_controller_; } + + uint64_t TEST_FindMinLogContainingOutstandingPrep(); + uint64_t TEST_FindMinPrepLogReferencedByMemTable(); + size_t TEST_PreparedSectionCompletedSize(); + size_t TEST_LogsWithPrepSize(); + + int TEST_BGCompactionsAllowed() const; + int TEST_BGFlushesAllowed() const; + size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; + void TEST_WaitForDumpStatsRun(std::function callback) const; + void TEST_WaitForPersistStatsRun(std::function callback) const; + bool TEST_IsPersistentStatsEnabled() const; + size_t TEST_EstimateInMemoryStatsHistorySize() const; +#endif // NDEBUG + protected: Env* const env_; const std::string dbname_; + std::string db_id_; std::unique_ptr versions_; // Flag to check whether we allocated and own the info log file bool own_info_log_; @@ -786,6 +955,7 @@ class DBImpl : public DB { recovered_transactions_; std::unique_ptr tracer_; InstrumentedMutex trace_mutex_; + BlockCacheTracer block_cache_tracer_; // State below is protected by mutex_ // With two_write_queues enabled, some of the variables that accessed during @@ -841,11 +1011,11 @@ class DBImpl : public DB { void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, const MutableCFOptions& mutable_cf_options, - int job_id, TableProperties prop); + int job_id); - void NotifyOnFlushCompleted(ColumnFamilyData* cfd, FileMetaData* file_meta, - const MutableCFOptions& mutable_cf_options, - int job_id, TableProperties prop); + void NotifyOnFlushCompleted( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + std::list>* flush_jobs_info); void NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c, const Status& st, @@ -897,14 +1067,32 @@ class DBImpl : public DB { bool disable_memtable = false, uint64_t* seq_used = nullptr); - // batch_cnt is expected to be non-zero in seq_per_batch mode and indicates - // the number of sub-patches. A sub-patch is a subset of the write batch that - // does not have duplicate keys. 
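The block_cache_tracer_ member added above backs the StartBlockCacheTrace()/EndBlockCacheTrace() entry points declared earlier in this header. A hedged usage sketch, assuming an open rocksdb::DB* db and a writable trace path; NewFileTraceWriter and the TraceOptions defaults come from the public tracing API rather than from this diff:

    #include "rocksdb/trace_reader_writer.h"

    std::unique_ptr<rocksdb::TraceWriter> trace_writer;
    rocksdb::Status s = rocksdb::NewFileTraceWriter(
        rocksdb::Env::Default(), rocksdb::EnvOptions(),
        "/tmp/block_cache_trace", &trace_writer);
    if (s.ok()) {
      s = db->StartBlockCacheTrace(rocksdb::TraceOptions(),
                                   std::move(trace_writer));
    }
    // ... run the workload to be traced ...
    s = db->EndBlockCacheTrace();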
- Status WriteImplWALOnly(const WriteOptions& options, WriteBatch* updates, - WriteCallback* callback = nullptr, - uint64_t* log_used = nullptr, uint64_t log_ref = 0, - uint64_t* seq_used = nullptr, size_t batch_cnt = 0, - PreReleaseCallback* pre_release_callback = nullptr); + // Write only to memtables without joining any write queue + Status UnorderedWriteMemtable(const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, + uint64_t log_ref, SequenceNumber seq, + const size_t sub_batch_cnt); + + // Whether the batch requires to be assigned with an order + enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder }; + // Whether it requires publishing last sequence or not + enum PublishLastSeq : bool { kDontPublishLastSeq, kDoPublishLastSeq }; + + // Join the write_thread to write the batch only to the WAL. It is the + // responsibility of the caller to also write the write batch to the memtable + // if it required. + // + // sub_batch_cnt is expected to be non-zero when assign_order = kDoAssignOrder + // indicating the number of sub-batches in my_batch. A sub-patch is a subset + // of the write batch that does not have duplicate keys. When seq_per_batch is + // not set, each key is a separate sub_batch. Otherwise each duplicate key + // marks start of a new sub-batch. + Status WriteImplWALOnly( + WriteThread* write_thread, const WriteOptions& options, + WriteBatch* updates, WriteCallback* callback, uint64_t* log_used, + const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt, + PreReleaseCallback* pre_release_callback, const AssignOrder assign_order, + const PublishLastSeq publish_last_seq, const bool disable_memtable); // write cached_recoverable_state_ to memtable if it is not empty // The writer must be the leader in write_thread_ and holding mutex_ @@ -942,6 +1130,8 @@ class DBImpl : public DB { friend class DBTest_ConcurrentFlushWAL_Test; friend class DBTest_MixedSlowdownOptionsStop_Test; friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test; + friend class DBCompactionTest_CompactionDuringShutdown_Test; + friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test; #ifndef NDEBUG friend class DBTest2_ReadCallbackTest_Test; friend class WriteCallbackTest_WriteWithCallbackTest_Test; @@ -949,7 +1139,10 @@ class DBImpl : public DB { friend class DBBlobIndexTest; friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; #endif + struct CompactionState; + struct PrepickedCompaction; + struct PurgeFileInfo; struct WriteContext { SuperVersionContext superversion_context; @@ -958,16 +1151,137 @@ class DBImpl : public DB { explicit WriteContext(bool create_superversion = false) : superversion_context(create_superversion) {} - ~WriteContext() { - superversion_context.Clean(); - for (auto& m : memtables_to_free_) { - delete m; - } - } + ~WriteContext() { + superversion_context.Clean(); + for (auto& m : memtables_to_free_) { + delete m; + } + } + }; + + struct LogFileNumberSize { + explicit LogFileNumberSize(uint64_t _number) : number(_number) {} + void AddSize(uint64_t new_size) { size += new_size; } + uint64_t number; + uint64_t size = 0; + bool getting_flushed = false; + }; + + struct LogWriterNumber { + // pass ownership of _writer + LogWriterNumber(uint64_t _number, log::Writer* _writer) + : number(_number), writer(_writer) {} + + log::Writer* ReleaseWriter() { + auto* w = writer; + writer = nullptr; + return w; + } + Status ClearWriter() { + Status s = writer->WriteBuffer(); + delete writer; + writer = nullptr; + 
return s; + } + + uint64_t number; + // Visual Studio doesn't support deque's member to be noncopyable because + // of a std::unique_ptr as a member. + log::Writer* writer; // own + // true for some prefix of logs_ + bool getting_synced = false; + }; + + // PurgeFileInfo is a structure to hold information of files to be deleted in + // purge_queue_ + struct PurgeFileInfo { + std::string fname; + std::string dir_to_sync; + FileType type; + uint64_t number; + int job_id; + PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num, + int jid) + : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {} + }; + + // Argument required by background flush thread. + struct BGFlushArg { + BGFlushArg() + : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {} + BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id, + SuperVersionContext* superversion_context) + : cfd_(cfd), + max_memtable_id_(max_memtable_id), + superversion_context_(superversion_context) {} + + // Column family to flush. + ColumnFamilyData* cfd_; + // Maximum ID of memtable to flush. In this column family, memtables with + // IDs smaller than this value must be flushed before this flush completes. + uint64_t max_memtable_id_; + // Pointer to a SuperVersionContext object. After flush completes, RocksDB + // installs a new superversion for the column family. This operation + // requires a SuperVersionContext object (currently embedded in JobContext). + SuperVersionContext* superversion_context_; + }; + + // Argument passed to flush thread. + struct FlushThreadArg { + DBImpl* db_; + + Env::Priority thread_pri_; + }; + + // Information for a manual compaction + struct ManualCompactionState { + ColumnFamilyData* cfd; + int input_level; + int output_level; + uint32_t output_path_id; + Status status; + bool done; + bool in_progress; // compaction request being processed? + bool incomplete; // only part of requested range compacted + bool exclusive; // current behavior of only one manual + bool disallow_trivial_move; // Force actual compaction to run + const InternalKey* begin; // nullptr means beginning of key range + const InternalKey* end; // nullptr means end of key range + InternalKey* manual_end; // how far we are compacting + InternalKey tmp_storage; // Used to keep track of compaction progress + InternalKey tmp_storage1; // Used to keep track of compaction progress + }; + struct PrepickedCompaction { + // background compaction takes ownership of `compaction`. + Compaction* compaction; + // caller retains ownership of `manual_compaction_state` as it is reused + // across background compactions. + ManualCompactionState* manual_compaction_state; // nullptr if non-manual + // task limiter token is requested during compaction picking. + std::unique_ptr task_token; + }; + + struct CompactionArg { + // caller retains ownership of `db`. + DBImpl* db; + // background compaction takes ownership of `prepicked_compaction`. + PrepickedCompaction* prepicked_compaction; }; - struct PrepickedCompaction; - struct PurgeFileInfo; + // Initialize the built-in column family for persistent stats. Depending on + // whether on-disk persistent stats have been enabled before, it may either + // create a new column family and column family handle or just a column family + // handle. + // Required: DB mutex held + Status InitPersistStatsColumnFamily(); + + // Persistent Stats column family has two format version key which are used + // for compatibility check. 
Write format version if it's created for the + // first time, read format version and check compatibility if recovering + // from disk. This function requires DB mutex held at entrance but may + // release and re-acquire DB mutex in the process. + // Required: DB mutex held + Status PersistentStatsProcessFormatVersion(); Status ResumeImpl(); @@ -1005,7 +1319,8 @@ class DBImpl : public DB { // created between the calls CaptureCurrentFileNumberInPendingOutputs() and // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live // and blocked by any other pending_outputs_ calls) - void ReleaseFileNumberFromPendingOutputs(std::list::iterator v); + void ReleaseFileNumberFromPendingOutputs( + std::unique_ptr::iterator>& v); Status SyncClosedLogs(JobContext* job_context); @@ -1021,34 +1336,6 @@ class DBImpl : public DB { SnapshotChecker* snapshot_checker, LogBuffer* log_buffer, Env::Priority thread_pri); - // Argument required by background flush thread. - struct BGFlushArg { - BGFlushArg() - : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {} - BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id, - SuperVersionContext* superversion_context) - : cfd_(cfd), - max_memtable_id_(max_memtable_id), - superversion_context_(superversion_context) {} - - // Column family to flush. - ColumnFamilyData* cfd_; - // Maximum ID of memtable to flush. In this column family, memtables with - // IDs smaller than this value must be flushed before this flush completes. - uint64_t max_memtable_id_; - // Pointer to a SuperVersionContext object. After flush completes, RocksDB - // installs a new superversion for the column family. This operation - // requires a SuperVersionContext object (currently embedded in JobContext). - SuperVersionContext* superversion_context_; - }; - - // Argument passed to flush thread. - struct FlushThreadArg { - DBImpl* db_; - - Env::Priority thread_pri_; - }; - // Flush the memtables of (multiple) column families to multiple files on // persistent storage. Status FlushMemTablesToOutputFiles( @@ -1060,8 +1347,8 @@ class DBImpl : public DB { JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri); // REQUIRES: log_numbers are sorted in ascending order - virtual Status RecoverLogFiles(const std::vector& log_numbers, - SequenceNumber* next_sequence, bool read_only); + Status RecoverLogFiles(const std::vector& log_numbers, + SequenceNumber* next_sequence, bool read_only); // The following two methods are used to flush a memtable to // storage. The first one is used at database RecoveryTime (when the @@ -1086,6 +1373,10 @@ class DBImpl : public DB { Status ScheduleFlushes(WriteContext* context); + void MaybeFlushStatsCF(autovector* cfds); + + Status TrimMemtableHistory(WriteContext* context); + Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context); void SelectColumnFamiliesForAtomicFlush(autovector* cfds); @@ -1121,6 +1412,32 @@ class DBImpl : public DB { const autovector& flush_memtable_ids, bool resuming_from_bg_err); + inline void WaitForPendingWrites() { + mutex_.AssertHeld(); + // In case of pipelined write is enabled, wait for all pending memtable + // writers. + if (immutable_db_options_.enable_pipelined_write) { + // Memtable writers may call DB::Get in case max_successive_merges > 0, + // which may lock mutex. Unlocking mutex here to avoid deadlock. 
+ mutex_.Unlock(); + write_thread_.WaitForMemTableWriters(); + mutex_.Lock(); + } + + if (!immutable_db_options_.unordered_write) { + // Then the writes are finished before the next write group starts + return; + } + + // Wait for the ones who already wrote to the WAL to finish their + // memtable write. + if (pending_memtable_writes_.load() != 0) { + std::unique_lock guard(switch_mutex_); + switch_cv_.wait(guard, + [&] { return pending_memtable_writes_.load() == 0; }); + } + } + // REQUIRES: mutex locked and in write thread. void AssignAtomicFlushSeq(const autovector& cfds); @@ -1231,7 +1548,7 @@ class DBImpl : public DB { void PrintStatistics(); - size_t EstiamteStatsHistorySize() const; + size_t EstimateInMemoryStatsHistorySize() const; // persist stats to column family "_persistent_stats" void PersistStats(); @@ -1273,6 +1590,135 @@ class DBImpl : public DB { void WaitForBackgroundWork(); + // Background threads call this function, which is just a wrapper around + // the InstallSuperVersion() function. Background threads carry + // sv_context which can have new_superversion already + // allocated. + // All ColumnFamily state changes go through this function. Here we analyze + // the new state and we schedule background work if we detect that the new + // state needs flush or compaction. + void InstallSuperVersionAndScheduleWork( + ColumnFamilyData* cfd, SuperVersionContext* sv_context, + const MutableCFOptions& mutable_cf_options); + + bool GetIntPropertyInternal(ColumnFamilyData* cfd, + const DBPropertyInfo& property_info, + bool is_locked, uint64_t* value); + bool GetPropertyHandleOptionsStatistics(std::string* value); + + bool HasPendingManualCompaction(); + bool HasExclusiveManualCompaction(); + void AddManualCompaction(ManualCompactionState* m); + void RemoveManualCompaction(ManualCompactionState* m); + bool ShouldntRunManualCompaction(ManualCompactionState* m); + bool HaveManualCompaction(ColumnFamilyData* cfd); + bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1); +#ifndef ROCKSDB_LITE + void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c, + const Status& st, + const CompactionJobStats& compaction_job_stats, + const int job_id, const Version* current, + CompactionJobInfo* compaction_job_info) const; + // Reserve the next 'num' file numbers for to-be-ingested external SST files, + // and return the current file_number in 'next_file_number'. + // Write a version edit to the MANIFEST. + Status ReserveFileNumbersBeforeIngestion( + ColumnFamilyData* cfd, uint64_t num, + std::unique_ptr::iterator>& pending_output_elem, + uint64_t* next_file_number); +#endif //! 
ROCKSDB_LITE + + bool ShouldPurge(uint64_t file_number) const; + void MarkAsGrabbedForPurge(uint64_t file_number); + + size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; + Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; } + + Status CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, + size_t preallocate_block_size, log::Writer** new_log); + + // Validate self-consistency of DB options + static Status ValidateOptions(const DBOptions& db_options); + // Validate self-consistency of DB options and its consistency with cf options + static Status ValidateOptions( + const DBOptions& db_options, + const std::vector& column_families); + + // Utility function to do some debug validation and sort the given vector + // of MultiGet keys + void PrepareMultiGetKeys( + const size_t num_keys, bool sorted, + autovector* key_ptrs); + + // A structure to hold the information required to process MultiGet of keys + // belonging to one column family. For a multi column family MultiGet, there + // will be a container of these objects. + struct MultiGetColumnFamilyData { + ColumnFamilyHandle* cf; + ColumnFamilyData* cfd; + + // For the batched MultiGet which relies on sorted keys, start specifies + // the index of first key belonging to this column family in the sorted + // list. + size_t start; + + // For the batched MultiGet case, num_keys specifies the number of keys + // belonging to this column family in the sorted list + size_t num_keys; + + // SuperVersion for the column family obtained in a manner that ensures a + // consistent view across all column families in the DB + SuperVersion* super_version; + MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, + SuperVersion* sv) + : cf(column_family), + cfd(static_cast(cf)->cfd()), + start(0), + num_keys(0), + super_version(sv) {} + + MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, size_t first, + size_t count, SuperVersion* sv) + : cf(column_family), + cfd(static_cast(cf)->cfd()), + start(first), + num_keys(count), + super_version(sv) {} + + MultiGetColumnFamilyData() = default; + }; + + // A common function to obtain a consistent snapshot, which can be implicit + // if the user doesn't specify a snapshot in read_options, across + // multiple column families for MultiGet. It will attempt to get an implicit + // snapshot without acquiring the db_mutes, but will give up after a few + // tries and acquire the mutex if a memtable flush happens. The template + // allows both the batched and non-batched MultiGet to call this with + // either an std::unordered_map or autovector of column families. + // + // If callback is non-null, the callback is refreshed with the snapshot + // sequence number + // + // A return value of true indicates that the SuperVersions were obtained + // from the ColumnFamilyData, whereas false indicates they are thread + // local + template + bool MultiCFSnapshot( + const ReadOptions& read_options, ReadCallback* callback, + std::function& + iter_deref_func, + T* cf_list, SequenceNumber* snapshot); + + // The actual implementation of the batching MultiGet. The caller is expected + // to have acquired the SuperVersion and pass in a snapshot sequence number + // in order to construct the LookupKeys. The start_key and num_keys specify + // the range of keys in the sorted_keys vector for a single column family. 
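Callers reach the batching machinery above through the array-based MultiGet() overloads declared earlier in this header. A minimal sketch of the per-key column family overload, assuming an open rocksdb::DB* db (keys and sizes are illustrative):

    constexpr size_t kNumKeys = 3;
    rocksdb::ColumnFamilyHandle* cfs[kNumKeys] = {db->DefaultColumnFamily(),
                                                  db->DefaultColumnFamily(),
                                                  db->DefaultColumnFamily()};
    rocksdb::Slice keys[kNumKeys] = {"k1", "k2", "k3"};
    rocksdb::PinnableSlice values[kNumKeys];
    rocksdb::Status statuses[kNumKeys];
    // statuses[i] reports the outcome for keys[i]; values are pinned slices.
    db->MultiGet(rocksdb::ReadOptions(), kNumKeys, cfs, keys, values, statuses,
                 /*sorted_input=*/false);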
+ void MultiGetImpl( + const ReadOptions& read_options, size_t start_key, size_t num_keys, + autovector* sorted_keys, + SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback, + bool* is_blob_index); + // table_cache_ provides its own synchronization std::shared_ptr table_cache_; @@ -1285,9 +1731,12 @@ class DBImpl : public DB { // logfile_number_. With two_write_queues it also protects alive_log_files_, // and log_empty_. Refer to the definition of each variable below for more // details. + // Note: to avoid dealock, if needed to acquire both log_write_mutex_ and + // mutex_, the order should be first mutex_ and then log_write_mutex_. InstrumentedMutex log_write_mutex_; std::atomic shutting_down_; + std::atomic manual_compaction_paused_; // This condition variable is signaled on these conditions: // * whenever bg_compaction_scheduled_ goes down to 0 // * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't @@ -1318,37 +1767,10 @@ class DBImpl : public DB { // expesnive mutex_ lock during WAL write, which update log_empty_. bool log_empty_; - struct LogFileNumberSize { - explicit LogFileNumberSize(uint64_t _number) : number(_number) {} - void AddSize(uint64_t new_size) { size += new_size; } - uint64_t number; - uint64_t size = 0; - bool getting_flushed = false; - }; - struct LogWriterNumber { - // pass ownership of _writer - LogWriterNumber(uint64_t _number, log::Writer* _writer) - : number(_number), writer(_writer) {} + ColumnFamilyHandleImpl* persist_stats_cf_handle_; - log::Writer* ReleaseWriter() { - auto* w = writer; - writer = nullptr; - return w; - } - Status ClearWriter() { - Status s = writer->WriteBuffer(); - delete writer; - writer = nullptr; - return s; - } + bool persistent_stats_cfd_exists_ = true; - uint64_t number; - // Visual Studio doesn't support deque's member to be noncopyable because - // of a std::unique_ptr as a member. - log::Writer* writer; // own - // true for some prefix of logs_ - bool getting_synced = false; - }; // Without two_write_queues, read and writes to alive_log_files_ are // protected by mutex_. However since back() is never popped, and push_back() // is done only from write_thread_, the same thread can access the item @@ -1395,30 +1817,6 @@ class DBImpl : public DB { bool stats_slice_initialized_ = false; - // Class to maintain directories for all database paths other than main one. - class Directories { - public: - Status SetDirectories(Env* env, const std::string& dbname, - const std::string& wal_dir, - const std::vector& data_paths); - - Directory* GetDataDir(size_t path_id) const; - - Directory* GetWalDir() { - if (wal_dir_) { - return wal_dir_.get(); - } - return db_dir_.get(); - } - - Directory* GetDbDir() { return db_dir_.get(); } - - private: - std::unique_ptr db_dir_; - std::vector> data_dirs_; - std::unique_ptr wal_dir_; - }; - Directories directories_; WriteBufferManager* write_buffer_manager_; @@ -1441,6 +1839,8 @@ class DBImpl : public DB { FlushScheduler flush_scheduler_; + TrimHistoryScheduler trim_history_scheduler_; + SnapshotList snapshots_; // For each background job, pending_outputs_ keeps the current file number at @@ -1454,19 +1854,6 @@ class DBImpl : public DB { // State is protected with db mutex. 
std::list pending_outputs_; - // PurgeFileInfo is a structure to hold information of files to be deleted in - // purge_queue_ - struct PurgeFileInfo { - std::string fname; - std::string dir_to_sync; - FileType type; - uint64_t number; - int job_id; - PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num, - int jid) - : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {} - }; - // flush_queue_ and compaction_queue_ hold column families that we need to // flush and compact, respectively. // A column family is inserted into flush_queue_ when it satisfies condition @@ -1492,12 +1879,12 @@ class DBImpl : public DB { // ColumnFamilyData::pending_compaction_ == true) std::deque compaction_queue_; - // A queue to store filenames of the files to be purged - std::deque purge_queue_; + // A map to store file numbers and filenames of the files to be purged + std::unordered_map purge_files_; // A vector to store the file numbers that have been assigned to certain // JobContext. Current implementation tracks ssts only. - std::vector files_grabbed_for_purge_; + std::unordered_set files_grabbed_for_purge_; // A queue to store log writers to close std::deque logs_to_free_queue_; @@ -1523,42 +1910,8 @@ class DBImpl : public DB { // number of background obsolete file purge jobs, submitted to the HIGH pool int bg_purge_scheduled_; - // Information for a manual compaction - struct ManualCompactionState { - ColumnFamilyData* cfd; - int input_level; - int output_level; - uint32_t output_path_id; - Status status; - bool done; - bool in_progress; // compaction request being processed? - bool incomplete; // only part of requested range compacted - bool exclusive; // current behavior of only one manual - bool disallow_trivial_move; // Force actual compaction to run - const InternalKey* begin; // nullptr means beginning of key range - const InternalKey* end; // nullptr means end of key range - InternalKey* manual_end; // how far we are compacting - InternalKey tmp_storage; // Used to keep track of compaction progress - InternalKey tmp_storage1; // Used to keep track of compaction progress - }; - struct PrepickedCompaction { - // background compaction takes ownership of `compaction`. - Compaction* compaction; - // caller retains ownership of `manual_compaction_state` as it is reused - // across background compactions. - ManualCompactionState* manual_compaction_state; // nullptr if non-manual - // task limiter token is requested during compaction picking. - std::unique_ptr task_token; - }; std::deque manual_compaction_dequeue_; - struct CompactionArg { - // caller retains ownership of `db`. - DBImpl* db; - // background compaction takes ownership of `prepicked_compaction`. - PrepickedCompaction* prepicked_compaction; - }; - // shall we disable deletion of obsolete files // if 0 the deletion is enabled. // if non-zero, files will not be getting deleted @@ -1571,13 +1924,21 @@ class DBImpl : public DB { // corresponding call to PurgeObsoleteFiles has not yet finished. int pending_purge_obsolete_files_; - // last time when DeleteObsoleteFiles with full scan was executed. Originaly + // last time when DeleteObsoleteFiles with full scan was executed. Originally // initialized with startup time. uint64_t delete_obsolete_files_last_run_; // last time stats were dumped to LOG std::atomic last_stats_dump_time_microsec_; + // The thread that wants to switch memtable, can wait on this cv until the + // pending writes to memtable finishes. 
+ std::condition_variable switch_cv_; + // The mutex used by switch_cv_. mutex_ should be acquired beforehand. + std::mutex switch_mutex_; + // Number of threads intending to write to memtable + std::atomic pending_memtable_writes_ = {}; + // Each flush or compaction gets its own job id. this counter makes sure // they're unique std::atomic next_job_id_; @@ -1601,7 +1962,8 @@ class DBImpl : public DB { std::string db_absolute_path_; - // Number of running IngestExternalFile() calls. + // Number of running IngestExternalFile() or CreateColumnFamilyWithImport() + // calls. // REQUIRES: mutex held int num_running_ingest_file_; @@ -1646,68 +2008,6 @@ class DBImpl : public DB { // REQUIRES: mutex locked std::unique_ptr thread_persist_stats_; - // No copying allowed - DBImpl(const DBImpl&); - void operator=(const DBImpl&); - - // Background threads call this function, which is just a wrapper around - // the InstallSuperVersion() function. Background threads carry - // sv_context which can have new_superversion already - // allocated. - // All ColumnFamily state changes go through this function. Here we analyze - // the new state and we schedule background work if we detect that the new - // state needs flush or compaction. - void InstallSuperVersionAndScheduleWork( - ColumnFamilyData* cfd, SuperVersionContext* sv_context, - const MutableCFOptions& mutable_cf_options); - -#ifndef ROCKSDB_LITE - using DB::GetPropertiesOfAllTables; - virtual Status GetPropertiesOfAllTables( - ColumnFamilyHandle* column_family, - TablePropertiesCollection* props) override; - virtual Status GetPropertiesOfTablesInRange( - ColumnFamilyHandle* column_family, const Range* range, std::size_t n, - TablePropertiesCollection* props) override; - -#endif // ROCKSDB_LITE - - bool GetIntPropertyInternal(ColumnFamilyData* cfd, - const DBPropertyInfo& property_info, - bool is_locked, uint64_t* value); - bool GetPropertyHandleOptionsStatistics(std::string* value); - - bool HasPendingManualCompaction(); - bool HasExclusiveManualCompaction(); - void AddManualCompaction(ManualCompactionState* m); - void RemoveManualCompaction(ManualCompactionState* m); - bool ShouldntRunManualCompaction(ManualCompactionState* m); - bool HaveManualCompaction(ColumnFamilyData* cfd); - bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1); -#ifndef ROCKSDB_LITE - void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c, - const Status& st, - const CompactionJobStats& compaction_job_stats, - const int job_id, const Version* current, - CompactionJobInfo* compaction_job_info) const; - // Reserve the next 'num' file numbers for to-be-ingested external SST files, - // and return the current file_number in 'next_file_number'. - // Write a version edit to the MANIFEST. - Status ReserveFileNumbersBeforeIngestion( - ColumnFamilyData* cfd, uint64_t num, - std::list::iterator* pending_output_elem, - uint64_t* next_file_number); -#endif //! ROCKSDB_LITE - - bool ShouldPurge(uint64_t file_number) const; - void MarkAsGrabbedForPurge(uint64_t file_number); - - size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; - Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; } - - Status CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, - size_t preallocate_block_size, log::Writer** new_log); - // When set, we use a separate queue for writes that dont write to memtable. // In 2PC these are the writes at Prepare phase. 
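// Illustrative sketch only, not part of this diff: the usual pairing for the
// pending_memtable_writes_ / switch_mutex_ / switch_cv_ members declared
// above. Only the waiting side appears verbatim earlier in this header; the
// writer side below is an assumed, simplified counterpart.
#include <atomic>
#include <condition_variable>
#include <mutex>

std::atomic<int> pending_memtable_writes{0};
std::mutex switch_mutex;
std::condition_variable switch_cv;

void WriterSide() {
  pending_memtable_writes.fetch_add(1);
  // ... write the batch into the memtable without holding the DB mutex ...
  if (pending_memtable_writes.fetch_sub(1) == 1) {
    // Last pending writer: wake any thread waiting to switch the memtable.
    std::lock_guard<std::mutex> guard(switch_mutex);
    switch_cv.notify_all();
  }
}

void SwitcherSide() {
  std::unique_lock<std::mutex> guard(switch_mutex);
  switch_cv.wait(guard, [] { return pending_memtable_writes.load() == 0; });
}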
const bool two_write_queues_; @@ -1753,6 +2053,8 @@ class DBImpl : public DB { // results sequentially. Flush results of memtables with lower IDs get // installed to MANIFEST first. InstrumentedCondVar atomic_flush_install_cv_; + + bool wal_in_db_path_; }; extern Options SanitizeOptions(const std::string& db, const Options& src); diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc similarity index 90% rename from db/db_impl_compaction_flush.cc rename to db/db_impl/db_impl_compaction_flush.cc index f16c6111752..b01fdbc965c 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -6,23 +6,21 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include "db/builder.h" #include "db/error_handler.h" #include "db/event_helpers.h" +#include "file/sst_file_manager_impl.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" #include "util/concurrent_task_limiter_impl.h" -#include "util/sst_file_manager_impl.h" -#include "util/sync_point.h" namespace rocksdb { @@ -110,6 +108,13 @@ Status DBImpl::SyncClosedLogs(JobContext* job_context) { if (!s.ok()) { break; } + + if (immutable_db_options_.recycle_log_file_num > 0) { + s = log->Close(); + if (!s.ok()) { + break; + } + } } if (s.ok()) { s = directories_.GetWalDir()->Fsync(); @@ -159,8 +164,7 @@ Status DBImpl::FlushMemTableToOutputFile( #ifndef ROCKSDB_LITE // may temporarily unlock and lock the mutex. - NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id, - flush_job.GetTableProperties()); + NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id); #endif // ROCKSDB_LITE Status s; @@ -201,15 +205,15 @@ Status DBImpl::FlushMemTableToOutputFile( cfd->current()->storage_info()->LevelSummary(&tmp)); } - if (!s.ok() && !s.IsShutdownInProgress()) { + if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { Status new_bg_error = s; error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); } if (s.ok()) { #ifndef ROCKSDB_LITE // may temporarily unlock and lock the mutex. - NotifyOnFlushCompleted(cfd, &file_meta, mutable_cf_options, - job_context->job_id, flush_job.GetTableProperties()); + NotifyOnFlushCompleted(cfd, mutable_cf_options, + flush_job.GetCommittedFlushJobsInfo()); auto sfm = static_cast( immutable_db_options_.sst_file_manager.get()); if (sfm) { @@ -228,6 +232,7 @@ Status DBImpl::FlushMemTableToOutputFile( } #endif // ROCKSDB_LITE } + TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:Finish"); return s; } @@ -254,7 +259,7 @@ Status DBImpl::FlushMemTablesToOutputFiles( snapshot_checker, log_buffer, thread_pri); if (!s.ok()) { status = s; - if (!s.IsShutdownInProgress()) { + if (!s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { // At this point, DB is not shutting down, nor is cfd dropped. // Something is wrong, thus we break out of the loop. 
break; @@ -298,7 +303,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( autovector distinct_output_dirs; autovector distinct_output_dir_paths; - std::vector jobs; + std::vector> jobs; std::vector all_mutable_cf_options; int num_cfs = static_cast(cfds.size()); all_mutable_cf_options.reserve(num_cfs); @@ -325,7 +330,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions()); const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back(); const uint64_t* max_memtable_id = &(bg_flush_args[i].max_memtable_id_); - jobs.emplace_back( + jobs.emplace_back(new FlushJob( dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id, env_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot, @@ -333,8 +338,8 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( data_dir, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, &event_logger_, mutable_cf_options.report_bg_io_stats, false /* sync_output_directory */, false /* write_manifest */, - thread_pri); - jobs.back().PickMemTable(); + thread_pri)); + jobs.back()->PickMemTable(); } std::vector file_meta(num_cfs); @@ -346,7 +351,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i); // may temporarily unlock and lock the mutex. NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options, - job_context->job_id, jobs[i].GetTableProperties()); + job_context->job_id); } #endif /* !ROCKSDB_LITE */ @@ -368,7 +373,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( // TODO (yanqin): parallelize jobs with threads. for (int i = 1; i != num_cfs; ++i) { exec_status[i].second = - jobs[i].Run(&logs_with_prep_tracker_, &file_meta[i]); + jobs[i]->Run(&logs_with_prep_tracker_, &file_meta[i]); exec_status[i].first = true; } if (num_cfs > 1) { @@ -377,15 +382,18 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( TEST_SYNC_POINT( "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2"); } + assert(exec_status.size() > 0); + assert(!file_meta.empty()); exec_status[0].second = - jobs[0].Run(&logs_with_prep_tracker_, &file_meta[0]); + jobs[0]->Run(&logs_with_prep_tracker_, &file_meta[0]); exec_status[0].first = true; Status error_status; for (const auto& e : exec_status) { if (!e.second.ok()) { s = e.second; - if (!e.second.IsShutdownInProgress()) { + if (!e.second.IsShutdownInProgress() && + !e.second.IsColumnFamilyDropped()) { // If a flush job did not return OK, and the CF is not dropped, and // the DB is not shutting down, then we have to return this result to // caller later. @@ -397,15 +405,11 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( s = error_status.ok() ? s : error_status; } - // If db is NOT shutting down, and one or more column families have been - // dropped. - // TODO: use separate status code for db shutdown and column family dropped. - if (s.IsShutdownInProgress() && - !shutting_down_.load(std::memory_order_acquire)) { + if (s.IsColumnFamilyDropped()) { s = Status::OK(); } - if (s.ok() || s.IsShutdownInProgress()) { + if (s.ok() || s.IsShutdownInProgress() || s.IsColumnFamilyDropped()) { // Sync on all distinct output directories. 
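// Illustrative sketch only, not part of this diff: the dedicated
// ColumnFamilyDropped status used here lets callers stop overloading
// ShutdownInProgress, as the removed TODO above suggested. A caller-side
// check now reads roughly like this (FlushOneColumnFamily is a hypothetical
// helper used only for illustration):
Status s = FlushOneColumnFamily();
if (s.IsColumnFamilyDropped()) {
  s = Status::OK();  // a dropped column family is not an error for the flush
} else if (s.IsShutdownInProgress()) {
  // Stop scheduling background work, but do not record a background error.
}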
for (auto dir : distinct_output_dirs) { if (dir != nullptr) { @@ -422,7 +426,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( auto wait_to_install_func = [&]() { bool ready = true; for (size_t i = 0; i != cfds.size(); ++i) { - const auto& mems = jobs[i].GetMemTables(); + const auto& mems = jobs[i]->GetMemTables(); if (cfds[i]->IsDropped()) { // If the column family is dropped, then do not wait. continue; @@ -463,7 +467,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( autovector mutable_cf_options_list; autovector tmp_file_meta; for (int i = 0; i != num_cfs; ++i) { - const auto& mems = jobs[i].GetMemTables(); + const auto& mems = jobs[i]->GetMemTables(); if (!cfds[i]->IsDropped() && !mems.empty()) { tmp_cfds.emplace_back(cfds[i]); mems_list.emplace_back(&mems); @@ -499,12 +503,13 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( #ifndef ROCKSDB_LITE auto sfm = static_cast( immutable_db_options_.sst_file_manager.get()); + assert(all_mutable_cf_options.size() == static_cast(num_cfs)); for (int i = 0; i != num_cfs; ++i) { if (cfds[i]->IsDropped()) { continue; } - NotifyOnFlushCompleted(cfds[i], &file_meta[i], all_mutable_cf_options[i], - job_context->job_id, jobs[i].GetTableProperties()); + NotifyOnFlushCompleted(cfds[i], all_mutable_cf_options[i], + jobs[i]->GetCommittedFlushJobsInfo()); if (sfm) { std::string file_path = MakeTableFileName( cfds[i]->ioptions()->cf_paths[0].path, file_meta[i].fd.GetNumber()); @@ -523,17 +528,17 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( // Need to undo atomic flush if something went wrong, i.e. s is not OK and // it is not because of CF drop. - if (!s.ok() && !s.IsShutdownInProgress()) { + if (!s.ok() && !s.IsColumnFamilyDropped()) { // Have to cancel the flush jobs that have NOT executed because we need to // unref the versions. for (int i = 0; i != num_cfs; ++i) { if (!exec_status[i].first) { - jobs[i].Cancel(); + jobs[i]->Cancel(); } } for (int i = 0; i != num_cfs; ++i) { if (exec_status[i].first && exec_status[i].second.ok()) { - auto& mems = jobs[i].GetMemTables(); + auto& mems = jobs[i]->GetMemTables(); cfds[i]->imm()->RollbackMemtableFlush(mems, file_meta[i].fd.GetNumber()); } @@ -547,7 +552,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, const MutableCFOptions& mutable_cf_options, - int job_id, TableProperties prop) { + int job_id) { #ifndef ROCKSDB_LITE if (immutable_db_options_.listeners.size() == 0U) { return; @@ -565,20 +570,21 @@ void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, // release lock while notifying events mutex_.Unlock(); { - FlushJobInfo info; + FlushJobInfo info{}; info.cf_id = cfd->GetID(); info.cf_name = cfd->GetName(); // TODO(yhchiang): make db_paths dynamic in case flush does not // go to L0 in the future. 
- info.file_path = MakeTableFileName(cfd->ioptions()->cf_paths[0].path, - file_meta->fd.GetNumber()); + const uint64_t file_number = file_meta->fd.GetNumber(); + info.file_path = + MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_number); + info.file_number = file_number; info.thread_id = env_->GetThreadID(); info.job_id = job_id; info.triggered_writes_slowdown = triggered_writes_slowdown; info.triggered_writes_stop = triggered_writes_stop; info.smallest_seqno = file_meta->fd.smallest_seqno; info.largest_seqno = file_meta->fd.largest_seqno; - info.table_properties = prop; info.flush_reason = cfd->GetFlushReason(); for (auto listener : immutable_db_options_.listeners) { listener->OnFlushBegin(this, info); @@ -592,15 +598,14 @@ void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, (void)file_meta; (void)mutable_cf_options; (void)job_id; - (void)prop; #endif // ROCKSDB_LITE } -void DBImpl::NotifyOnFlushCompleted(ColumnFamilyData* cfd, - FileMetaData* file_meta, - const MutableCFOptions& mutable_cf_options, - int job_id, TableProperties prop) { +void DBImpl::NotifyOnFlushCompleted( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + std::list>* flush_jobs_info) { #ifndef ROCKSDB_LITE + assert(flush_jobs_info != nullptr); if (immutable_db_options_.listeners.size() == 0U) { return; } @@ -617,34 +622,22 @@ void DBImpl::NotifyOnFlushCompleted(ColumnFamilyData* cfd, // release lock while notifying events mutex_.Unlock(); { - FlushJobInfo info; - info.cf_id = cfd->GetID(); - info.cf_name = cfd->GetName(); - // TODO(yhchiang): make db_paths dynamic in case flush does not - // go to L0 in the future. - info.file_path = MakeTableFileName(cfd->ioptions()->cf_paths[0].path, - file_meta->fd.GetNumber()); - info.thread_id = env_->GetThreadID(); - info.job_id = job_id; - info.triggered_writes_slowdown = triggered_writes_slowdown; - info.triggered_writes_stop = triggered_writes_stop; - info.smallest_seqno = file_meta->fd.smallest_seqno; - info.largest_seqno = file_meta->fd.largest_seqno; - info.table_properties = prop; - info.flush_reason = cfd->GetFlushReason(); - for (auto listener : immutable_db_options_.listeners) { - listener->OnFlushCompleted(this, info); + for (auto& info : *flush_jobs_info) { + info->triggered_writes_slowdown = triggered_writes_slowdown; + info->triggered_writes_stop = triggered_writes_stop; + for (auto listener : immutable_db_options_.listeners) { + listener->OnFlushCompleted(this, *info); + } } + flush_jobs_info->clear(); } mutex_.Lock(); // no need to signal bg_cv_ as it will be signaled at the end of the // flush process. 
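// Illustrative sketch only, not part of this diff: the unlock / notify /
// relock pattern used in the listener code above, so user-supplied
// EventListener callbacks never run while the DB mutex is held. Names below
// are simplified stand-ins.
#include <functional>
#include <mutex>
#include <vector>

void NotifyOutsideLock(std::mutex& db_mutex,
                       const std::vector<std::function<void()>>& callbacks) {
  db_mutex.unlock();  // callbacks may block or call back into the DB
  for (const auto& cb : callbacks) {
    cb();
  }
  db_mutex.lock();  // re-acquire before touching shared state again
}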
#else (void)cfd; - (void)file_meta; (void)mutable_cf_options; - (void)job_id; - (void)prop; + (void)flush_jobs_info; #endif // ROCKSDB_LITE } @@ -798,29 +791,6 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, return s; } -class SnapshotListFetchCallbackImpl : public SnapshotListFetchCallback { - public: - SnapshotListFetchCallbackImpl(DBImpl* db_impl, Env* env, - uint64_t snap_refresh_nanos, Logger* info_log) - : SnapshotListFetchCallback(env, snap_refresh_nanos), - db_impl_(db_impl), - info_log_(info_log) {} - virtual void Refresh(std::vector* snapshots, - SequenceNumber max) override { - size_t prev = snapshots->size(); - snapshots->clear(); - db_impl_->LoadSnapshots(snapshots, nullptr, max); - size_t now = snapshots->size(); - ROCKS_LOG_DEBUG(info_log_, - "Compaction snapshot count refreshed from %zu to %zu", prev, - now); - } - - private: - DBImpl* db_impl_; - Logger* info_log_; -}; - Status DBImpl::CompactFiles(const CompactionOptions& compact_options, ColumnFamilyHandle* column_family, const std::vector& input_file_names, @@ -913,6 +883,9 @@ Status DBImpl::CompactFilesImpl( if (shutting_down_.load(std::memory_order_acquire)) { return Status::ShutdownInProgress(); } + if (manual_compaction_paused_.load(std::memory_order_acquire)) { + return Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } std::unordered_set input_set; for (const auto& file_name : input_file_names) { @@ -987,14 +960,12 @@ Status DBImpl::CompactFilesImpl( GetSnapshotContext(job_context, &snapshot_seqs, &earliest_write_conflict_snapshot, &snapshot_checker); - auto pending_outputs_inserted_elem = - CaptureCurrentFileNumberInPendingOutputs(); + std::unique_ptr::iterator> pending_outputs_inserted_elem( + new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJobStats compaction_job_stats; - SnapshotListFetchCallbackImpl fetch_callback( - this, env_, c->mutable_cf_options()->snap_refresh_nanos, - immutable_db_options_.info_log.get()); CompactionJob compaction_job( job_context->job_id, c.get(), immutable_db_options_, env_options_for_compaction_, versions_.get(), &shutting_down_, @@ -1004,9 +975,7 @@ Status DBImpl::CompactFilesImpl( snapshot_checker, table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats, Env::Priority::USER, - immutable_db_options_.max_subcompactions <= 1 ? 
&fetch_callback - : nullptr); + &compaction_job_stats, Env::Priority::USER, &manual_compaction_paused_); // Creating a compaction influences the compaction score because the score // takes running compactions into account (by skipping files that are already @@ -1050,8 +1019,14 @@ Status DBImpl::CompactFilesImpl( if (status.ok()) { // Done - } else if (status.IsShutdownInProgress()) { + } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) { // Ignore compaction errors found during shutting down + } else if (status.IsManualCompactionPaused()) { + // Don't report stopping manual compaction as error + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] [JOB %d] Stopping manual compaction", + c->column_family_data()->GetName().c_str(), + job_context->job_id); } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] [JOB %d] Compaction error: %s", @@ -1122,13 +1097,17 @@ void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c, if (shutting_down_.load(std::memory_order_acquire)) { return; } + if (c->is_manual_compaction() && + manual_compaction_paused_.load(std::memory_order_acquire)) { + return; + } Version* current = cfd->current(); current->Ref(); // release lock while notifying events mutex_.Unlock(); TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex"); { - CompactionJobInfo info; + CompactionJobInfo info{}; info.cf_name = cfd->GetName(); info.status = st; info.thread_id = env_->GetThreadID(); @@ -1141,9 +1120,13 @@ void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c, info.compression = c->output_compression(); for (size_t i = 0; i < c->num_input_levels(); ++i) { for (const auto fmd : *c->inputs(i)) { + const FileDescriptor& desc = fmd->fd; + const uint64_t file_number = desc.GetNumber(); auto fn = TableFileName(c->immutable_cf_options()->cf_paths, - fmd->fd.GetNumber(), fmd->fd.GetPathId()); + file_number, desc.GetPathId()); info.input_files.push_back(fn); + info.input_file_infos.push_back(CompactionFileInfo{ + static_cast(i), file_number, fmd->oldest_blob_file_number}); if (info.table_properties.count(fn) == 0) { std::shared_ptr tp; auto s = current->GetTableProperties(&tp, fmd, &fn); @@ -1154,9 +1137,13 @@ void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c, } } for (const auto newf : c->edit()->GetNewFiles()) { + const FileMetaData& meta = newf.second; + const FileDescriptor& desc = meta.fd; + const uint64_t file_number = desc.GetNumber(); info.output_files.push_back(TableFileName( - c->immutable_cf_options()->cf_paths, newf.second.fd.GetNumber(), - newf.second.fd.GetPathId())); + c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId())); + info.output_file_infos.push_back(CompactionFileInfo{ + newf.first, file_number, meta.oldest_blob_file_number}); } for (auto listener : immutable_db_options_.listeners) { listener->OnCompactionBegin(this, info); @@ -1184,13 +1171,17 @@ void DBImpl::NotifyOnCompactionCompleted( if (shutting_down_.load(std::memory_order_acquire)) { return; } + if (c->is_manual_compaction() && + manual_compaction_paused_.load(std::memory_order_acquire)) { + return; + } Version* current = cfd->current(); current->Ref(); // release lock while notifying events mutex_.Unlock(); TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex"); { - CompactionJobInfo info; + CompactionJobInfo info{}; BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, current, &info); for (auto listener : immutable_db_options_.listeners) { @@ -1265,7 +1256,8 @@ 
Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, - f->marked_for_compaction); + f->marked_for_compaction, f->oldest_blob_file_number, + f->oldest_ancester_time, f->file_creation_time); } ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), @@ -1541,20 +1533,51 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, InstrumentedMutexLock guard_lock(&mutex_); WriteThread::Writer w; + WriteThread::Writer nonmem_w; if (!writes_stopped) { write_thread_.EnterUnbatched(&w, &mutex_); + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } } if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { s = SwitchMemtable(cfd, &context); } - if (s.ok()) { if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { flush_memtable_id = cfd->imm()->GetLatestMemTableID(); flush_req.emplace_back(cfd, flush_memtable_id); } + if (immutable_db_options_.persist_stats_to_disk) { + ColumnFamilyData* cfd_stats = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + if (cfd_stats != nullptr && cfd_stats != cfd && + !cfd_stats->mem()->IsEmpty()) { + // only force flush stats CF when it will be the only CF lagging + // behind after the current flush + bool stats_cf_flush_needed = true; + for (auto* loop_cfd : *versions_->GetColumnFamilySet()) { + if (loop_cfd == cfd_stats || loop_cfd == cfd) { + continue; + } + if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) { + stats_cf_flush_needed = false; + } + } + if (stats_cf_flush_needed) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Force flushing stats CF with manual flush of %s " + "to avoid holding old logs", + cfd->GetName().c_str()); + s = SwitchMemtable(cfd_stats, &context); + flush_memtable_id = cfd_stats->imm()->GetLatestMemTableID(); + flush_req.emplace_back(cfd_stats, flush_memtable_id); + } + } + } } if (s.ok() && !flush_req.empty()) { @@ -1562,15 +1585,29 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, ColumnFamilyData* loop_cfd = elem.first; loop_cfd->imm()->FlushRequested(); } + // If the caller wants to wait for this flush to complete, it indicates + // that the caller expects the ColumnFamilyData not to be free'ed by + // other threads which may drop the column family concurrently. + // Therefore, we increase the cfd's ref count. + if (flush_options.wait) { + for (auto& elem : flush_req) { + ColumnFamilyData* loop_cfd = elem.first; + loop_cfd->Ref(); + } + } SchedulePendingFlush(flush_req, flush_reason); MaybeScheduleFlushOrCompaction(); } if (!writes_stopped) { write_thread_.ExitUnbatched(&w); + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } } } - + TEST_SYNC_POINT("DBImpl::FlushMemTable:AfterScheduleFlush"); + TEST_SYNC_POINT("DBImpl::FlushMemTable:BeforeWaitForBgFlush"); if (s.ok() && flush_options.wait) { autovector cfds; autovector flush_memtable_ids; @@ -1580,6 +1617,13 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, } s = WaitForFlushMemTables(cfds, flush_memtable_ids, (flush_reason == FlushReason::kErrorRecovery)); + for (auto* tmp_cfd : cfds) { + if (tmp_cfd->Unref()) { + // Only one thread can reach here. 
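// Illustrative sketch only, not part of this diff: the Ref()/Unref() pattern
// added above keeps the ColumnFamilyData alive while this thread waits for
// the flush, even if another thread drops the column family meanwhile.
// Simplified stand-in type:
#include <atomic>

struct RefCountedCfd {
  std::atomic<int> refs{1};
  void Ref() { refs.fetch_add(1); }
  bool Unref() { return refs.fetch_sub(1) == 1; }  // true => caller deletes
};

void WaitWithPin(RefCountedCfd* cfd) {
  cfd->Ref();  // pin before scheduling the flush and waiting on it
  // ... wait for the background flush of this column family to finish ...
  if (cfd->Unref()) {
    delete cfd;  // last reference; the real code does this under the DB mutex
  }
}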
+ InstrumentedMutexLock lock_guard(&mutex_); + delete tmp_cfd; + } + } } TEST_SYNC_POINT("FlushMemTableFinished"); return s; @@ -1614,8 +1658,12 @@ Status DBImpl::AtomicFlushMemTables( InstrumentedMutexLock guard_lock(&mutex_); WriteThread::Writer w; + WriteThread::Writer nonmem_w; if (!writes_stopped) { write_thread_.EnterUnbatched(&w, &mutex_); + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } } for (auto cfd : column_family_datas) { @@ -1643,6 +1691,15 @@ Status DBImpl::AtomicFlushMemTables( for (auto cfd : cfds) { cfd->imm()->FlushRequested(); } + // If the caller wants to wait for this flush to complete, it indicates + // that the caller expects the ColumnFamilyData not to be free'ed by + // other threads which may drop the column family concurrently. + // Therefore, we increase the cfd's ref count. + if (flush_options.wait) { + for (auto cfd : cfds) { + cfd->Ref(); + } + } GenerateFlushRequest(cfds, &flush_req); SchedulePendingFlush(flush_req, flush_reason); MaybeScheduleFlushOrCompaction(); @@ -1650,10 +1707,13 @@ Status DBImpl::AtomicFlushMemTables( if (!writes_stopped) { write_thread_.ExitUnbatched(&w); + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } } } TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush"); - + TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush"); if (s.ok() && flush_options.wait) { autovector flush_memtable_ids; for (auto& iter : flush_req) { @@ -1661,6 +1721,13 @@ Status DBImpl::AtomicFlushMemTables( } s = WaitForFlushMemTables(cfds, flush_memtable_ids, (flush_reason == FlushReason::kErrorRecovery)); + for (auto* cfd : cfds) { + if (cfd->Unref()) { + // Only one thread can reach here. + InstrumentedMutexLock lock_guard(&mutex_); + delete cfd; + } + } } return s; } @@ -1695,7 +1762,10 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, cfd->GetName().c_str()); bg_cv_.Wait(); } - if (cfd->IsDropped() || shutting_down_.load(std::memory_order_acquire)) { + if (cfd->IsDropped()) { + return Status::ColumnFamilyDropped(); + } + if (shutting_down_.load(std::memory_order_acquire)) { return Status::ShutdownInProgress(); } @@ -1810,6 +1880,14 @@ Status DBImpl::EnableAutoCompaction( return s; } +void DBImpl::DisableManualCompaction() { + manual_compaction_paused_.store(true, std::memory_order_release); +} + +void DBImpl::EnableManualCompaction() { + manual_compaction_paused_.store(false, std::memory_order_release); +} + void DBImpl::MaybeScheduleFlushOrCompaction() { mutex_.AssertHeld(); if (!opened_successfully_) { @@ -1995,7 +2073,7 @@ void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync, FileType type, uint64_t number, int job_id) { mutex_.AssertHeld(); PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id); - purge_queue_.push_back(std::move(file_info)); + purge_files_.insert({{number, std::move(file_info)}}); } void DBImpl::BGWorkFlush(void* arg) { @@ -2004,7 +2082,8 @@ void DBImpl::BGWorkFlush(void* arg) { IOSTATS_SET_THREAD_POOL_ID(fta.thread_pri_); TEST_SYNC_POINT("DBImpl::BGWorkFlush"); - reinterpret_cast(fta.db_)->BackgroundCallFlush(fta.thread_pri_); + static_cast_with_check(fta.db_)->BackgroundCallFlush( + fta.thread_pri_); TEST_SYNC_POINT("DBImpl::BGWorkFlush:done"); } @@ -2015,7 +2094,7 @@ void DBImpl::BGWorkCompaction(void* arg) { TEST_SYNC_POINT("DBImpl::BGWorkCompaction"); auto prepicked_compaction = static_cast(ca.prepicked_compaction); - reinterpret_cast(ca.db)->BackgroundCallCompaction( + 
static_cast_with_check(ca.db)->BackgroundCallCompaction( prepicked_compaction, Env::Priority::LOW); delete prepicked_compaction; } @@ -2080,6 +2159,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, autovector bg_flush_args; std::vector& superversion_contexts = job_context->superversion_contexts; + autovector column_families_not_to_flush; while (!flush_queue_.empty()) { // This cfd is already referenced const FlushRequest& flush_req = PopFirstFromFlushQueue(); @@ -2090,9 +2170,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, ColumnFamilyData* cfd = iter.first; if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) { // can't flush this CF, try next one - if (cfd->Unref()) { - delete cfd; - } + column_families_not_to_flush.push_back(cfd); continue; } superversion_contexts.emplace_back(SuperVersionContext(true)); @@ -2120,6 +2198,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, } status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress, job_context, log_buffer, thread_pri); + TEST_SYNC_POINT("DBImpl::BackgroundFlush:BeforeFlush"); // All the CFDs in the FlushReq must have the same flush reason, so just // grab the first one *reason = bg_flush_args[0].cfd_->GetFlushReason(); @@ -2131,6 +2210,11 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, } } } + for (auto cfd : column_families_not_to_flush) { + if (cfd->Unref()) { + delete cfd; + } + } return status; } @@ -2147,13 +2231,14 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { assert(bg_flush_scheduled_); num_running_flushes_++; - auto pending_outputs_inserted_elem = - CaptureCurrentFileNumberInPendingOutputs(); + std::unique_ptr::iterator> + pending_outputs_inserted_elem(new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); FlushReason reason; Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer, &reason, thread_pri); - if (!s.ok() && !s.IsShutdownInProgress() && + if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() && reason != FlushReason::kErrorRecovery) { // Wait a little bit before retrying background flush in // case this is an environmental problem and we do not want to @@ -2178,7 +2263,8 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { // If flush failed, we want to delete all temporary files that we might have // created. 
Thus, we force full scan in FindObsoleteFiles() - FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress()); + FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && + !s.IsColumnFamilyDropped()); // delete unnecessary files if any, this is done outside the mutex if (job_context.HaveSomethingToClean() || job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { @@ -2228,8 +2314,9 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, num_running_compactions_++; - auto pending_outputs_inserted_elem = - CaptureCurrentFileNumberInPendingOutputs(); + std::unique_ptr::iterator> + pending_outputs_inserted_elem(new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); assert((bg_thread_pri == Env::Priority::BOTTOM && bg_bottom_compaction_scheduled_) || @@ -2242,7 +2329,8 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, mutex_.Unlock(); env_->SleepForMicroseconds(10000); // prevent hot loop mutex_.Lock(); - } else if (!s.ok() && !s.IsShutdownInProgress()) { + } else if (!s.ok() && !s.IsShutdownInProgress() && + !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) { // Wait a little bit before retrying background compaction in // case this is an environmental problem and we do not want to // chew up resources for failed compactions for the duration of @@ -2259,6 +2347,11 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, LogFlush(immutable_db_options_.info_log); env_->SleepForMicroseconds(1000000); mutex_.Lock(); + } else if (s.IsManualCompactionPaused()) { + ManualCompactionState* m = prepicked_compaction->manual_compaction_state; + assert(m); + ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused", + m->cfd->GetName().c_str(), job_context.job_id); } ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); @@ -2266,7 +2359,9 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, // If compaction failed, we want to delete all temporary files that we might // have created (they might not be all recorded in job_context in case of a // failure). 
Thus, we force full scan in FindObsoleteFiles() - FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress()); + FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && + !s.IsManualCompactionPaused() && + !s.IsColumnFamilyDropped()); TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"); // delete unnecessary files if any, this is done outside the mutex @@ -2349,6 +2444,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, if (!error_handler_.IsBGWorkStopped()) { if (shutting_down_.load(std::memory_order_acquire)) { status = Status::ShutdownInProgress(); + } else if (is_manual && + manual_compaction_paused_.load(std::memory_order_acquire)) { + status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); } } else { status = error_handler_.GetBGError(); @@ -2573,7 +2671,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, c->edit()->AddFile(c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, - f->fd.largest_seqno, f->marked_for_compaction); + f->fd.largest_seqno, f->marked_for_compaction, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time); ROCKS_LOG_BUFFER( log_buffer, @@ -2650,9 +2750,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, GetSnapshotContext(job_context, &snapshot_seqs, &earliest_write_conflict_snapshot, &snapshot_checker); assert(is_snapshot_supported_ || snapshots_.empty()); - SnapshotListFetchCallbackImpl fetch_callback( - this, env_, c->mutable_cf_options()->snap_refresh_nanos, - immutable_db_options_.info_log.get()); CompactionJob compaction_job( job_context->job_id, c.get(), immutable_db_options_, env_options_for_compaction_, versions_.get(), &shutting_down_, @@ -2663,14 +2760,15 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, &compaction_job_stats, thread_pri, - immutable_db_options_.max_subcompactions <= 1 ? &fetch_callback - : nullptr); + is_manual ? 
&manual_compaction_paused_ : nullptr); compaction_job.Prepare(); NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, compaction_job_stats, job_context->job_id); mutex_.Unlock(); + TEST_SYNC_POINT_CALLBACK( + "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr); compaction_job.Run(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); mutex_.Lock(); @@ -2702,9 +2800,10 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, compaction_job_stats, job_context->job_id); } - if (status.ok() || status.IsCompactionTooLarge()) { + if (status.ok() || status.IsCompactionTooLarge() || + status.IsManualCompactionPaused()) { // Done - } else if (status.IsShutdownInProgress()) { + } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) { // Ignore compaction errors found during shutting down } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s", @@ -2788,7 +2887,7 @@ void DBImpl::RemoveManualCompaction(DBImpl::ManualCompactionState* m) { it = manual_compaction_dequeue_.erase(it); return; } - it++; + ++it; } assert(false); return; @@ -2809,7 +2908,7 @@ bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) { bool seen = false; while (it != manual_compaction_dequeue_.end()) { if (m == (*it)) { - it++; + ++it; seen = true; continue; } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) { @@ -2818,7 +2917,7 @@ bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) { // and (*it) is ahead in the queue and is not yet in progress return true; } - it++; + ++it; } return false; } @@ -2836,7 +2935,7 @@ bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) { // in progress return true; } - it++; + ++it; } return false; } @@ -2849,7 +2948,7 @@ bool DBImpl::HasExclusiveManualCompaction() { if ((*it)->exclusive) { return true; } - it++; + ++it; } return false; } @@ -2883,9 +2982,13 @@ void DBImpl::BuildCompactionJobInfo( compaction_job_info->compression = c->output_compression(); for (size_t i = 0; i < c->num_input_levels(); ++i) { for (const auto fmd : *c->inputs(i)) { - auto fn = TableFileName(c->immutable_cf_options()->cf_paths, - fmd->fd.GetNumber(), fmd->fd.GetPathId()); + const FileDescriptor& desc = fmd->fd; + const uint64_t file_number = desc.GetNumber(); + auto fn = TableFileName(c->immutable_cf_options()->cf_paths, file_number, + desc.GetPathId()); compaction_job_info->input_files.push_back(fn); + compaction_job_info->input_file_infos.push_back(CompactionFileInfo{ + static_cast(i), file_number, fmd->oldest_blob_file_number}); if (compaction_job_info->table_properties.count(fn) == 0) { std::shared_ptr tp; auto s = current->GetTableProperties(&tp, fmd, &fn); @@ -2896,9 +2999,13 @@ void DBImpl::BuildCompactionJobInfo( } } for (const auto& newf : c->edit()->GetNewFiles()) { - compaction_job_info->output_files.push_back( - TableFileName(c->immutable_cf_options()->cf_paths, - newf.second.fd.GetNumber(), newf.second.fd.GetPathId())); + const FileMetaData& meta = newf.second; + const FileDescriptor& desc = meta.fd; + const uint64_t file_number = desc.GetNumber(); + compaction_job_info->output_files.push_back(TableFileName( + c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId())); + compaction_job_info->output_file_infos.push_back(CompactionFileInfo{ + newf.first, file_number, meta.oldest_blob_file_number}); } } #endif @@ -2957,34 +3064,20 @@ void DBImpl::InstallSuperVersionAndScheduleWork( } // ShouldPurge is called by FindObsoleteFiles when doing a full scan, -// and db 
mutex (mutex_) should already be held. This function performs a -// linear scan of an vector (files_grabbed_for_purge_) in search of a -// certain element. We expect FindObsoleteFiles with full scan to occur once -// every 10 hours by default, and the size of the vector is small. -// Therefore, the cost is affordable even if the mutex is held. +// and db mutex (mutex_) should already be held. // Actually, the current implementation of FindObsoleteFiles with // full_scan=true can issue I/O requests to obtain list of files in // directories, e.g. env_->getChildren while holding db mutex. -// In the future, if we want to reduce the cost of search, we may try to keep -// the vector sorted. bool DBImpl::ShouldPurge(uint64_t file_number) const { - for (auto fn : files_grabbed_for_purge_) { - if (file_number == fn) { - return false; - } - } - for (const auto& purge_file_info : purge_queue_) { - if (purge_file_info.number == file_number) { - return false; - } - } - return true; + return files_grabbed_for_purge_.find(file_number) == + files_grabbed_for_purge_.end() && + purge_files_.find(file_number) == purge_files_.end(); } // MarkAsGrabbedForPurge is called by FindObsoleteFiles, and db mutex // (mutex_) should already be held. void DBImpl::MarkAsGrabbedForPurge(uint64_t file_number) { - files_grabbed_for_purge_.emplace_back(file_number); + files_grabbed_for_purge_.insert(file_number); } void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) { diff --git a/db/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc similarity index 90% rename from db/db_impl_debug.cc rename to db/db_impl/db_impl_debug.cc index f558971190e..566c175735a 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -9,12 +9,13 @@ #ifndef NDEBUG -#include "db/db_impl.h" +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" #include "db/error_handler.h" #include "monitoring/thread_status_updater.h" +#include "util/cast_util.h" namespace rocksdb { - uint64_t DBImpl::TEST_GetLevel0TotalSize() { InstrumentedMutexLock l(&mutex_); return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0); @@ -104,7 +105,16 @@ Status DBImpl::TEST_SwitchMemtable(ColumnFamilyData* cfd) { if (cfd == nullptr) { cfd = default_cf_handle_->cfd(); } - return SwitchMemtable(cfd, &write_context); + + if (two_write_queues_) { + WriteThread::Writer nonmem_w; + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + Status s = SwitchMemtable(cfd, &write_context); + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + return s; + } else { + return SwitchMemtable(cfd, &write_context); + } } Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall, @@ -122,6 +132,16 @@ Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall, return FlushMemTable(cfd, fo, FlushReason::kTest); } +Status DBImpl::TEST_FlushMemTable(ColumnFamilyData* cfd, + const FlushOptions& flush_opts) { + return FlushMemTable(cfd, flush_opts, FlushReason::kTest); +} + +Status DBImpl::TEST_AtomicFlushMemTables( + const autovector& cfds, const FlushOptions& flush_opts) { + return AtomicFlushMemTables(cfds, flush_opts, FlushReason::kTest); +} + Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) { ColumnFamilyData* cfd; if (column_family == nullptr) { @@ -262,8 +282,8 @@ bool DBImpl::TEST_IsPersistentStatsEnabled() const { return thread_persist_stats_ && thread_persist_stats_->IsRunning(); } -size_t DBImpl::TEST_EstiamteStatsHistorySize() const { - return EstiamteStatsHistorySize(); +size_t 
DBImpl::TEST_EstimateInMemoryStatsHistorySize() const { + return EstimateInMemoryStatsHistorySize(); } } // namespace rocksdb #endif // NDEBUG diff --git a/db/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc similarity index 96% rename from db/db_impl_experimental.cc rename to db/db_impl/db_impl_experimental.cc index 47a880199e2..9a6e85ea6f5 100644 --- a/db/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -7,13 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include "db/column_family.h" @@ -132,7 +128,8 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, - f->marked_for_compaction); + f->marked_for_compaction, f->oldest_blob_file_number, + f->oldest_ancester_time, f->file_creation_time); } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), diff --git a/db/db_impl_files.cc b/db/db_impl/db_impl_files.cc similarity index 97% rename from db/db_impl_files.cc rename to db/db_impl/db_impl_files.cc index b16cf87947d..1fa2884062c 100644 --- a/db/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -6,18 +6,16 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include #include #include "db/event_helpers.h" #include "db/memtable_list.h" -#include "util/file_util.h" -#include "util/sst_file_manager_impl.h" +#include "file/file_util.h" +#include "file/sst_file_manager_impl.h" +#include "util/autovector.h" namespace rocksdb { @@ -261,7 +259,8 @@ void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname, Status file_deletion_status; if (type == kTableFile || type == kLogFile) { file_deletion_status = - DeleteDBFile(&immutable_db_options_, fname, path_to_sync); + DeleteDBFile(&immutable_db_options_, fname, path_to_sync, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path_); } else { file_deletion_status = env_->DeleteFile(fname); } @@ -318,11 +317,9 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { candidate_files.size() + state.sst_delete_files.size() + state.log_delete_files.size() + state.manifest_delete_files.size()); // We may ignore the dbname when generating the file names. 
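// Illustrative sketch only, not part of this diff: with purge_files_ now an
// unordered_map keyed by file number and files_grabbed_for_purge_ an
// unordered_set (see the ShouldPurge / MarkAsGrabbedForPurge changes above),
// the purge bookkeeping becomes two O(1) lookups instead of linear scans.
// Simplified stand-in:
#include <cstdint>
#include <string>
#include <unordered_map>
#include <unordered_set>

struct PurgeBookkeeping {
  std::unordered_set<uint64_t> grabbed_for_purge;
  std::unordered_map<uint64_t, std::string> pending_purge;  // number -> fname

  bool ShouldPurge(uint64_t file_number) const {
    return grabbed_for_purge.find(file_number) == grabbed_for_purge.end() &&
           pending_purge.find(file_number) == pending_purge.end();
  }
  void MarkAsGrabbedForPurge(uint64_t file_number) {
    grabbed_for_purge.insert(file_number);
  }
};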
- const char* kDumbDbName = ""; for (auto& file : state.sst_delete_files) { candidate_files.emplace_back( - MakeTableFileName(kDumbDbName, file.metadata->fd.GetNumber()), - file.path); + MakeTableFileName(file.metadata->fd.GetNumber()), file.path); if (file.metadata->table_reader_handle) { table_cache_->Release(file.metadata->table_reader_handle); } @@ -331,7 +328,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { for (auto file_num : state.log_delete_files) { if (file_num > 0) { - candidate_files.emplace_back(LogFileName(kDumbDbName, file_num), + candidate_files.emplace_back(LogFileName(file_num), immutable_db_options_.wal_dir); } } @@ -382,6 +379,12 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { } } + // Close WALs before trying to delete them. + for (const auto w : state.logs_to_free) { + // TODO: maybe check the return value of Close. + w->Close(); + } + std::unordered_set files_to_del; for (const auto& candidate_file : candidate_files) { const std::string& to_delete = candidate_file.file_name; @@ -481,11 +484,6 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { } #endif // !ROCKSDB_LITE - for (const auto w : state.logs_to_free) { - // TODO: maybe check the return value of Close. - w->Close(); - } - Status file_deletion_status; if (schedule_only) { InstrumentedMutexLock guard_lock(&mutex_); @@ -499,13 +497,15 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { // After purging obsolete files, remove them from files_grabbed_for_purge_. // Use a temporary vector to perform bulk deletion via swap. InstrumentedMutexLock guard_lock(&mutex_); - std::vector tmp; + autovector to_be_removed; for (auto fn : files_grabbed_for_purge_) { - if (files_to_del.count(fn) == 0) { - tmp.emplace_back(fn); + if (files_to_del.count(fn) != 0) { + to_be_removed.emplace_back(fn); } } - files_grabbed_for_purge_.swap(tmp); + for (auto fn : to_be_removed) { + files_grabbed_for_purge_.erase(fn); + } } // Delete old info log files. diff --git a/db/db_impl_open.cc b/db/db_impl/db_impl_open.cc similarity index 82% rename from db/db_impl_open.cc rename to db/db_impl/db_impl_open.cc index 1bc69b49182..9ca0a940cc7 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -6,21 +6,21 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include "db/builder.h" #include "db/error_handler.h" +#include "file/read_write_util.h" +#include "file/sst_file_manager_impl.h" +#include "file/writable_file_writer.h" +#include "monitoring/persistent_stats_history.h" #include "options/options_helper.h" #include "rocksdb/wal_filter.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" +#include "test_util/sync_point.h" #include "util/rate_limiter.h" -#include "util/sst_file_manager_impl.h" -#include "util/sync_point.h" namespace rocksdb { Options SanitizeOptions(const std::string& dbname, const Options& src) { @@ -124,6 +124,25 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { } #ifndef ROCKSDB_LITE + ImmutableDBOptions immutable_db_options(result); + if (!IsWalDirSameAsDBPath(&immutable_db_options)) { + // Either the WAL dir and db_paths[0]/db_name are not the same, or we + // cannot tell for sure. In either case, assume they're different and + // explicitly cleanup the trash log files (bypass DeleteScheduler) + // Do this first so even if we end up calling + // DeleteScheduler::CleanupDirectory on the same dir later, it will be + // safe + std::vector filenames; + result.env->GetChildren(result.wal_dir, &filenames); + for (std::string& filename : filenames) { + if (filename.find(".log.trash", filename.length() - + std::string(".log.trash").length()) != + std::string::npos) { + std::string trash_file = result.wal_dir + "/" + filename; + result.env->DeleteFile(trash_file); + } + } + } // When the DB is stopped, it's possible that there are some .trash files that // were not deleted yet, when we open the DB we will find these .trash files // and schedule them to be deleted (or delete immediately if SstFileManager @@ -145,7 +164,6 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { } namespace { - Status SanitizeOptionsByTable( const DBOptions& db_opts, const std::vector& column_families) { @@ -158,52 +176,23 @@ Status SanitizeOptionsByTable( } return Status::OK(); } +} // namespace -static Status ValidateOptions( +Status DBImpl::ValidateOptions( const DBOptions& db_options, const std::vector& column_families) { Status s; - for (auto& cfd : column_families) { - s = CheckCompressionSupported(cfd.options); - if (s.ok() && db_options.allow_concurrent_memtable_write) { - s = CheckConcurrentWritesSupported(cfd.options); - } - if (s.ok()) { - s = CheckCFPathsSupported(db_options, cfd.options); - } + s = ColumnFamilyData::ValidateOptions(db_options, cfd.options); if (!s.ok()) { return s; } - - if (cfd.options.ttl > 0) { - if (db_options.max_open_files != -1) { - return Status::NotSupported( - "TTL is only supported when files are always " - "kept open (set max_open_files = -1). "); - } - if (cfd.options.table_factory->Name() != - BlockBasedTableFactory().Name()) { - return Status::NotSupported( - "TTL is only supported in Block-Based Table format. "); - } - } - - if (cfd.options.periodic_compaction_seconds > 0) { - if (db_options.max_open_files != -1) { - return Status::NotSupported( - "Periodic Compaction is only supported when files are always " - "kept open (set max_open_files = -1). "); - } - if (cfd.options.table_factory->Name() != - BlockBasedTableFactory().Name()) { - return Status::NotSupported( - "Periodic Compaction is only supported in " - "Block-Based Table format. 
"); - } - } } + s = ValidateOptions(db_options); + return s; +} +Status DBImpl::ValidateOptions(const DBOptions& db_options) { if (db_options.db_paths.size() > 4) { return Status::NotSupported( "More than four DB paths are not supported yet. "); @@ -228,17 +217,40 @@ static Status ValidateOptions( return Status::InvalidArgument("keep_log_file_num must be greater than 0"); } + if (db_options.unordered_write && + !db_options.allow_concurrent_memtable_write) { + return Status::InvalidArgument( + "unordered_write is incompatible with !allow_concurrent_memtable_write"); + } + + if (db_options.unordered_write && db_options.enable_pipelined_write) { + return Status::InvalidArgument( + "unordered_write is incompatible with enable_pipelined_write"); + } + + if (db_options.atomic_flush && db_options.enable_pipelined_write) { + return Status::InvalidArgument( + "atomic_flush is incompatible with enable_pipelined_write"); + } + return Status::OK(); } -} // namespace + Status DBImpl::NewDB() { VersionEdit new_db; + Status s = SetIdentityFile(env_, dbname_); + if (!s.ok()) { + return s; + } + if (immutable_db_options_.write_dbid_to_manifest) { + std::string temp_db_id; + GetDbIdentityFromIdentityFile(&temp_db_id); + new_db.SetDBId(temp_db_id); + } new_db.SetLogNumber(0); new_db.SetNextFile(2); new_db.SetLastSequence(0); - Status s; - ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n"); const std::string manifest = DescriptorFileName(dbname_, 1); { @@ -286,9 +298,9 @@ Status DBImpl::CreateAndNewDirectory(Env* env, const std::string& dirname, return env->NewDirectory(dirname, directory); } -Status DBImpl::Directories::SetDirectories( - Env* env, const std::string& dbname, const std::string& wal_dir, - const std::vector& data_paths) { +Status Directories::SetDirectories(Env* env, const std::string& dbname, + const std::string& wal_dir, + const std::vector& data_paths) { Status s = DBImpl::CreateAndNewDirectory(env, dbname, &db_dir_); if (!s.ok()) { return s; @@ -341,6 +353,9 @@ Status DBImpl::Recover( s = env_->FileExists(CurrentFileName(dbname_)); if (s.IsNotFound()) { if (immutable_db_options_.create_if_missing) { + // Has to be called only after Identity File creation is successful + // because DB ID is stored in Manifest if + // immutable_db_options_.write_dbid_to_manifest = true s = NewDB(); is_new_db = true; if (!s.ok()) { @@ -360,30 +375,19 @@ Status DBImpl::Recover( assert(s.IsIOError()); return s; } - // Check for the IDENTITY file and create it if not there - s = env_->FileExists(IdentityFileName(dbname_)); - if (s.IsNotFound()) { - s = SetIdentityFile(env_, dbname_); - if (!s.ok()) { - return s; - } - } else if (!s.ok()) { - assert(s.IsIOError()); - return s; - } // Verify compatibility of env_options_ and filesystem { std::unique_ptr idfile; EnvOptions customized_env(env_options_); customized_env.use_direct_reads |= immutable_db_options_.use_direct_io_for_flush_and_compaction; - s = env_->NewRandomAccessFile(IdentityFileName(dbname_), &idfile, + s = env_->NewRandomAccessFile(CurrentFileName(dbname_), &idfile, customized_env); if (!s.ok()) { std::string error_str = s.ToString(); // Check if unsupported Direct I/O is the root cause customized_env.use_direct_reads = false; - s = env_->NewRandomAccessFile(IdentityFileName(dbname_), &idfile, + s = env_->NewRandomAccessFile(CurrentFileName(dbname_), &idfile, customized_env); if (s.ok()) { return Status::InvalidArgument( @@ -395,8 +399,43 @@ Status DBImpl::Recover( } } } + assert(db_id_.empty()); + Status s = 
versions_->Recover(column_families, read_only, &db_id_); + if (!s.ok()) { + return s; + } + // Happens when immutable_db_options_.write_dbid_to_manifest is set to true + // the very first time. + if (db_id_.empty()) { + // Check for the IDENTITY file and create it if not there. + s = env_->FileExists(IdentityFileName(dbname_)); + // Typically Identity file is created in NewDB() and for some reason if + // it is no longer available then at this point DB ID is not in Identity + // file or Manifest. + if (s.IsNotFound()) { + s = SetIdentityFile(env_, dbname_); + if (!s.ok()) { + return s; + } + } else if (!s.ok()) { + assert(s.IsIOError()); + return s; + } + GetDbIdentityFromIdentityFile(&db_id_); + if (immutable_db_options_.write_dbid_to_manifest) { + VersionEdit edit; + edit.SetDBId(db_id_); + Options options; + MutableCFOptions mutable_cf_options(options); + versions_->db_id_ = db_id_; + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options, &edit, &mutex_, nullptr, + false); + } + } else { + SetIdentityFile(env_, dbname_, db_id_); + } - Status s = versions_->Recover(column_families, read_only); if (immutable_db_options_.paranoid_checks && s.ok()) { s = CheckConsistency(); } @@ -408,6 +447,10 @@ Status DBImpl::Recover( } } } + // DB mutex is already held + if (s.ok() && immutable_db_options_.persist_stats_to_disk) { + s = InitPersistStatsColumnFamily(); + } // Initial max_total_in_memory_state_ before recovery logs. Log recovery // may check this value to decide whether to flush. @@ -423,6 +466,8 @@ Status DBImpl::Recover( default_cf_handle_ = new ColumnFamilyHandleImpl( versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats(); + // TODO(Zhongyi): handle single_column_family_mode_ when + // persistent_stats is enabled single_column_family_mode_ = versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1; @@ -518,6 +563,98 @@ Status DBImpl::Recover( return s; } +Status DBImpl::PersistentStatsProcessFormatVersion() { + mutex_.AssertHeld(); + Status s; + // persist version when stats CF doesn't exist + bool should_persist_format_version = !persistent_stats_cfd_exists_; + mutex_.Unlock(); + if (persistent_stats_cfd_exists_) { + // Check persistent stats format version compatibility. Drop and recreate + // persistent stats CF if format version is incompatible + uint64_t format_version_recovered = 0; + Status s_format = DecodePersistentStatsVersionNumber( + this, StatsVersionKeyType::kFormatVersion, &format_version_recovered); + uint64_t compatible_version_recovered = 0; + Status s_compatible = DecodePersistentStatsVersionNumber( + this, StatsVersionKeyType::kCompatibleVersion, + &compatible_version_recovered); + // abort reading from existing stats CF if any of following is true: + // 1. failed to read format version or compatible version from disk + // 2. sst's format version is greater than current format version, meaning + // this sst is encoded with a newer RocksDB release, and current compatible + // version is below the sst's compatible version + if (!s_format.ok() || !s_compatible.ok() || + (kStatsCFCurrentFormatVersion < format_version_recovered && + kStatsCFCompatibleFormatVersion < compatible_version_recovered)) { + if (!s_format.ok() || !s_compatible.ok()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Reading persistent stats version key failed. 
Format key: %s, " + "compatible key: %s", + s_format.ToString().c_str(), s_compatible.ToString().c_str()); + } else { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Disable persistent stats due to corrupted or incompatible format " + "version\n"); + } + DropColumnFamily(persist_stats_cf_handle_); + DestroyColumnFamilyHandle(persist_stats_cf_handle_); + ColumnFamilyHandle* handle = nullptr; + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); + persist_stats_cf_handle_ = static_cast(handle); + // should also persist version here because old stats CF is discarded + should_persist_format_version = true; + } + } + if (s.ok() && should_persist_format_version) { + // Persistent stats CF being created for the first time, need to write + // format version key + WriteBatch batch; + batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString, + ToString(kStatsCFCurrentFormatVersion)); + batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString, + ToString(kStatsCFCompatibleFormatVersion)); + WriteOptions wo; + wo.low_pri = true; + wo.no_slowdown = true; + wo.sync = false; + s = Write(wo, &batch); + } + mutex_.Lock(); + return s; +} + +Status DBImpl::InitPersistStatsColumnFamily() { + mutex_.AssertHeld(); + assert(!persist_stats_cf_handle_); + ColumnFamilyData* persistent_stats_cfd = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + persistent_stats_cfd_exists_ = persistent_stats_cfd != nullptr; + + Status s; + if (persistent_stats_cfd != nullptr) { + // We are recovering from a DB which already contains persistent stats CF, + // the CF is already created in VersionSet::ApplyOneVersionEdit, but + // column family handle was not. Need to explicitly create handle here. + persist_stats_cf_handle_ = + new ColumnFamilyHandleImpl(persistent_stats_cfd, this, &mutex_); + } else { + mutex_.Unlock(); + ColumnFamilyHandle* handle = nullptr; + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); + persist_stats_cf_handle_ = static_cast(handle); + mutex_.Lock(); + } + return s; +} + // REQUIRES: log_numbers are sorted in ascending order Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, SequenceNumber* next_sequence, bool read_only) { @@ -577,12 +714,13 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, bool stop_replay_for_corruption = false; bool flushed = false; uint64_t corrupted_log_number = kMaxSequenceNumber; + uint64_t min_log_number = MinLogNumberToKeep(); for (auto log_number : log_numbers) { - if (log_number < versions_->min_log_number_to_keep_2pc()) { + if (log_number < min_log_number) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Skipping log #%" PRIu64 " since it is older than min log to keep #%" PRIu64, - log_number, versions_->min_log_number_to_keep_2pc()); + log_number, min_log_number); continue; } // The previous incarnation may not have written any MANIFEST @@ -623,7 +761,8 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, continue; } } - file_reader.reset(new SequentialFileReader(std::move(file), fname)); + file_reader.reset(new SequentialFileReader( + std::move(file), fname, immutable_db_options_.log_readahead_size)); } // Create the log reader. 
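The recovery path above stores the DB ID in the MANIFEST when write_dbid_to_manifest is set (earlier hunks) and manages a dedicated persistent-stats column family, including a format-version check, when persist_stats_to_disk is set. From the user side both are plain DBOptions flags; a small usage sketch, where the path and the stats period are placeholders:

#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Record the DB ID in the MANIFEST in addition to the IDENTITY file.
  options.write_dbid_to_manifest = true;
  // Keep the stats history in an internal column family on disk instead of
  // only in memory; a sample is taken every stats_persist_period_sec seconds.
  options.persist_stats_to_disk = true;
  options.stats_persist_period_sec = 600;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/rocksdb_example", &db);
  assert(s.ok());

  std::string db_id;
  s = db->GetDbIdentity(db_id);  // the same ID that is now also in the MANIFEST
  assert(s.ok());

  delete db;
  return 0;
}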
@@ -763,9 +902,10 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // That's why we set ignore missing column families to true bool has_valid_writes = false; status = WriteBatchInternal::InsertInto( - &batch, column_family_memtables_.get(), &flush_scheduler_, true, - log_number, this, false /* concurrent_memtable_writes */, - next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_); + &batch, column_family_memtables_.get(), &flush_scheduler_, + &trim_history_scheduler_, true, log_number, this, + false /* concurrent_memtable_writes */, next_sequence, + &has_valid_writes, seq_per_batch_, batch_per_txn_); MaybeIgnoreError(&status); if (!status.ok()) { // We are treating this as a failure while reading since we read valid @@ -832,6 +972,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, } flush_scheduler_.Clear(); + trim_history_scheduler_.Clear(); auto last_sequence = *next_sequence - 1; if ((*next_sequence != kMaxSequenceNumber) && (versions_->LastSequence() <= last_sequence)) { @@ -882,6 +1023,9 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, continue; } + TEST_SYNC_POINT_CALLBACK( + "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable", /*arg=*/nullptr); + // flush the final memtable (if non-empty) if (cfd->mem()->GetFirstSequenceNumber() != 0) { // If flush happened in the middle of recovery (e.g. due to memtable @@ -901,7 +1045,9 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, data_seen = true; } - // write MANIFEST with update + // Update the log number info in the version edit corresponding to this + // column family. Note that the version edits will be written to MANIFEST + // together later. // writing log_number in the manifest means that any log file // with number strongly less than (log_number + 1) is already // recovered and should be ignored on next reincarnation. @@ -910,17 +1056,28 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) { edit->SetLogNumber(max_log_number + 1); } + } + if (status.ok()) { // we must mark the next log number as used, even though it's // not actually used. 
that is because VersionSet assumes // VersionSet::next_file_number_ always to be strictly greater than any // log number versions_->MarkFileNumberUsed(max_log_number + 1); - status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - edit, &mutex_); - if (!status.ok()) { - // Recovery failed - break; + + autovector cfds; + autovector cf_opts; + autovector> edit_lists; + for (auto* cfd : *versions_->GetColumnFamilySet()) { + cfds.push_back(cfd); + cf_opts.push_back(cfd->GetLatestMutableCFOptions()); + auto iter = version_edits.find(cfd->GetID()); + assert(iter != version_edits.end()); + edit_lists.push_back({&iter->second}); } + // write MANIFEST with update + status = versions_->LogAndApply(cfds, cf_opts, edit_lists, &mutex_, + directories_.GetDbDir(), + /*new_descriptor_log=*/true); } } @@ -993,8 +1150,9 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, mutex_.AssertHeld(); const uint64_t start_micros = env_->NowMicros(); FileMetaData meta; - auto pending_outputs_inserted_elem = - CaptureCurrentFileNumberInPendingOutputs(); + std::unique_ptr::iterator> pending_outputs_inserted_elem( + new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); ReadOptions ro; ro.total_order_seek = true; @@ -1017,6 +1175,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, int64_t _current_time = 0; env_->GetCurrentTime(&_current_time); // ignore error const uint64_t current_time = static_cast(_current_time); + meta.oldest_ancester_time = current_time; { auto write_hint = cfd->CalculateSSTWriteHint(0); @@ -1066,7 +1225,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), meta.smallest, meta.largest, meta.fd.smallest_seqno, meta.fd.largest_seqno, - meta.marked_for_compaction); + meta.marked_for_compaction, meta.oldest_blob_file_number, + meta.oldest_ancester_time, meta.file_creation_time); } InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); @@ -1086,12 +1246,23 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { std::vector column_families; column_families.push_back( ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + if (db_options.persist_stats_to_disk) { + column_families.push_back( + ColumnFamilyDescriptor(kPersistentStatsColumnFamilyName, cf_options)); + } std::vector handles; Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr); if (s.ok()) { - assert(handles.size() == 1); + if (db_options.persist_stats_to_disk) { + assert(handles.size() == 2); + } else { + assert(handles.size() == 1); + } // i can delete the handle since DBImpl is always holding a reference to // default column family + if (db_options.persist_stats_to_disk && handles[1] != nullptr) { + delete handles[1]; + } delete handles[0]; } return s; @@ -1204,6 +1375,9 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, delete impl; return s; } + + impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_); + impl->mutex_.Lock(); // Handles create_if_missing, error_if_exists s = impl->Recover(column_families); @@ -1268,6 +1442,10 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, s = impl->directories_.GetDbDir()->Fsync(); } } + if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { + // try to read format version but no 
need to fail Open() even if it fails + s = impl->PersistentStatsProcessFormatVersion(); + } if (s.ok()) { for (auto cfd : *impl->versions_->GetColumnFamilySet()) { diff --git a/db/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc similarity index 98% rename from db/db_impl_readonly.cc rename to db/db_impl/db_impl_readonly.cc index 5d7515c28e2..8a8a9c9d051 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl/db_impl_readonly.cc @@ -3,10 +3,11 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "db/db_impl_readonly.h" +#include "db/db_impl/db_impl_readonly.h" +#include "db/arena_wrapped_db_iter.h" #include "db/compacted_db_impl.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/merge_context.h" #include "monitoring/perf_context_imp.h" diff --git a/db/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h similarity index 89% rename from db/db_impl_readonly.h rename to db/db_impl/db_impl_readonly.h index 23816210dc8..9f7ad17a475 100644 --- a/db/db_impl_readonly.h +++ b/db/db_impl/db_impl_readonly.h @@ -9,13 +9,17 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { class DBImplReadOnly : public DBImpl { public: DBImplReadOnly(const DBOptions& options, const std::string& dbname); + // No copying allowed + DBImplReadOnly(const DBImplReadOnly&) = delete; + void operator=(const DBImplReadOnly&) = delete; + virtual ~DBImplReadOnly(); // Implementations of the DB interface @@ -115,12 +119,18 @@ class DBImplReadOnly : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + private: friend class DB; - - // No copying allowed - DBImplReadOnly(const DBImplReadOnly&); - void operator=(const DBImplReadOnly&); }; } // namespace rocksdb diff --git a/db/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc similarity index 63% rename from db/db_impl_secondary.cc rename to db/db_impl/db_impl_secondary.cc index 90e979b4e58..8eac41dedb8 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -3,22 +3,18 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
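The read-only implementation above rejects every mutating operation with Status::NotSupported, now including the newer CreateColumnFamilyWithImport entry point. From a caller's perspective, assuming a database already exists at the placeholder path /tmp/rocksdb_example:

#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  rocksdb::DB* db = nullptr;
  // Open an existing database without write access.
  rocksdb::Status s =
      rocksdb::DB::OpenForReadOnly(options, "/tmp/rocksdb_example", &db);
  assert(s.ok());

  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "some_key", &value);  // reads work

  // Every mutating entry point is rejected by the read-only implementation.
  s = db->Put(rocksdb::WriteOptions(), "some_key", "new_value");
  assert(s.IsNotSupported());

  delete db;
  return 0;
}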
-#include "db/db_impl_secondary.h" +#include "db/db_impl/db_impl_secondary.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include -#include "db/db_iter.h" +#include "db/arena_wrapped_db_iter.h" #include "db/merge_context.h" +#include "logging/auto_roll_logger.h" #include "monitoring/perf_context_imp.h" -#include "util/auto_roll_logger.h" namespace rocksdb { #ifndef ROCKSDB_LITE - DBImplSecondary::DBImplSecondary(const DBOptions& db_options, const std::string& dbname) : DBImpl(db_options, dbname) { @@ -35,6 +31,7 @@ Status DBImplSecondary::Recover( bool /*error_if_data_exists_in_logs*/) { mutex_.AssertHeld(); + JobContext job_context(0); Status s; s = static_cast(versions_.get()) ->Recover(column_families, &manifest_reader_, &manifest_reporter_, @@ -59,49 +56,73 @@ Status DBImplSecondary::Recover( single_column_family_mode_ = versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1; - // Recover from all newer log files than the ones named in the - // descriptor. - std::vector filenames; - s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames); - if (s.IsNotFound()) { - return Status::InvalidArgument("Failed to open wal_dir", - immutable_db_options_.wal_dir); - } else if (!s.ok()) { - return s; - } + std::unordered_set cfds_changed; + s = FindAndRecoverLogFiles(&cfds_changed, &job_context); + } - std::vector logs; - // if log_readers_ is non-empty, it means we have applied all logs with log - // numbers smaller than the smallest log in log_readers_, so there is no - // need to pass these logs to RecoverLogFiles - uint64_t log_number_min = 0; - if (log_readers_.size() > 0) { - log_number_min = log_readers_.begin()->first; - } - for (size_t i = 0; i < filenames.size(); i++) { - uint64_t number; - FileType type; - if (ParseFileName(filenames[i], &number, &type) && type == kLogFile && - number >= log_number_min) { - logs.push_back(number); - } - } + if (s.IsPathNotFound()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Secondary tries to read WAL, but WAL file(s) have already " + "been purged by primary."); + s = Status::OK(); + } + // TODO: update options_file_number_ needed? - if (!logs.empty()) { - // Recover in the order in which the logs were generated - std::sort(logs.begin(), logs.end()); - SequenceNumber next_sequence(kMaxSequenceNumber); - s = RecoverLogFiles(logs, &next_sequence, true /*read_only*/); - } + job_context.Clean(); + return s; +} + +// find new WAL and apply them in order to the secondary instance +Status DBImplSecondary::FindAndRecoverLogFiles( + std::unordered_set* cfds_changed, + JobContext* job_context) { + assert(nullptr != cfds_changed); + assert(nullptr != job_context); + Status s; + std::vector logs; + s = FindNewLogNumbers(&logs); + if (s.ok() && !logs.empty()) { + SequenceNumber next_sequence(kMaxSequenceNumber); + s = RecoverLogFiles(logs, &next_sequence, cfds_changed, job_context); } + return s; +} - // TODO: update options_file_number_ needed? 
+// List wal_dir and find all new WALs, return these log numbers +Status DBImplSecondary::FindNewLogNumbers(std::vector* logs) { + assert(logs != nullptr); + std::vector filenames; + Status s; + s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames); + if (s.IsNotFound()) { + return Status::InvalidArgument("Failed to open wal_dir", + immutable_db_options_.wal_dir); + } else if (!s.ok()) { + return s; + } + // if log_readers_ is non-empty, it means we have applied all logs with log + // numbers smaller than the smallest log in log_readers_, so there is no + // need to pass these logs to RecoverLogFiles + uint64_t log_number_min = 0; + if (!log_readers_.empty()) { + log_number_min = log_readers_.begin()->first; + } + for (size_t i = 0; i < filenames.size(); i++) { + uint64_t number; + FileType type; + if (ParseFileName(filenames[i], &number, &type) && type == kLogFile && + number >= log_number_min) { + logs->push_back(number); + } + } + // Recover logs in the order that they were generated + if (!logs->empty()) { + std::sort(logs->begin(), logs->end()); + } return s; } -// try to find log reader using log_number from log_readers_ map, initialize -// if it doesn't exist Status DBImplSecondary::MaybeInitLogReader( uint64_t log_number, log::FragmentBufferedReader** log_reader) { auto iter = log_readers_.find(log_number); @@ -129,7 +150,8 @@ Status DBImplSecondary::MaybeInitLogReader( *log_reader = nullptr; return status; } - file_reader.reset(new SequentialFileReader(std::move(file), fname)); + file_reader.reset(new SequentialFileReader( + std::move(file), fname, immutable_db_options_.log_readahead_size)); } // Create the log reader. @@ -149,7 +171,10 @@ Status DBImplSecondary::MaybeInitLogReader( // REQUIRES: log_numbers are sorted in ascending order Status DBImplSecondary::RecoverLogFiles( const std::vector& log_numbers, SequenceNumber* next_sequence, - bool /*read_only*/) { + std::unordered_set* cfds_changed, + JobContext* job_context) { + assert(nullptr != cfds_changed); + assert(nullptr != job_context); mutex_.AssertHeld(); Status status; for (auto log_number : log_numbers) { @@ -182,9 +207,56 @@ Status DBImplSecondary::RecoverLogFiles( continue; } WriteBatchInternal::SetContents(&batch, record); - // do not check sequence number because user may toggle disableWAL - // between writes which breaks sequence number continuity guarantee - + SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch); + std::vector column_family_ids; + status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids); + if (status.ok()) { + for (const auto id : column_family_ids) { + ColumnFamilyData* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(id); + if (cfd == nullptr) { + continue; + } + if (cfds_changed->count(cfd) == 0) { + cfds_changed->insert(cfd); + } + const std::vector& l0_files = + cfd->current()->storage_info()->LevelFiles(0); + SequenceNumber seq = + l0_files.empty() ? 0 : l0_files.back()->fd.largest_seqno; + // If the write batch's sequence number is smaller than the last + // sequence number of the largest sequence persisted for this column + // family, then its data must reside in an SST that has already been + // added in the prior MANIFEST replay. 
+ if (seq_of_batch <= seq) { + continue; + } + auto curr_log_num = port::kMaxUint64; + if (cfd_to_current_log_.count(cfd) > 0) { + curr_log_num = cfd_to_current_log_[cfd]; + } + // If the active memtable contains records added by replaying an + // earlier WAL, then we need to seal the memtable, add it to the + // immutable memtable list and create a new active memtable. + if (!cfd->mem()->IsEmpty() && (curr_log_num == port::kMaxUint64 || + curr_log_num != log_number)) { + const MutableCFOptions mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); + MemTable* new_mem = + cfd->ConstructNewMemtable(mutable_cf_options, seq_of_batch); + cfd->mem()->SetNextLogNumber(log_number); + cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free); + new_mem->Ref(); + cfd->SetMemtable(new_mem); + } + } + bool has_valid_writes = false; + status = WriteBatchInternal::InsertInto( + &batch, column_family_memtables_.get(), + nullptr /* flush_scheduler */, nullptr /* trim_history_scheduler*/, + true, log_number, this, false /* concurrent_memtable_writes */, + next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_); + } // If column family was not found, it might mean that the WAL write // batch references to the column family that was dropped after the // insert. We don't want to fail the whole write batch in that case -- @@ -192,36 +264,43 @@ Status DBImplSecondary::RecoverLogFiles( // That's why we set ignore missing column families to true // passing null flush_scheduler will disable memtable flushing which is // needed for secondary instances - bool has_valid_writes = false; - status = WriteBatchInternal::InsertInto( - &batch, column_family_memtables_.get(), nullptr /* flush_scheduler */, - true, log_number, this, false /* concurrent_memtable_writes */, - next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_); - if (!status.ok()) { + if (status.ok()) { + for (const auto id : column_family_ids) { + ColumnFamilyData* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(id); + if (cfd == nullptr) { + continue; + } + std::unordered_map::iterator iter = + cfd_to_current_log_.find(cfd); + if (iter == cfd_to_current_log_.end()) { + cfd_to_current_log_.insert({cfd, log_number}); + } else if (log_number > iter->second) { + iter->second = log_number; + } + } + auto last_sequence = *next_sequence - 1; + if ((*next_sequence != kMaxSequenceNumber) && + (versions_->LastSequence() <= last_sequence)) { + versions_->SetLastAllocatedSequence(last_sequence); + versions_->SetLastPublishedSequence(last_sequence); + versions_->SetLastSequence(last_sequence); + } + } else { // We are treating this as a failure while reading since we read valid // blocks that do not form coherent data reader->GetReporter()->Corruption(record.size(), status); - continue; } } - if (!status.ok()) { return status; } - - auto last_sequence = *next_sequence - 1; - if ((*next_sequence != kMaxSequenceNumber) && - (versions_->LastSequence() <= last_sequence)) { - versions_->SetLastAllocatedSequence(last_sequence); - versions_->SetLastPublishedSequence(last_sequence); - versions_->SetLastSequence(last_sequence); - } } // remove logreaders from map after successfully recovering the WAL if (log_readers_.size() > 1) { - auto eraseIter = log_readers_.begin(); - std::advance(eraseIter, log_readers_.size() - 1); - log_readers_.erase(log_readers_.begin(), eraseIter); + auto erase_iter = log_readers_.begin(); + std::advance(erase_iter, log_readers_.size() - 1); + log_readers_.erase(log_readers_.begin(), erase_iter); } return status; 
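The replay loop above first asks CollectColumnFamilyIdsFromWriteBatch (a WriteBatch::Handler declared later in db_impl_secondary.h) which column families a WAL entry touches, and only then inserts the batch. A standalone sketch of the same handler idea, with an illustrative class name and only the Put/Delete/Merge callbacks overridden:

#include <cassert>
#include <cstdint>
#include <unordered_set>

#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/write_batch.h"

// Records the ID of every column family a write batch touches.
class ColumnFamilyIdCollector : public rocksdb::WriteBatch::Handler {
 public:
  rocksdb::Status PutCF(uint32_t cf_id, const rocksdb::Slice& /*key*/,
                        const rocksdb::Slice& /*value*/) override {
    ids_.insert(cf_id);
    return rocksdb::Status::OK();
  }
  rocksdb::Status DeleteCF(uint32_t cf_id,
                           const rocksdb::Slice& /*key*/) override {
    ids_.insert(cf_id);
    return rocksdb::Status::OK();
  }
  rocksdb::Status MergeCF(uint32_t cf_id, const rocksdb::Slice& /*key*/,
                          const rocksdb::Slice& /*value*/) override {
    ids_.insert(cf_id);
    return rocksdb::Status::OK();
  }
  const std::unordered_set<uint32_t>& ids() const { return ids_; }

 private:
  std::unordered_set<uint32_t> ids_;
};

int main() {
  rocksdb::WriteBatch batch;
  batch.Put("k1", "v1");  // default column family, ID 0
  batch.Delete("k2");

  ColumnFamilyIdCollector collector;
  rocksdb::Status s = batch.Iterate(&collector);
  assert(s.ok() && collector.ids().count(0) == 1);
  return 0;
}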
} @@ -373,21 +452,89 @@ Status DBImplSecondary::NewIterators( return Status::OK(); } +Status DBImplSecondary::CheckConsistency() { + mutex_.AssertHeld(); + Status s = DBImpl::CheckConsistency(); + // If DBImpl::CheckConsistency() which is stricter returns success, then we + // do not need to give a second chance. + if (s.ok()) { + return s; + } + // It's possible that DBImpl::CheckConssitency() can fail because the primary + // may have removed certain files, causing the GetFileSize(name) call to + // fail and returning a PathNotFound. In this case, we take a best-effort + // approach and just proceed. + TEST_SYNC_POINT_CALLBACK( + "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s); + std::vector metadata; + versions_->GetLiveFilesMetaData(&metadata); + + std::string corruption_messages; + for (const auto& md : metadata) { + // md.name has a leading "/". + std::string file_path = md.db_path + md.name; + + uint64_t fsize = 0; + s = env_->GetFileSize(file_path, &fsize); + if (!s.ok() && + (env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok() || + s.IsPathNotFound())) { + s = Status::OK(); + } + if (!s.ok()) { + corruption_messages += + "Can't access " + md.name + ": " + s.ToString() + "\n"; + } + } + return corruption_messages.empty() ? Status::OK() + : Status::Corruption(corruption_messages); +} + Status DBImplSecondary::TryCatchUpWithPrimary() { assert(versions_.get() != nullptr); assert(manifest_reader_.get() != nullptr); Status s; + // read the manifest and apply new changes to the secondary instance std::unordered_set cfds_changed; + JobContext job_context(0, true /*create_superversion*/); InstrumentedMutexLock lock_guard(&mutex_); s = static_cast(versions_.get()) ->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed); + + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64, + static_cast(versions_->LastSequence())); + for (ColumnFamilyData* cfd : cfds_changed) { + if (cfd->IsDropped()) { + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n", + cfd->GetName().c_str()); + continue; + } + VersionStorageInfo::LevelSummaryStorage tmp; + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Level summary: %s\n", + cfd->GetName().c_str(), + cfd->current()->storage_info()->LevelSummary(&tmp)); + } + + // list wal_dir to discover new WALs and apply new changes to the secondary + // instance + if (s.ok()) { + s = FindAndRecoverLogFiles(&cfds_changed, &job_context); + } + if (s.IsPathNotFound()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Secondary tries to read WAL, but WAL file(s) have already " + "been purged by primary."); + s = Status::OK(); + } if (s.ok()) { - SuperVersionContext sv_context(true /* create_superversion */); for (auto cfd : cfds_changed) { - sv_context.NewSuperVersion(); + cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(), + &job_context.memtables_to_free); + auto& sv_context = job_context.superversion_contexts.back(); cfd->InstallSuperVersion(&sv_context, &mutex_); + sv_context.NewSuperVersion(); } - sv_context.Clean(); + job_context.Clean(); } return s; } @@ -425,39 +572,14 @@ Status DB::OpenAsSecondary( } DBOptions tmp_opts(db_options); + Status s; if (nullptr == tmp_opts.info_log) { - Env* env = tmp_opts.env; - assert(env != nullptr); - std::string secondary_abs_path; - env->GetAbsolutePath(secondary_path, &secondary_abs_path); - std::string fname = InfoLogFileName(secondary_path, secondary_abs_path, - tmp_opts.db_log_dir); - - env->CreateDirIfMissing(secondary_path); - if 
(tmp_opts.log_file_time_to_roll > 0 || tmp_opts.max_log_file_size > 0) { - AutoRollLogger* result = new AutoRollLogger( - env, secondary_path, tmp_opts.db_log_dir, tmp_opts.max_log_file_size, - tmp_opts.log_file_time_to_roll, tmp_opts.info_log_level); - Status s = result->GetStatus(); - if (!s.ok()) { - delete result; - } else { - tmp_opts.info_log.reset(result); - } - } - if (nullptr == tmp_opts.info_log) { - env->RenameFile( - fname, OldInfoLogFileName(secondary_path, env->NowMicros(), - secondary_abs_path, tmp_opts.db_log_dir)); - Status s = env->NewLogger(fname, &(tmp_opts.info_log)); - if (tmp_opts.info_log != nullptr) { - tmp_opts.info_log->SetInfoLogLevel(tmp_opts.info_log_level); - } + s = CreateLoggerFromOptions(secondary_path, tmp_opts, &tmp_opts.info_log); + if (!s.ok()) { + tmp_opts.info_log = nullptr; } } - assert(tmp_opts.info_log != nullptr); - handles->clear(); DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname); impl->versions_.reset(new ReactiveVersionSet( @@ -466,8 +588,10 @@ Status DB::OpenAsSecondary( &impl->write_controller_)); impl->column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet())); + impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_); + impl->mutex_.Lock(); - Status s = impl->Recover(column_families, true, false, false); + s = impl->Recover(column_families, true, false, false); if (s.ok()) { for (auto cf : column_families) { auto cfd = diff --git a/db/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h similarity index 58% rename from db/db_impl_secondary.h rename to db/db_impl/db_impl_secondary.h index 64c81432848..ca853e25802 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl/db_impl_secondary.h @@ -9,10 +9,11 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { +// A wrapper class to hold log reader, log reporter, log status. class LogReaderContainer { public: LogReaderContainer() @@ -62,11 +63,19 @@ class LogReaderContainer { }; }; +// The secondary instance shares access to the storage as the primary. +// The secondary is able to read and replay changes described in both the +// MANIFEST and the WAL files without coordination with the primary. +// The secondary instance can be opened using `DB::OpenAsSecondary`. After +// that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make best +// effort attempts to catch up with the primary. class DBImplSecondary : public DBImpl { public: DBImplSecondary(const DBOptions& options, const std::string& dbname); ~DBImplSecondary() override; + // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_ + // and log_readers_ to facilitate future operations. 
Status Recover(const std::vector& column_families, bool read_only, bool error_if_log_file_exist, bool error_if_data_exists_in_logs) override; @@ -96,40 +105,40 @@ class DBImplSecondary : public DBImpl { Status Put(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, const Slice& /*value*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::Merge; Status Merge(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, const Slice& /*value*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::Delete; Status Delete(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::SingleDelete; Status SingleDelete(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } Status Write(const WriteOptions& /*options*/, WriteBatch* /*updates*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::CompactRange; Status CompactRange(const CompactRangeOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice* /*begin*/, const Slice* /*end*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::CompactFiles; @@ -140,32 +149,32 @@ class DBImplSecondary : public DBImpl { const int /*output_level*/, const int /*output_path_id*/ = -1, std::vector* const /*output_file_names*/ = nullptr, CompactionJobInfo* /*compaction_job_info*/ = nullptr) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } Status DisableFileDeletions() override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } Status EnableFileDeletions(bool /*force*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } Status GetLiveFiles(std::vector&, uint64_t* /*manifest_file_size*/, bool /*flush_memtable*/ = true) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::Flush; Status Flush(const FlushOptions& /*options*/, ColumnFamilyHandle* /*column_family*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::SyncWAL; Status SyncWAL() override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported 
operation in secondary mode."); } using DB::IngestExternalFile; @@ -173,7 +182,7 @@ class DBImplSecondary : public DBImpl { ColumnFamilyHandle* /*column_family*/, const std::vector& /*external_files*/, const IngestExternalFileOptions& /*ingestion_options*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } // Try to catch up with the primary by reading as much as possible from the @@ -182,9 +191,84 @@ class DBImplSecondary : public DBImpl { // method can take long time due to all the I/O and CPU costs. Status TryCatchUpWithPrimary() override; + + // Try to find log reader using log_number from log_readers_ map, initialize + // if it doesn't exist Status MaybeInitLogReader(uint64_t log_number, log::FragmentBufferedReader** log_reader); + // Check if all live files exist on file system and that their file sizes + // matche to the in-memory records. It is possible that some live files may + // have been deleted by the primary. In this case, CheckConsistency() does + // not flag the missing file as inconsistency. + Status CheckConsistency() override; + + protected: + // ColumnFamilyCollector is a write batch handler which does nothing + // except recording unique column family IDs + class ColumnFamilyCollector : public WriteBatch::Handler { + std::unordered_set column_family_ids_; + + Status AddColumnFamilyId(uint32_t column_family_id) { + if (column_family_ids_.find(column_family_id) == + column_family_ids_.end()) { + column_family_ids_.insert(column_family_id); + } + return Status::OK(); + } + + public: + explicit ColumnFamilyCollector() {} + + ~ColumnFamilyCollector() override {} + + Status PutCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status DeleteCF(uint32_t column_family_id, const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status SingleDeleteCF(uint32_t column_family_id, const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status DeleteRangeCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status MergeCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status PutBlobIndexCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + const std::unordered_set& column_families() const { + return column_family_ids_; + } + }; + + Status CollectColumnFamilyIdsFromWriteBatch( + const WriteBatch& batch, std::vector* column_family_ids) { + assert(column_family_ids != nullptr); + column_family_ids->clear(); + ColumnFamilyCollector handler; + Status s = batch.Iterate(&handler); + if (s.ok()) { + for (const auto& cf : handler.column_families()) { + column_family_ids->push_back(cf); + } + } + return s; + } + private: friend class DB; @@ -194,17 +278,27 @@ class DBImplSecondary : public DBImpl { using DBImpl::Recover; + Status FindAndRecoverLogFiles( + std::unordered_set* cfds_changed, + JobContext* job_context); + Status FindNewLogNumbers(std::vector* logs); + // After manifest recovery, replay WALs and refresh log_readers_ if necessary + // REQUIRES: log_numbers are sorted in ascending order Status RecoverLogFiles(const std::vector& log_numbers, SequenceNumber* next_sequence, - bool read_only) override; + std::unordered_set* cfds_changed, + 
JobContext* job_context); std::unique_ptr manifest_reader_; std::unique_ptr manifest_reporter_; std::unique_ptr manifest_reader_status_; - // cache log readers for each log number, used for continue WAL replay + // Cache log readers for each log number, used for continue WAL replay // after recovery std::map> log_readers_; + + // Current WAL number replayed for each column family. + std::unordered_map cfd_to_current_log_; }; } // namespace rocksdb diff --git a/db/db_impl_write.cc b/db/db_impl/db_impl_write.cc similarity index 80% rename from db/db_impl_write.cc rename to db/db_impl/db_impl_write.cc index 3edec9ac521..34193cabc95 100644 --- a/db/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -6,17 +6,14 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include "db/error_handler.h" #include "db/event_helpers.h" #include "monitoring/perf_context_imp.h" #include "options/options_helper.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { // Convenience methods @@ -94,6 +91,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, return Status::NotSupported( "pipelined_writes is not compatible with seq_per_batch"); } + if (immutable_db_options_.unordered_write && + immutable_db_options_.enable_pipelined_write) { + return Status::NotSupported( + "pipelined_writes is not compatible with unordered_write"); + } // Otherwise IsLatestPersistentState optimization does not make sense assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) || disable_memtable); @@ -107,8 +109,39 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } if (two_write_queues_ && disable_memtable) { - return WriteImplWALOnly(write_options, my_batch, callback, log_used, - log_ref, seq_used, batch_cnt, pre_release_callback); + AssignOrder assign_order = + seq_per_batch_ ? kDoAssignOrder : kDontAssignOrder; + // Otherwise it is WAL-only Prepare batches in WriteCommitted policy and + // they don't consume sequence. + return WriteImplWALOnly(&nonmem_write_thread_, write_options, my_batch, + callback, log_used, log_ref, seq_used, batch_cnt, + pre_release_callback, assign_order, + kDontPublishLastSeq, disable_memtable); + } + + if (immutable_db_options_.unordered_write) { + const size_t sub_batch_cnt = batch_cnt != 0 + ? 
batch_cnt + // every key is a sub-batch consuming a seq + : WriteBatchInternal::Count(my_batch); + uint64_t seq; + // Use a write thread to i) optimize for WAL write, ii) publish last + // sequence in in increasing order, iii) call pre_release_callback serially + status = WriteImplWALOnly(&write_thread_, write_options, my_batch, callback, + log_used, log_ref, &seq, sub_batch_cnt, + pre_release_callback, kDoAssignOrder, + kDoPublishLastSeq, disable_memtable); + if (!status.ok()) { + return status; + } + if (seq_used) { + *seq_used = seq; + } + if (!disable_memtable) { + status = UnorderedWriteMemtable(write_options, my_batch, callback, + log_ref, seq, sub_batch_cnt); + } + return status; } if (immutable_db_options_.enable_pipelined_write) { @@ -138,8 +171,10 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, versions_->GetColumnFamilySet()); w.status = WriteBatchInternal::InsertInto( &w, w.sequence, &column_family_memtables, &flush_scheduler_, + &trim_history_scheduler_, write_options.ignore_missing_column_families, 0 /*log_number*/, this, - true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt); + true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt, + batch_per_txn_, write_options.memtable_insert_hint_per_batch); PERF_TIMER_START(write_pre_and_post_process_time); } @@ -178,9 +213,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, WriteThread::WriteGroup write_group; bool in_parallel_group = false; uint64_t last_sequence = kMaxSequenceNumber; - if (!two_write_queues_) { - last_sequence = versions_->LastSequence(); - } mutex_.Lock(); @@ -195,6 +227,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, PERF_TIMER_STOP(write_pre_and_post_process_time); status = PreprocessWrite(write_options, &need_log_sync, &write_context); + if (!two_write_queues_) { + // Assign it after ::PreprocessWrite since the sequence might advance + // inside it by WriteRecoverableState + last_sequence = versions_->LastSequence(); + } PERF_TIMER_START(write_pre_and_post_process_time); } @@ -228,6 +265,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, size_t total_count = 0; size_t valid_batches = 0; size_t total_byte_size = 0; + size_t pre_release_callback_cnt = 0; for (auto* writer : write_group) { if (writer->CheckCallback(this)) { valid_batches += writer->batch_cnt; @@ -235,9 +273,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, total_count += WriteBatchInternal::Count(writer->batch); parallel = parallel && !writer->batch->HasMerge(); } - total_byte_size = WriteBatchInternal::AppendedByteSize( total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); + if (writer->pre_release_callback) { + pre_release_callback_cnt++; + } } } // Note about seq_per_batch_: either disableWAL is set for the entire write @@ -254,18 +294,19 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // We're optimistic, updating the stats before we successfully // commit. That lets us release our leader status early. 
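The new unordered_write path above splits a write into a WAL-only pass through the write thread (WriteImplWALOnly with kDoAssignOrder/kDoPublishLastSeq) followed by an out-of-order memtable insert (UnorderedWriteMemtable). Per the ValidateOptions changes earlier in this diff, the option requires allow_concurrent_memtable_write and excludes enable_pipelined_write. A minimal sketch of enabling it, with a placeholder path; the comment on semantics is a summary, not wording from this diff:

#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Trades the ordering/snapshot guarantee between concurrent writers for
  // higher throughput; higher-level commit protocols (e.g. WritePrepared
  // transactions) can restore the guarantee if needed.
  options.unordered_write = true;
  // Required combination per ValidateOptions: concurrent memtable writes on
  // (the default) and pipelined writes off (also the default).
  options.allow_concurrent_memtable_write = true;
  options.enable_pipelined_write = false;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/rocksdb_unordered", &db);
  assert(s.ok());

  s = db->Put(rocksdb::WriteOptions(), "key", "value");
  assert(s.ok());

  delete db;
  return 0;
}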
auto stats = default_cf_internal_stats_; - stats->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN, total_count, + stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count, concurrent_update); RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); - stats->AddDBStats(InternalStats::BYTES_WRITTEN, total_byte_size, + stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size, concurrent_update); RecordTick(stats_, BYTES_WRITTEN, total_byte_size); - stats->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1, concurrent_update); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1, + concurrent_update); RecordTick(stats_, WRITE_DONE_BY_SELF); auto write_done_by_other = write_group.size - 1; if (write_done_by_other > 0) { - stats->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER, write_done_by_other, - concurrent_update); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther, + write_done_by_other, concurrent_update); RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other); } RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size); @@ -301,6 +342,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // PreReleaseCallback is called after WAL write and before memtable write if (status.ok()) { SequenceNumber next_sequence = current_sequence; + size_t index = 0; // Note: the logic for advancing seq here must be consistent with the // logic in WriteBatchInternal::InsertInto(write_group...) as well as // with WriteBatchInternal::InsertInto(write_batch...) that is called on @@ -312,7 +354,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, writer->sequence = next_sequence; if (writer->pre_release_callback) { Status ws = writer->pre_release_callback->Callback( - writer->sequence, disable_memtable, writer->log_used); + writer->sequence, disable_memtable, writer->log_used, index++, + pre_release_callback_cnt); if (!ws.ok()) { status = ws; break; @@ -334,7 +377,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // w.sequence will be set inside InsertInto w.status = WriteBatchInternal::InsertInto( write_group, current_sequence, column_family_memtables_.get(), - &flush_scheduler_, write_options.ignore_missing_column_families, + &flush_scheduler_, &trim_history_scheduler_, + write_options.ignore_missing_column_families, 0 /*recovery_log_number*/, this, parallel, seq_per_batch_, batch_per_txn_); } else { @@ -350,9 +394,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, assert(w.sequence == current_sequence); w.status = WriteBatchInternal::InsertInto( &w, w.sequence, &column_family_memtables, &flush_scheduler_, + &trim_history_scheduler_, write_options.ignore_missing_column_families, 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, seq_per_batch_, - w.batch_cnt, batch_per_txn_); + w.batch_cnt, batch_per_txn_, + write_options.memtable_insert_hint_per_batch); } } if (seq_used != nullptr) { @@ -459,9 +505,9 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, } auto stats = default_cf_internal_stats_; - stats->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN, total_count); + stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count); RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); - stats->AddDBStats(InternalStats::BYTES_WRITTEN, total_byte_size); + stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size); RecordTick(stats_, BYTES_WRITTEN, total_byte_size); RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size); @@ -469,10 +515,10 @@ Status 
DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, if (w.status.ok() && !write_options.disableWAL) { PERF_TIMER_GUARD(write_wal_time); - stats->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1); RecordTick(stats_, WRITE_DONE_BY_SELF, 1); if (wal_write_group.size > 1) { - stats->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER, + stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther, wal_write_group.size - 1); RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1); } @@ -504,9 +550,9 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, } else { memtable_write_group.status = WriteBatchInternal::InsertInto( memtable_write_group, w.sequence, column_family_memtables_.get(), - &flush_scheduler_, write_options.ignore_missing_column_families, - 0 /*log_number*/, this, false /*concurrent_memtable_writes*/, - seq_per_batch_, batch_per_txn_); + &flush_scheduler_, &trim_history_scheduler_, + write_options.ignore_missing_column_families, 0 /*log_number*/, this, + false /*concurrent_memtable_writes*/, seq_per_batch_, batch_per_txn_); versions_->SetLastSequence(memtable_write_group.last_sequence); write_thread_.ExitAsMemTableWriter(&w, memtable_write_group); } @@ -518,8 +564,10 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, versions_->GetColumnFamilySet()); w.status = WriteBatchInternal::InsertInto( &w, w.sequence, &column_family_memtables, &flush_scheduler_, - write_options.ignore_missing_column_families, 0 /*log_number*/, this, - true /*concurrent_memtable_writes*/); + &trim_history_scheduler_, write_options.ignore_missing_column_families, + 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, + false /*seq_per_batch*/, 0 /*batch_cnt*/, true /*batch_per_txn*/, + write_options.memtable_insert_hint_per_batch); if (write_thread_.CompleteParallelMemTableWriter(&w)) { MemTableInsertStatusCheck(w.status); versions_->SetLastSequence(w.write_group->last_sequence); @@ -534,23 +582,72 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, return w.FinalStatus(); } +Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options, + WriteBatch* my_batch, + WriteCallback* callback, uint64_t log_ref, + SequenceNumber seq, + const size_t sub_batch_cnt) { + PERF_TIMER_GUARD(write_pre_and_post_process_time); + StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + + WriteThread::Writer w(write_options, my_batch, callback, log_ref, + false /*disable_memtable*/); + + if (w.CheckCallback(this) && w.ShouldWriteToMemtable()) { + w.sequence = seq; + size_t total_count = WriteBatchInternal::Count(my_batch); + InternalStats* stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count); + RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); + + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + w.status = WriteBatchInternal::InsertInto( + &w, w.sequence, &column_family_memtables, &flush_scheduler_, + &trim_history_scheduler_, write_options.ignore_missing_column_families, + 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, + seq_per_batch_, sub_batch_cnt, true /*batch_per_txn*/, + write_options.memtable_insert_hint_per_batch); + + WriteStatusCheck(w.status); + if (write_options.disableWAL) { + has_unpersisted_data_.store(true, std::memory_order_relaxed); + } + } + + size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1; + if (pending_cnt 
== 0) { + // switch_cv_ waits until pending_memtable_writes_ = 0. Locking its mutex + // before notify ensures that cv is in waiting state when it is notified + // thus not missing the update to pending_memtable_writes_ even though it is + // not modified under the mutex. + std::lock_guard lck(switch_mutex_); + switch_cv_.notify_all(); + } + + if (!w.FinalStatus().ok()) { + return w.FinalStatus(); + } + return Status::OK(); +} + // The 2nd write queue. If enabled it will be used only for WAL-only writes. // This is the only queue that updates LastPublishedSequence which is only // applicable in a two-queue setting. -Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, - WriteBatch* my_batch, WriteCallback* callback, - uint64_t* log_used, uint64_t log_ref, - uint64_t* seq_used, size_t batch_cnt, - PreReleaseCallback* pre_release_callback) { +Status DBImpl::WriteImplWALOnly( + WriteThread* write_thread, const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used, + const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt, + PreReleaseCallback* pre_release_callback, const AssignOrder assign_order, + const PublishLastSeq publish_last_seq, const bool disable_memtable) { Status status; PERF_TIMER_GUARD(write_pre_and_post_process_time); WriteThread::Writer w(write_options, my_batch, callback, log_ref, - true /* disable_memtable */, batch_cnt, - pre_release_callback); + disable_memtable, sub_batch_cnt, pre_release_callback); RecordTick(stats_, WRITE_WITH_WAL); StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); - nonmem_write_thread_.JoinBatchGroup(&w); + write_thread->JoinBatchGroup(&w); assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER); if (w.state == WriteThread::STATE_COMPLETED) { if (log_used != nullptr) { @@ -563,17 +660,45 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, } // else we are the leader of the write batch group assert(w.state == WriteThread::STATE_GROUP_LEADER); + + if (publish_last_seq == kDoPublishLastSeq) { + // Currently we only use kDoPublishLastSeq in unordered_write + assert(immutable_db_options_.unordered_write); + WriteContext write_context; + if (error_handler_.IsDBStopped()) { + status = error_handler_.GetBGError(); + } + // TODO(myabandeh): Make preliminary checks thread-safe so we could do them + // without paying the cost of obtaining the mutex. 
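UnorderedWriteMemtable above decrements pending_memtable_writes_ and, when the count reaches zero, takes switch_mutex_ before notifying switch_cv_ so a waiter that rechecks the counter under the mutex cannot sleep through the update. A generic sketch of that lock-before-notify pattern with standard-library types (all names here are illustrative, not the RocksDB members):

#include <atomic>
#include <condition_variable>
#include <mutex>

std::atomic<size_t> pending_writes{0};
std::mutex switch_mutex;
std::condition_variable switch_cv;

// Writer side: finish one write; if we were the last one, wake the waiter.
void FinishOneWrite() {
  size_t pending = pending_writes.fetch_sub(1) - 1;
  if (pending == 0) {
    // Taking the mutex before notify ensures the waiter is either not yet
    // checking the predicate or already blocked in wait(), so the update to
    // pending_writes cannot slip between its check and its sleep.
    std::lock_guard<std::mutex> lock(switch_mutex);
    switch_cv.notify_all();
  }
}

// Waiter side (e.g. a memtable switch): block until no write is in flight.
void WaitForPendingWrites() {
  std::unique_lock<std::mutex> lock(switch_mutex);
  switch_cv.wait(lock, [] { return pending_writes.load() == 0; });
}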
+ if (status.ok()) { + InstrumentedMutexLock l(&mutex_); + bool need_log_sync = false; + status = PreprocessWrite(write_options, &need_log_sync, &write_context); + WriteStatusCheck(status); + } + if (!status.ok()) { + WriteThread::WriteGroup write_group; + write_thread->EnterAsBatchGroupLeader(&w, &write_group); + write_thread->ExitAsBatchGroupLeader(write_group, status); + return status; + } + } + WriteThread::WriteGroup write_group; uint64_t last_sequence; - nonmem_write_thread_.EnterAsBatchGroupLeader(&w, &write_group); + write_thread->EnterAsBatchGroupLeader(&w, &write_group); // Note: no need to update last_batch_group_size_ here since the batch writes // to WAL only + size_t pre_release_callback_cnt = 0; size_t total_byte_size = 0; for (auto* writer : write_group) { if (writer->CheckCallback(this)) { total_byte_size = WriteBatchInternal::AppendedByteSize( total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); + if (writer->pre_release_callback) { + pre_release_callback_cnt++; + } } } @@ -583,15 +708,16 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, // We're optimistic, updating the stats before we successfully // commit. That lets us release our leader status early. auto stats = default_cf_internal_stats_; - stats->AddDBStats(InternalStats::BYTES_WRITTEN, total_byte_size, + stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size, concurrent_update); RecordTick(stats_, BYTES_WRITTEN, total_byte_size); - stats->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1, concurrent_update); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1, + concurrent_update); RecordTick(stats_, WRITE_DONE_BY_SELF); auto write_done_by_other = write_group.size - 1; if (write_done_by_other > 0) { - stats->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER, write_done_by_other, - concurrent_update); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther, + write_done_by_other, concurrent_update); RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other); } RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size); @@ -602,11 +728,13 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, // LastAllocatedSequence is increased inside WriteToWAL under // wal_write_mutex_ to ensure ordered events in WAL size_t seq_inc = 0 /* total_count */; - if (seq_per_batch_) { + if (assign_order == kDoAssignOrder) { size_t total_batch_cnt = 0; for (auto* writer : write_group) { - assert(writer->batch_cnt); - total_batch_cnt += writer->batch_cnt; + assert(writer->batch_cnt || !seq_per_batch_); + if (!writer->CallbackFailed()) { + total_batch_cnt += writer->batch_cnt; + } } seq_inc = total_batch_cnt; } @@ -617,16 +745,21 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, // Otherwise we inc seq number to do solely the seq allocation last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); } + + size_t memtable_write_cnt = 0; auto curr_seq = last_sequence + 1; for (auto* writer : write_group) { if (writer->CallbackFailed()) { continue; } writer->sequence = curr_seq; - if (seq_per_batch_) { - assert(writer->batch_cnt); + if (assign_order == kDoAssignOrder) { + assert(writer->batch_cnt || !seq_per_batch_); curr_seq += writer->batch_cnt; } + if (!writer->disable_memtable) { + memtable_write_cnt++; + } // else seq advances only by memtable writes } if (status.ok() && write_options.sync) { @@ -645,12 +778,13 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, WriteStatusCheck(status); } if (status.ok()) { + size_t 
index = 0; for (auto* writer : write_group) { if (!writer->CallbackFailed() && writer->pre_release_callback) { assert(writer->sequence != kMaxSequenceNumber); - const bool DISABLE_MEMTABLE = true; Status ws = writer->pre_release_callback->Callback( - writer->sequence, DISABLE_MEMTABLE, writer->log_used); + writer->sequence, disable_memtable, writer->log_used, index++, + pre_release_callback_cnt); if (!ws.ok()) { status = ws; break; @@ -658,7 +792,15 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, } } } - nonmem_write_thread_.ExitAsBatchGroupLeader(write_group, status); + if (publish_last_seq == kDoPublishLastSeq) { + versions_->SetLastSequence(last_sequence + seq_inc); + // Currently we only use kDoPublishLastSeq in unordered_write + assert(immutable_db_options_.unordered_write); + } + if (immutable_db_options_.unordered_write && status.ok()) { + pending_memtable_writes_ += memtable_write_cnt; + } + write_thread->ExitAsBatchGroupLeader(write_group, status); if (status.ok()) { status = w.FinalStatus(); } @@ -710,6 +852,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1); if (UNLIKELY(status.ok() && !single_column_family_mode_ && total_log_size_ > GetMaxTotalWalSize())) { + WaitForPendingWrites(); status = SwitchWAL(write_context); } @@ -719,10 +862,16 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, // thread is writing to another DB with the same write buffer, they may also // be flushed. We may end up with flushing much more DBs than needed. It's // suboptimal but still correct. + WaitForPendingWrites(); status = HandleWriteBufferFull(write_context); } + if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) { + status = TrimMemtableHistory(write_context); + } + if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) { + WaitForPendingWrites(); status = ScheduleFlushes(write_context); } @@ -900,12 +1049,12 @@ Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, if (status.ok()) { auto stats = default_cf_internal_stats_; if (need_log_sync) { - stats->AddDBStats(InternalStats::WAL_FILE_SYNCED, 1); + stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1); RecordTick(stats_, WAL_FILE_SYNCED); } - stats->AddDBStats(InternalStats::WAL_FILE_BYTES, log_size); + stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size); RecordTick(stats_, WAL_FILE_BYTES, log_size); - stats->AddDBStats(InternalStats::WRITE_WITH_WAL, write_with_wal); + stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal); RecordTick(stats_, WRITE_WITH_WAL, write_with_wal); } return status; @@ -951,9 +1100,10 @@ Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group, if (status.ok()) { const bool concurrent = true; auto stats = default_cf_internal_stats_; - stats->AddDBStats(InternalStats::WAL_FILE_BYTES, log_size, concurrent); + stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size, + concurrent); RecordTick(stats_, WAL_FILE_BYTES, log_size); - stats->AddDBStats(InternalStats::WRITE_WITH_WAL, write_with_wal, + stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal, concurrent); RecordTick(stats_, WRITE_WITH_WAL, write_with_wal); } @@ -977,9 +1127,9 @@ Status DBImpl::WriteRecoverableState() { WriteBatchInternal::SetSequence(&cached_recoverable_state_, seq + 1); auto status = WriteBatchInternal::InsertInto( &cached_recoverable_state_, column_family_memtables_.get(), - &flush_scheduler_, true, 0 
/*recovery_log_number*/, this, - false /* concurrent_memtable_writes */, &next_seq, &dont_care_bool, - seq_per_batch_); + &flush_scheduler_, &trim_history_scheduler_, true, + 0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */, + &next_seq, &dont_care_bool, seq_per_batch_); auto last_seq = next_seq - 1; if (two_write_queues_) { versions_->FetchAddLastAllocatedSequence(last_seq - seq); @@ -994,8 +1144,12 @@ Status DBImpl::WriteRecoverableState() { for (uint64_t sub_batch_seq = seq + 1; sub_batch_seq < next_seq && status.ok(); sub_batch_seq++) { uint64_t const no_log_num = 0; + // Unlock it since the callback might end up locking mutex. e.g., + // AddCommitted -> AdvanceMaxEvictedSeq -> GetSnapshotListFromDB + mutex_.Unlock(); status = recoverable_state_pre_release_callback_->Callback( - sub_batch_seq, !DISABLE_MEMTABLE, no_log_num); + sub_batch_seq, !DISABLE_MEMTABLE, no_log_num, 0, 1); + mutex_.Lock(); } } if (status.ok()) { @@ -1090,7 +1244,13 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) { cfds.push_back(cfd); } } + MaybeFlushStatsCF(&cfds); + } + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); } + for (const auto cfd : cfds) { cfd->Ref(); status = SwitchMemtable(cfd, write_context); @@ -1099,6 +1259,10 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) { break; } } + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + if (status.ok()) { if (immutable_db_options_.atomic_flush) { AssignAtomicFlushSeq(cfds); @@ -1156,8 +1320,13 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { if (cfd_picked != nullptr) { cfds.push_back(cfd_picked); } + MaybeFlushStatsCF(&cfds); } + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } for (const auto cfd : cfds) { if (cfd->mem()->IsEmpty()) { continue; @@ -1169,6 +1338,10 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { break; } } + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + if (status.ok()) { if (immutable_db_options_.atomic_flush) { AssignAtomicFlushSeq(cfds); @@ -1249,8 +1422,8 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, } assert(!delayed || !write_options.no_slowdown); if (delayed) { - default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_STALL_MICROS, - time_delayed); + default_cf_internal_stats_->AddDBStats( + InternalStats::kIntStatsWriteStallMicros, time_delayed); RecordTick(stats_, STALL_MICROS, time_delayed); } @@ -1299,6 +1472,65 @@ Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, return Status::OK(); } +void DBImpl::MaybeFlushStatsCF(autovector* cfds) { + assert(cfds != nullptr); + if (!cfds->empty() && immutable_db_options_.persist_stats_to_disk) { + ColumnFamilyData* cfd_stats = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + if (cfd_stats != nullptr && !cfd_stats->mem()->IsEmpty()) { + for (ColumnFamilyData* cfd : *cfds) { + if (cfd == cfd_stats) { + // stats CF already included in cfds + return; + } + } + // force flush stats CF when its log number is less than all other CF's + // log numbers + bool force_flush_stats_cf = true; + for (auto* loop_cfd : *versions_->GetColumnFamilySet()) { + if (loop_cfd == cfd_stats) { + continue; + } + if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) { + force_flush_stats_cf = false; + } + } + if (force_flush_stats_cf) { + 
cfds->push_back(cfd_stats); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Force flushing stats CF with automated flush " + "to avoid holding old logs"); + } + } + } +} + +Status DBImpl::TrimMemtableHistory(WriteContext* context) { + autovector cfds; + ColumnFamilyData* tmp_cfd; + while ((tmp_cfd = trim_history_scheduler_.TakeNextColumnFamily()) != + nullptr) { + cfds.push_back(tmp_cfd); + } + for (auto& cfd : cfds) { + autovector to_delete; + cfd->imm()->TrimHistory(&to_delete, cfd->mem()->ApproximateMemoryUsage()); + for (auto m : to_delete) { + delete m; + } + context->superversion_context.NewSuperVersion(); + assert(context->superversion_context.new_superversion.get() != nullptr); + cfd->InstallSuperVersion(&context->superversion_context, &mutex_); + + if (cfd->Unref()) { + delete cfd; + cfd = nullptr; + } + } + return Status::OK(); +} + Status DBImpl::ScheduleFlushes(WriteContext* context) { autovector cfds; if (immutable_db_options_.atomic_flush) { @@ -1312,8 +1544,14 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) { while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { cfds.push_back(tmp_cfd); } + MaybeFlushStatsCF(&cfds); } Status status; + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + for (auto& cfd : cfds) { if (!cfd->mem()->IsEmpty()) { status = SwitchMemtable(cfd, context); @@ -1326,6 +1564,11 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) { break; } } + + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + if (status.ok()) { if (immutable_db_options_.atomic_flush) { AssignAtomicFlushSeq(cfds); @@ -1356,15 +1599,11 @@ void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/, // REQUIRES: mutex_ is held // REQUIRES: this thread is currently at the front of the writer queue +// REQUIRES: this thread is currently at the front of the 2nd writer queue if +// two_write_queues_ is true (This is to simplify the reasoning.) Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { mutex_.AssertHeld(); WriteThread::Writer nonmem_w; - if (two_write_queues_) { - // SwitchMemtable is a rare event. To simply the reasoning, we make sure - // that there is no concurrent thread writing to WAL. - nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); - } - std::unique_ptr lfile; log::Writer* new_log = nullptr; MemTable* new_mem = nullptr; @@ -1376,16 +1615,6 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { return s; } - // In case of pipelined write is enabled, wait for all pending memtable - // writers. - if (immutable_db_options_.enable_pipelined_write) { - // Memtable writers may call DB::Get in case max_successive_merges > 0, - // which may lock mutex. Unlocking mutex here to avoid deadlock. - mutex_.Unlock(); - write_thread_.WaitForMemTableWriters(); - mutex_.Lock(); - } - // Attempt to switch to a new memtable and trigger flush of old. // Do this without holding the dbmutex lock. 
assert(versions_->prev_log_number() == 0); @@ -1481,10 +1710,6 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable); // Read back bg_error in order to get the right severity s = error_handler_.GetBGError(); - - if (two_write_queues_) { - nonmem_write_thread_.ExitUnbatched(&nonmem_w); - } return s; } @@ -1515,9 +1740,6 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { NotifyOnMemTableSealed(cfd, memtable_info); mutex_.Lock(); #endif // ROCKSDB_LITE - if (two_write_queues_) { - nonmem_write_thread_.ExitUnbatched(&nonmem_w); - } return s; } @@ -1547,14 +1769,30 @@ size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const { // can call if they wish Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { - // Pre-allocate size of write batch conservatively. - // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, - // and we allocate 11 extra bytes for key length, as well as value length. - WriteBatch batch(key.size() + value.size() + 24); + if (nullptr == opt.timestamp) { + // Pre-allocate size of write batch conservatively. + // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, + // and we allocate 11 extra bytes for key length, as well as value length. + WriteBatch batch(key.size() + value.size() + 24); + Status s = batch.Put(column_family, key, value); + if (!s.ok()) { + return s; + } + return Write(opt, &batch); + } + const Slice* ts = opt.timestamp; + assert(nullptr != ts); + size_t ts_sz = ts->size(); + WriteBatch batch(key.size() + ts_sz + value.size() + 24, /*max_bytes=*/0, + ts_sz); Status s = batch.Put(column_family, key, value); if (!s.ok()) { return s; } + s = batch.AssignTimestamp(*ts); + if (!s.ok()) { + return s; + } return Write(opt, &batch); } diff --git a/db/db_secondary_test.cc b/db/db_impl/db_secondary_test.cc similarity index 65% rename from db/db_secondary_test.cc rename to db/db_impl/db_secondary_test.cc index 47daf9fd8cc..6caff005eb4 100644 --- a/db/db_secondary_test.cc +++ b/db/db_impl/db_secondary_test.cc @@ -7,11 +7,11 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
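The DB::Put change at the end of the db_impl_write.cc diff above sizes the WriteBatch for an extra timestamp and stamps the batch via AssignTimestamp before calling Write. A hedged sketch of how a caller might drive that path through WriteOptions::timestamp follows; the helper name is hypothetical, and it assumes the column family was opened with a comparator set up for fixed-size user timestamps, which was an experimental feature at the time.

#include <string>
#include "rocksdb/db.h"

// Hypothetical helper, not part of the patch.
rocksdb::Status PutWithTimestamp(rocksdb::DB* db,
                                 rocksdb::ColumnFamilyHandle* cf,
                                 const rocksdb::Slice& key,
                                 const rocksdb::Slice& value,
                                 const rocksdb::Slice& ts) {
  rocksdb::WriteOptions wopts;
  wopts.timestamp = &ts;  // the pointer DB::Put above checks against nullptr
  // DB::Put then reserves key + ts + value + 24 bytes for the WriteBatch and
  // calls WriteBatch::AssignTimestamp(ts) before forwarding to Write().
  return db->Put(wopts, cf, key, value);
}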
-#include "db/db_impl_secondary.h" +#include "db/db_impl/db_impl_secondary.h" #include "db/db_test_util.h" #include "port/stack_trace.h" -#include "util/fault_injection_test_env.h" -#include "util/sync_point.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" namespace rocksdb { @@ -237,6 +237,17 @@ TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) { }; verify_db_func("foo_value2", "bar_value2"); + + ASSERT_OK(Put("foo", "new_foo_value")); + ASSERT_OK(Put("bar", "new_bar_value")); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value", "new_bar_value"); + + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "new_foo_value_1")); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value_1", "new_bar_value"); } TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) { @@ -362,7 +373,7 @@ TEST_F(DBSecondaryTest, MissingTableFile) { SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( - "ReactiveVersionSet::ReadAndApply:AfterLoadTableHandlers", + "ReactiveVersionSet::ApplyOneVersionEditToBuilder:AfterLoadTableHandlers", [&](void* arg) { Status s = *reinterpret_cast(arg); if (s.IsPathNotFound()) { @@ -513,6 +524,256 @@ TEST_F(DBSecondaryTest, SwitchManifest) { ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); range_scan_db(); } + +// Here, "Snapshot" refers to the version edits written by +// VersionSet::WriteSnapshot() at the beginning of the new MANIFEST after +// switching from the old one. +TEST_F(DBSecondaryTest, SkipSnapshotAfterManifestSwitch) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + ASSERT_OK(Put("0", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::string value; + ReadOptions ropts; + ropts.verify_checksums = true; + ASSERT_OK(db_secondary_->Get(ropts, "0", &value)); + ASSERT_EQ("value0", value); + + Reopen(options); + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); +} + +TEST_F(DBSecondaryTest, SwitchWAL) { + const int kNumKeysPerMemtable = 1; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + const auto& verify_db = [](DB* db1, DB* db2) { + ASSERT_NE(nullptr, db1); + ASSERT_NE(nullptr, db2); + ReadOptions read_opts; + read_opts.verify_checksums = true; + std::unique_ptr it1(db1->NewIterator(read_opts)); + std::unique_ptr it2(db2->NewIterator(read_opts)); + it1->SeekToFirst(); + it2->SeekToFirst(); + for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { + ASSERT_EQ(it1->key(), it2->key()); + ASSERT_EQ(it1->value(), it2->value()); + } + ASSERT_FALSE(it1->Valid()); + ASSERT_FALSE(it2->Valid()); + + for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + }; + for 
(int k = 0; k != 16; ++k) { + ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k))); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), db_secondary_); + } +} + +TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) { + const int kNumKeysPerMemtable = 1; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:ContextCleanedUp", + "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"}}); + SyncPoint::GetInstance()->EnableProcessing(); + const std::string kCFName1 = "pikachu"; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + CreateAndReopenWithCF({kCFName1}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kCFName1}, options1); + ASSERT_EQ(2, handles_secondary_.size()); + + const auto& verify_db = [](DB* db1, + const std::vector& handles1, + DB* db2, + const std::vector& handles2) { + ASSERT_NE(nullptr, db1); + ASSERT_NE(nullptr, db2); + ReadOptions read_opts; + read_opts.verify_checksums = true; + ASSERT_EQ(handles1.size(), handles2.size()); + for (size_t i = 0; i != handles1.size(); ++i) { + std::unique_ptr it1(db1->NewIterator(read_opts, handles1[i])); + std::unique_ptr it2(db2->NewIterator(read_opts, handles2[i])); + it1->SeekToFirst(); + it2->SeekToFirst(); + for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { + ASSERT_EQ(it1->key(), it2->key()); + ASSERT_EQ(it1->value(), it2->value()); + } + ASSERT_FALSE(it1->Valid()); + ASSERT_FALSE(it2->Valid()); + + for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, handles2[i], it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, handles1[i], it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + } + }; + for (int k = 0; k != 8; ++k) { + ASSERT_OK( + Put(0 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); + ASSERT_OK( + Put(1 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); + TEST_SYNC_POINT( + "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), handles_, db_secondary_, handles_secondary_); + SyncPoint::GetInstance()->ClearTrace(); + } +} + +TEST_F(DBSecondaryTest, CatchUpAfterFlush) { + const int kNumKeysPerMemtable = 16; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + WriteOptions write_opts; + WriteBatch wb; + wb.Put("key0", "value0"); + wb.Put("key1", "value1"); + ASSERT_OK(dbfull()->Write(write_opts, &wb)); + ReadOptions read_opts; + std::unique_ptr iter1(db_secondary_->NewIterator(read_opts)); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + std::unique_ptr 
iter2(db_secondary_->NewIterator(read_opts)); + iter2->Seek("key0"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ("value0", iter2->value()); + iter2->Seek("key1"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ("value1", iter2->value()); + + { + WriteBatch wb1; + wb1.Put("key0", "value01"); + wb1.Put("key1", "value11"); + ASSERT_OK(dbfull()->Write(write_opts, &wb1)); + } + + { + WriteBatch wb2; + wb2.Put("key0", "new_value0"); + wb2.Delete("key1"); + ASSERT_OK(dbfull()->Write(write_opts, &wb2)); + } + + ASSERT_OK(Flush()); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::unique_ptr iter3(db_secondary_->NewIterator(read_opts)); + // iter3 should not see value01 and value11 at all. + iter3->Seek("key0"); + ASSERT_TRUE(iter3->Valid()); + ASSERT_EQ("new_value0", iter3->value()); + iter3->Seek("key1"); + ASSERT_FALSE(iter3->Valid()); +} + +TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) { + bool called = false; + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) { + ASSERT_NE(nullptr, arg); + called = true; + auto* s = reinterpret_cast(arg); + ASSERT_NOK(*s); + }); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles", + "DBImpl::CheckConsistency:BeforeGetFileSize"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("a", "value0")); + ASSERT_OK(Put("c", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "value1")); + ASSERT_OK(Put("d", "value1")); + ASSERT_OK(Flush()); + port::Thread thread([this]() { + Options opts; + opts.env = env_; + opts.max_open_files = -1; + OpenSecondary(opts); + }); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + thread.join(); + ASSERT_TRUE(called); +} #endif //! ROCKSDB_LITE } // namespace rocksdb diff --git a/db/db_info_dumper.cc b/db/db_info_dumper.cc index 31050d20a29..7ab7e3337aa 100644 --- a/db/db_info_dumper.cc +++ b/db/db_info_dumper.cc @@ -3,20 +3,16 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
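The secondary-instance tests above drive WAL tailing, flush catch-up and the open-time consistency check through the test helpers OpenSecondary and TryCatchUpWithPrimary. For orientation, here is a hedged sketch of the public API those helpers sit on top of; the paths are placeholders, and max_open_files = -1 is set because the tests above use it for the secondary.

#include <cassert>
#include <string>
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.max_open_files = -1;  // mirrors what the tests above require

  rocksdb::DB* secondary = nullptr;
  // "/path/to/primary" and "/path/to/secondary" are placeholders.
  rocksdb::Status s = rocksdb::DB::OpenAsSecondary(
      options, "/path/to/primary", "/path/to/secondary", &secondary);
  assert(s.ok());

  // Pull in whatever the primary has written (WAL tailing + MANIFEST replay).
  s = secondary->TryCatchUpWithPrimary();
  assert(s.ok());

  std::string value;
  s = secondary->Get(rocksdb::ReadOptions(), "some_key", &value);

  delete secondary;
  return (s.ok() || s.IsNotFound()) ? 0 : 1;
}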
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "db/db_info_dumper.h" -#include #include -#include #include +#include +#include #include +#include "file/filename.h" #include "rocksdb/env.h" -#include "util/filename.h" namespace rocksdb { diff --git a/db/db_iter.cc b/db/db_iter.cc index 43a56af78c7..b2675c520a6 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -16,6 +16,9 @@ #include "db/merge_context.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" +#include "file/filename.h" +#include "logging/logging.h" +#include "memory/arena.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -23,12 +26,9 @@ #include "rocksdb/options.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" -#include "util/arena.h" -#include "util/filename.h" -#include "util/logging.h" +#include "trace_replay/trace_replay.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/trace_replay.h" #include "util/user_comparator_wrapper.h" namespace rocksdb { @@ -46,318 +46,74 @@ static void DumpInternalIter(Iterator* iter) { } #endif -// Memtables and sstables that make the DB representation contain -// (userkey,seq,type) => uservalue entries. DBIter -// combines multiple entries for the same userkey found in the DB -// representation into a single entry while accounting for sequence -// numbers, deletion markers, overwrites, etc. -class DBIter final: public Iterator { - public: - // The following is grossly complicated. TODO: clean it up - // Which direction is the iterator currently moving? - // (1) When moving forward: - // (1a) if current_entry_is_merged_ = false, the internal iterator is - // positioned at the exact entry that yields this->key(), this->value() - // (1b) if current_entry_is_merged_ = true, the internal iterator is - // positioned immediately after the last entry that contributed to the - // current this->value(). That entry may or may not have key equal to - // this->key(). - // (2) When moving backwards, the internal iterator is positioned - // just before all entries whose user key == this->key(). - enum Direction { - kForward, - kReverse - }; - - // LocalStatistics contain Statistics counters that will be aggregated per - // each iterator instance and then will be sent to the global statistics when - // the iterator is destroyed. - // - // The purpose of this approach is to avoid perf regression happening - // when multiple threads bump the atomic counters from a DBIter::Next(). 
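The deleted comment above (the inline DBIter class definition is being lifted out of db_iter.cc, apparently into db_iter.h) explains why DBIter batches its statistics locally instead of bumping the shared atomic tickers on every Next(). A std-only sketch of that aggregation pattern, with purely illustrative names:

#include <atomic>
#include <cstdint>

struct GlobalStats {
  std::atomic<uint64_t> number_db_next{0};
  std::atomic<uint64_t> iter_bytes_read{0};
};

class LocalIterStats {
 public:
  explicit LocalIterStats(GlobalStats* global) : global_(global) {}
  ~LocalIterStats() { Flush(); }     // aggregate exactly once, at the end
  void RecordNext(uint64_t bytes) {  // cheap: plain integer increments
    ++next_count_;
    bytes_read_ += bytes;
  }
  void Flush() {
    global_->number_db_next.fetch_add(next_count_, std::memory_order_relaxed);
    global_->iter_bytes_read.fetch_add(bytes_read_, std::memory_order_relaxed);
    next_count_ = bytes_read_ = 0;
  }

 private:
  GlobalStats* global_;
  uint64_t next_count_ = 0;
  uint64_t bytes_read_ = 0;
};

int main() {
  GlobalStats g;
  {
    LocalIterStats stats(&g);
    stats.RecordNext(42);
    stats.RecordNext(8);
  }  // destructor flushes the local counters into the shared atomics once
  return g.number_db_next.load() == 2 ? 0 : 1;
}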
- struct LocalStatistics { - explicit LocalStatistics() { ResetCounters(); } - - void ResetCounters() { - next_count_ = 0; - next_found_count_ = 0; - prev_count_ = 0; - prev_found_count_ = 0; - bytes_read_ = 0; - skip_count_ = 0; - } - - void BumpGlobalStatistics(Statistics* global_statistics) { - RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_); - RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_); - RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_); - RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_); - RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_); - RecordTick(global_statistics, NUMBER_ITER_SKIP, skip_count_); - PERF_COUNTER_ADD(iter_read_bytes, bytes_read_); - ResetCounters(); - } - - // Map to Tickers::NUMBER_DB_NEXT - uint64_t next_count_; - // Map to Tickers::NUMBER_DB_NEXT_FOUND - uint64_t next_found_count_; - // Map to Tickers::NUMBER_DB_PREV - uint64_t prev_count_; - // Map to Tickers::NUMBER_DB_PREV_FOUND - uint64_t prev_found_count_; - // Map to Tickers::ITER_BYTES_READ - uint64_t bytes_read_; - // Map to Tickers::NUMBER_ITER_SKIP - uint64_t skip_count_; - }; - - DBIter(Env* _env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, const Comparator* cmp, - InternalIterator* iter, SequenceNumber s, bool arena_mode, - uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - bool allow_blob) - : env_(_env), - logger_(cf_options.info_log), - user_comparator_(cmp), - merge_operator_(cf_options.merge_operator), - iter_(iter), - read_callback_(read_callback), - sequence_(s), - statistics_(cf_options.statistics), - num_internal_keys_skipped_(0), - iterate_lower_bound_(read_options.iterate_lower_bound), - iterate_upper_bound_(read_options.iterate_upper_bound), - direction_(kForward), - valid_(false), - current_entry_is_merged_(false), - prefix_same_as_start_(read_options.prefix_same_as_start), - pin_thru_lifetime_(read_options.pin_data), - total_order_seek_(read_options.total_order_seek), - allow_blob_(allow_blob), - is_blob_(false), - arena_mode_(arena_mode), - range_del_agg_(&cf_options.internal_comparator, s), - db_impl_(db_impl), - cfd_(cfd), - start_seqnum_(read_options.iter_start_seqnum) { - RecordTick(statistics_, NO_ITERATOR_CREATED); - prefix_extractor_ = mutable_cf_options.prefix_extractor.get(); - max_skip_ = max_sequential_skip_in_iterations; - max_skippable_internal_keys_ = read_options.max_skippable_internal_keys; - if (pin_thru_lifetime_) { - pinned_iters_mgr_.StartPinning(); - } - if (iter_.iter()) { - iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_); - } - } - ~DBIter() override { - // Release pinned data if any - if (pinned_iters_mgr_.PinningEnabled()) { - pinned_iters_mgr_.ReleasePinnedData(); - } - RecordTick(statistics_, NO_ITERATOR_DELETED); - ResetInternalKeysSkippedCounter(); - local_stats_.BumpGlobalStatistics(statistics_); - iter_.DeleteIter(arena_mode_); - } - virtual void SetIter(InternalIterator* iter) { - assert(iter_.iter() == nullptr); - iter_.Set(iter); +DBIter::DBIter(Env* _env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, + const Comparator* cmp, InternalIterator* iter, SequenceNumber s, + bool arena_mode, uint64_t max_sequential_skip_in_iterations, + ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool allow_blob) + : 
prefix_extractor_(mutable_cf_options.prefix_extractor.get()), + env_(_env), + logger_(cf_options.info_log), + user_comparator_(cmp), + merge_operator_(cf_options.merge_operator), + iter_(iter), + read_callback_(read_callback), + sequence_(s), + statistics_(cf_options.statistics), + num_internal_keys_skipped_(0), + iterate_lower_bound_(read_options.iterate_lower_bound), + iterate_upper_bound_(read_options.iterate_upper_bound), + direction_(kForward), + valid_(false), + current_entry_is_merged_(false), + is_key_seqnum_zero_(false), + prefix_same_as_start_(mutable_cf_options.prefix_extractor + ? read_options.prefix_same_as_start + : false), + pin_thru_lifetime_(read_options.pin_data), + total_order_seek_(read_options.total_order_seek), + allow_blob_(allow_blob), + is_blob_(false), + arena_mode_(arena_mode), + range_del_agg_(&cf_options.internal_comparator, s), + db_impl_(db_impl), + cfd_(cfd), + start_seqnum_(read_options.iter_start_seqnum) { + RecordTick(statistics_, NO_ITERATOR_CREATED); + max_skip_ = max_sequential_skip_in_iterations; + max_skippable_internal_keys_ = read_options.max_skippable_internal_keys; + if (pin_thru_lifetime_) { + pinned_iters_mgr_.StartPinning(); + } + if (iter_.iter()) { iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_); } - virtual ReadRangeDelAggregator* GetRangeDelAggregator() { - return &range_del_agg_; - } - - bool Valid() const override { return valid_; } - Slice key() const override { - assert(valid_); - if(start_seqnum_ > 0) { - return saved_key_.GetInternalKey(); - } else { - return saved_key_.GetUserKey(); - } - } - Slice value() const override { - assert(valid_); - if (current_entry_is_merged_) { - // If pinned_value_ is set then the result of merge operator is one of - // the merge operands and we should return it. - return pinned_value_.data() ? pinned_value_ : saved_value_; - } else if (direction_ == kReverse) { - return pinned_value_; - } else { - return iter_.value(); - } - } - Status status() const override { - if (status_.ok()) { - return iter_.status(); - } else { - assert(!valid_); - return status_; - } - } - bool IsBlob() const { - assert(valid_ && (allow_blob_ || !is_blob_)); - return is_blob_; - } - - Status GetProperty(std::string prop_name, std::string* prop) override { - if (prop == nullptr) { - return Status::InvalidArgument("prop is nullptr"); - } - if (prop_name == "rocksdb.iterator.super-version-number") { - // First try to pass the value returned from inner iterator. - return iter_.iter()->GetProperty(prop_name, prop); - } else if (prop_name == "rocksdb.iterator.is-key-pinned") { - if (valid_) { - *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? 
"1" : "0"; - } else { - *prop = "Iterator is not valid."; - } - return Status::OK(); - } else if (prop_name == "rocksdb.iterator.internal-key") { - *prop = saved_key_.GetUserKey().ToString(); - return Status::OK(); - } - return Status::InvalidArgument("Unidentified property."); - } - - inline void Next() final override; - inline void Prev() final override; - inline void Seek(const Slice& target) final override; - inline void SeekForPrev(const Slice& target) final override; - inline void SeekToFirst() final override; - inline void SeekToLast() final override; - Env* env() { return env_; } - void set_sequence(uint64_t s) { - sequence_ = s; - if (read_callback_) { - read_callback_->Refresh(s); - } - } - void set_valid(bool v) { valid_ = v; } - - private: - // For all methods in this block: - // PRE: iter_->Valid() && status_.ok() - // Return false if there was an error, and status() is non-ok, valid_ = false; - // in this case callers would usually stop what they were doing and return. - bool ReverseToForward(); - bool ReverseToBackward(); - bool FindValueForCurrentKey(); - bool FindValueForCurrentKeyUsingSeek(); - bool FindUserKeyBeforeSavedKey(); - inline bool FindNextUserEntry(bool skipping, bool prefix_check); - inline bool FindNextUserEntryInternal(bool skipping, bool prefix_check); - bool ParseKey(ParsedInternalKey* key); - bool MergeValuesNewToOld(); - - void PrevInternal(); - bool TooManyInternalKeysSkipped(bool increment = true); - inline bool IsVisible(SequenceNumber sequence); - - // CanReseekToSkip() returns whether the iterator can use the optimization - // where it reseek by sequence number to get the next key when there are too - // many versions. This is disabled for write unprepared because seeking to - // sequence number does not guarantee that it is visible. - inline bool CanReseekToSkip(); - - // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData() - // is called - void TempPinData() { - if (!pin_thru_lifetime_) { - pinned_iters_mgr_.StartPinning(); - } - } - - // Release blocks pinned by TempPinData() - void ReleaseTempPinnedData() { - if (!pin_thru_lifetime_ && pinned_iters_mgr_.PinningEnabled()) { - pinned_iters_mgr_.ReleasePinnedData(); - } - } +} - inline void ClearSavedValue() { - if (saved_value_.capacity() > 1048576) { - std::string empty; - swap(empty, saved_value_); - } else { - saved_value_.clear(); - } +Status DBIter::GetProperty(std::string prop_name, std::string* prop) { + if (prop == nullptr) { + return Status::InvalidArgument("prop is nullptr"); } - - inline void ResetInternalKeysSkippedCounter() { - local_stats_.skip_count_ += num_internal_keys_skipped_; + if (prop_name == "rocksdb.iterator.super-version-number") { + // First try to pass the value returned from inner iterator. + return iter_.iter()->GetProperty(prop_name, prop); + } else if (prop_name == "rocksdb.iterator.is-key-pinned") { if (valid_) { - local_stats_.skip_count_--; + *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? 
"1" : "0"; + } else { + *prop = "Iterator is not valid."; } - num_internal_keys_skipped_ = 0; + return Status::OK(); + } else if (prop_name == "rocksdb.iterator.internal-key") { + *prop = saved_key_.GetUserKey().ToString(); + return Status::OK(); } + return Status::InvalidArgument("Unidentified property."); +} - const SliceTransform* prefix_extractor_; - Env* const env_; - Logger* logger_; - UserComparatorWrapper user_comparator_; - const MergeOperator* const merge_operator_; - IteratorWrapper iter_; - ReadCallback* read_callback_; - // Max visible sequence number. It is normally the snapshot seq unless we have - // uncommitted data in db as in WriteUnCommitted. - SequenceNumber sequence_; - - IterKey saved_key_; - // Reusable internal key data structure. This is only used inside one function - // and should not be used across functions. Reusing this object can reduce - // overhead of calling construction of the function if creating it each time. - ParsedInternalKey ikey_; - std::string saved_value_; - Slice pinned_value_; - // for prefix seek mode to support prev() - Statistics* statistics_; - uint64_t max_skip_; - uint64_t max_skippable_internal_keys_; - uint64_t num_internal_keys_skipped_; - const Slice* iterate_lower_bound_; - const Slice* iterate_upper_bound_; - - IterKey prefix_start_buf_; - - Status status_; - Slice prefix_start_key_; - Direction direction_; - bool valid_; - bool current_entry_is_merged_; - const bool prefix_same_as_start_; - // Means that we will pin all data blocks we read as long the Iterator - // is not deleted, will be true if ReadOptions::pin_data is true - const bool pin_thru_lifetime_; - const bool total_order_seek_; - bool allow_blob_; - bool is_blob_; - bool arena_mode_; - // List of operands for merge operator. - MergeContext merge_context_; - ReadRangeDelAggregator range_del_agg_; - LocalStatistics local_stats_; - PinnedIteratorsManager pinned_iters_mgr_; - DBImpl* db_impl_; - ColumnFamilyData* cfd_; - // for diff snapshots we want the lower bound on the seqnum; - // if this value > 0 iterator will return internal keys - SequenceNumber start_seqnum_; - - // No copying allowed - DBIter(const DBIter&); - void operator=(const DBIter&); -}; - -inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { +bool DBIter::ParseKey(ParsedInternalKey* ikey) { if (!ParseInternalKey(iter_.key(), ikey)) { status_ = Status::Corruption("corrupted internal key in DBIter"); valid_ = false; @@ -381,6 +137,7 @@ void DBIter::Next() { num_internal_keys_skipped_ = 0; bool ok = true; if (direction_ == kReverse) { + is_key_seqnum_zero_ = false; if (!ReverseToForward()) { ok = false; } @@ -397,9 +154,15 @@ void DBIter::Next() { local_stats_.next_count_++; if (ok && iter_.Valid()) { + Slice prefix; + if (prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + prefix = prefix_.GetUserKey(); + } FindNextUserEntry(true /* skipping the current user key */, - prefix_same_as_start_); + prefix_same_as_start_ ? 
&prefix : nullptr); } else { + is_key_seqnum_zero_ = false; valid_ = false; } if (statistics_ != nullptr && valid_) { @@ -408,7 +171,7 @@ void DBIter::Next() { } } -// PRE: saved_key_ has the current user key if skipping +// PRE: saved_key_ has the current user key if skipping_saved_key // POST: saved_key_ should have the next user key if valid_, // if the current entry is a result of merge // current_entry_is_merged_ => true @@ -418,17 +181,17 @@ void DBIter::Next() { // a delete marker or a sequence number higher than sequence_ // saved_key_ MUST have a proper user_key before calling this function // -// The prefix_check parameter controls whether we check the iterated -// keys against the prefix of the seeked key. Set to false when -// performing a seek without a key (e.g. SeekToFirst). Set to -// prefix_same_as_start_ for other iterations. -inline bool DBIter::FindNextUserEntry(bool skipping, bool prefix_check) { +// The prefix parameter, if not null, indicates that we need to iterator +// within the prefix, and the iterator needs to be made invalid, if no +// more entry for the prefix can be found. +bool DBIter::FindNextUserEntry(bool skipping_saved_key, const Slice* prefix) { PERF_TIMER_GUARD(find_next_user_entry_time); - return FindNextUserEntryInternal(skipping, prefix_check); + return FindNextUserEntryInternal(skipping_saved_key, prefix); } // Actual implementation of DBIter::FindNextUserEntry() -inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { +bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, + const Slice* prefix) { // Loop until we hit an acceptable entry to yield assert(iter_.Valid()); assert(status_.ok()); @@ -437,31 +200,46 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) // How many times in a row we have skipped an entry with user key less than // or equal to saved_key_. We could skip these entries either because - // sequence numbers were too high or because skipping = true. + // sequence numbers were too high or because skipping_saved_key = true. // What saved_key_ contains throughout this method: - // - if skipping : saved_key_ contains the key that we need to skip, + // - if skipping_saved_key : saved_key_ contains the key that we need + // to skip, // and we haven't seen any keys greater than that, // - if num_skipped > 0 : saved_key_ contains the key that we have skipped // num_skipped times, and we haven't seen any keys // greater than that, // - none of the above : saved_key_ can contain anything, it doesn't matter. uint64_t num_skipped = 0; + // For write unprepared, the target sequence number in reseek could be larger + // than the snapshot, and thus needs to be skipped again. This could result in + // an infinite loop of reseeks. To avoid that, we limit the number of reseeks + // to one. + bool reseek_done = false; is_blob_ = false; do { + // Will update is_key_seqnum_zero_ as soon as we parsed the current key + // but we need to save the previous value to be used in the loop. 
+ bool is_prev_key_seqnum_zero = is_key_seqnum_zero_; if (!ParseKey(&ikey_)) { + is_key_seqnum_zero_ = false; return false; } - if (iterate_upper_bound_ != nullptr && + is_key_seqnum_zero_ = (ikey_.sequence == 0); + + assert(iterate_upper_bound_ == nullptr || iter_.MayBeOutOfUpperBound() || + user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) < 0); + if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() && user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) { break; } - if (prefix_extractor_ && prefix_check && - prefix_extractor_->Transform(ikey_.user_key) - .compare(prefix_start_key_) != 0) { + assert(prefix == nullptr || prefix_extractor_ != nullptr); + if (prefix != nullptr && + prefix_extractor_->Transform(ikey_.user_key).compare(*prefix) != 0) { + assert(prefix_same_as_start_); break; } @@ -470,12 +248,21 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) } if (IsVisible(ikey_.sequence)) { - if (skipping && user_comparator_.Compare(ikey_.user_key, - saved_key_.GetUserKey()) <= 0) { + // If the previous entry is of seqnum 0, the current entry will not + // possibly be skipped. This condition can potentially be relaxed to + // prev_key.seq <= ikey_.sequence. We are cautious because it will be more + // prone to bugs causing the same user key with the same sequence number. + if (!is_prev_key_seqnum_zero && skipping_saved_key && + user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()) <= + 0) { num_skipped++; // skip this entry PERF_COUNTER_ADD(internal_key_skipped_count, 1); } else { + assert(!skipping_saved_key || + user_comparator_.Compare(ikey_.user_key, + saved_key_.GetUserKey()) > 0); num_skipped = 0; + reseek_done = false; switch (ikey_.type) { case kTypeDeletion: case kTypeSingleDeletion: @@ -494,7 +281,7 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) saved_key_.SetUserKey( ikey_.user_key, !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */); - skipping = true; + skipping_saved_key = true; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); } break; @@ -512,12 +299,12 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) return true; } else { // this key and all previous versions shouldn't be included, - // skipping + // skipping_saved_key saved_key_.SetUserKey( ikey_.user_key, !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */); - skipping = true; + skipping_saved_key = true; } } else { saved_key_.SetUserKey( @@ -527,8 +314,9 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) ikey_, RangeDelPositioningMode::kForwardTraversal)) { // Arrange to skip all upcoming entries for this key since // they are hidden by this deletion. - skipping = true; + skipping_saved_key = true; num_skipped = 0; + reseek_done = false; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); } else if (ikey_.type == kTypeBlobIndex) { if (!allow_blob_) { @@ -557,8 +345,9 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) ikey_, RangeDelPositioningMode::kForwardTraversal)) { // Arrange to skip all upcoming entries for this key since // they are hidden by this deletion. 
- skipping = true; + skipping_saved_key = true; num_skipped = 0; + reseek_done = false; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); } else { // By now, we are sure the current ikey is going to yield a @@ -581,30 +370,40 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) // to seek to the target sequence number. int cmp = user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()); - if (cmp == 0 || (skipping && cmp <= 0)) { + if (cmp == 0 || (skipping_saved_key && cmp <= 0)) { num_skipped++; } else { saved_key_.SetUserKey( ikey_.user_key, !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); - skipping = false; + skipping_saved_key = false; num_skipped = 0; + reseek_done = false; } } // If we have sequentially iterated via numerous equal keys, then it's // better to seek so that we can avoid too many key comparisons. - if (num_skipped > max_skip_ && CanReseekToSkip()) { + // + // To avoid infinite loops, do not reseek if we have already attempted to + // reseek previously. + // + // TODO(lth): If we reseek to sequence number greater than ikey_.sequence, + // than it does not make sense to reseek as we would actually land further + // away from the desired key. There is opportunity for optimization here. + if (num_skipped > max_skip_ && !reseek_done) { + is_key_seqnum_zero_ = false; num_skipped = 0; + reseek_done = true; std::string last_key; - if (skipping) { + if (skipping_saved_key) { // We're looking for the next user-key but all we see are the same // user-key with decreasing sequence numbers. Fast forward to // sequence number 0 and type deletion (the smallest type). AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion)); - // Don't set skipping = false because we may still see more user-keys - // equal to saved_key_. + // Don't set skipping_saved_key = false because we may still see more + // user-keys equal to saved_key_. } else { // We saw multiple entries with this user key and sequence numbers // higher than sequence_. Fast forward to sequence_. @@ -745,8 +544,14 @@ void DBIter::Prev() { } } if (ok) { - PrevInternal(); + Slice prefix; + if (prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + prefix = prefix_.GetUserKey(); + } + PrevInternal(prefix_same_as_start_ ? &prefix : nullptr); } + if (statistics_ != nullptr) { local_stats_.prev_count_++; if (valid_) { @@ -824,21 +629,26 @@ bool DBIter::ReverseToBackward() { return FindUserKeyBeforeSavedKey(); } -void DBIter::PrevInternal() { +void DBIter::PrevInternal(const Slice* prefix) { while (iter_.Valid()) { saved_key_.SetUserKey( ExtractUserKey(iter_.key()), !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); - if (prefix_extractor_ && prefix_same_as_start_ && + assert(prefix == nullptr || prefix_extractor_ != nullptr); + if (prefix != nullptr && prefix_extractor_->Transform(saved_key_.GetUserKey()) - .compare(prefix_start_key_) != 0) { + .compare(*prefix) != 0) { + assert(prefix_same_as_start_); // Current key does not have the same prefix as start valid_ = false; return; } - if (iterate_lower_bound_ != nullptr && + assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() || + user_comparator_.Compare(saved_key_.GetUserKey(), + *iterate_lower_bound_) >= 0); + if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() && user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < 0) { // We've iterated earlier than the user-specified lower bound. 
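The FindNextUserEntryInternal changes above keep the existing optimization of reseeking once more than max_skip_ obsolete versions of one user key have been skipped, but now guard it with reseek_done so that a write-unprepared reader cannot loop on repeated reseeks. A std-only toy model of the skip-versus-reseek idea (not RocksDB code; the container and names are illustrative, kMaxSkip stands in for max_sequential_skip_in_iterations):

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <utility>

// Internal keys ordered like RocksDB's: user key ascending, sequence descending.
struct InternalKeyCmp {
  bool operator()(const std::pair<std::string, uint64_t>& a,
                  const std::pair<std::string, uint64_t>& b) const {
    if (a.first != b.first) return a.first < b.first;
    return a.second > b.second;  // newer versions sort first
  }
};

int main() {
  std::map<std::pair<std::string, uint64_t>, std::string, InternalKeyCmp> mem;
  for (uint64_t s = 1; s <= 1000; ++s) {
    mem[{"hot_key", s}] = "v" + std::to_string(s);  // many versions of one key
  }
  mem[{"next_key", 1}] = "x";

  const uint64_t kMaxSkip = 8;
  uint64_t skipped = 0;
  auto it = mem.begin();  // newest "hot_key" version
  ++it;                   // pretend the newest version was already consumed
  while (it != mem.end() && it->first.first == "hot_key") {
    if (++skipped > kMaxSkip) {
      // Too many obsolete versions: reseek to (same user key, seq 0), the
      // last possible slot for this key, much like the patch builds a target
      // of (saved_key_, 0, kTypeDeletion) and calls Seek on the inner iterator.
      it = mem.lower_bound({"hot_key", 0});
      break;
    }
    ++it;  // the slow path: linear skipping, one version at a time
  }
  if (it != mem.end() && it->first.first == "hot_key") ++it;
  std::cout << "landed on user key: " << it->first.first << "\n";  // next_key
  return 0;
}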
@@ -911,7 +721,7 @@ bool DBIter::FindValueForCurrentKey() { // This user key has lots of entries. // We're going from old to new, and it's taking too long. Let's do a Seek() // and go from new to old. This helps when a key was overwritten many times. - if (num_skipped >= max_skip_ && CanReseekToSkip()) { + if (num_skipped >= max_skip_) { return FindValueForCurrentKeyUsingSeek(); } @@ -1046,6 +856,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { // In case read_callback presents, the value we seek to may not be visible. // Find the next value that's visible. ParsedInternalKey ikey; + is_blob_ = false; while (true) { if (!iter_.Valid()) { valid_ = false; @@ -1087,6 +898,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex) { assert(iter_.iter()->IsValuePinned()); pinned_value_ = iter_.value(); + is_blob_ = (ikey.type == kTypeBlobIndex); valid_ = true; return true; } @@ -1208,7 +1020,7 @@ bool DBIter::FindUserKeyBeforeSavedKey() { PERF_COUNTER_ADD(internal_key_skipped_count, 1); } - if (num_skipped >= max_skip_ && CanReseekToSkip()) { + if (num_skipped >= max_skip_) { num_skipped = 0; IterKey last_key; last_key.SetInternalKey(ParsedInternalKey( @@ -1255,20 +1067,39 @@ bool DBIter::IsVisible(SequenceNumber sequence) { } } -bool DBIter::CanReseekToSkip() { - return read_callback_ == nullptr || read_callback_->CanReseekToSkip(); +void DBIter::SetSavedKeyToSeekTarget(const Slice& target) { + is_key_seqnum_zero_ = false; + SequenceNumber seq = sequence_; + saved_key_.Clear(); + saved_key_.SetInternalKey(target, seq); + + if (iterate_lower_bound_ != nullptr && + user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < + 0) { + // Seek key is smaller than the lower bound. + saved_key_.Clear(); + saved_key_.SetInternalKey(*iterate_lower_bound_, seq); + } +} + +void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) { + is_key_seqnum_zero_ = false; + saved_key_.Clear(); + // now saved_key is used to store internal key. + saved_key_.SetInternalKey(target, 0 /* sequence_number */, + kValueTypeForSeekForPrev); + + if (iterate_upper_bound_ != nullptr && + user_comparator_.Compare(saved_key_.GetUserKey(), + *iterate_upper_bound_) >= 0) { + saved_key_.Clear(); + saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber); + } } void DBIter::Seek(const Slice& target) { PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); StopWatch sw(env_, statistics_, DB_SEEK); - status_ = Status::OK(); - ReleaseTempPinnedData(); - ResetInternalKeysSkippedCounter(); - - SequenceNumber seq = sequence_; - saved_key_.Clear(); - saved_key_.SetInternalKey(target, seq); #ifndef ROCKSDB_LITE if (db_impl_ != nullptr && cfd_ != nullptr) { @@ -1276,101 +1107,112 @@ void DBIter::Seek(const Slice& target) { } #endif // ROCKSDB_LITE - if (iterate_lower_bound_ != nullptr && - user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < - 0) { - saved_key_.Clear(); - saved_key_.SetInternalKey(*iterate_lower_bound_, seq); - } + status_ = Status::OK(); + ReleaseTempPinnedData(); + ResetInternalKeysSkippedCounter(); + // Seek the inner iterator based on the target key. 
{ PERF_TIMER_GUARD(seek_internal_seek_time); + + SetSavedKeyToSeekTarget(target); iter_.Seek(saved_key_.GetInternalKey()); + range_del_agg_.InvalidateRangeDelMapPositions(); + RecordTick(statistics_, NUMBER_DB_SEEK); } - RecordTick(statistics_, NUMBER_DB_SEEK); - if (iter_.Valid()) { - if (prefix_extractor_ && prefix_same_as_start_) { - prefix_start_key_ = prefix_extractor_->Transform(target); - } - direction_ = kForward; - ClearSavedValue(); - FindNextUserEntry(false /* not skipping */, prefix_same_as_start_); - if (!valid_) { - prefix_start_key_.clear(); - } - if (statistics_ != nullptr) { - if (valid_) { - // Decrement since we don't want to count this key as skipped - RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); - RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); - PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); - } + if (!iter_.Valid()) { + valid_ = false; + return; + } + direction_ = kForward; + + // Now the inner iterator is placed to the target position. From there, + // we need to find out the next key that is visible to the user. + // + ClearSavedValue(); + if (prefix_same_as_start_) { + // The case where the iterator needs to be invalidated if it has exausted + // keys within the same prefix of the seek key. + assert(prefix_extractor_ != nullptr); + Slice target_prefix; + target_prefix = prefix_extractor_->Transform(target); + FindNextUserEntry(false /* not skipping saved_key */, + &target_prefix /* prefix */); + if (valid_) { + // Remember the prefix of the seek key for the future Prev() call to + // check. + prefix_.SetUserKey(target_prefix); } } else { - valid_ = false; + FindNextUserEntry(false /* not skipping saved_key */, nullptr); + } + if (!valid_) { + return; } - if (valid_ && prefix_extractor_ && prefix_same_as_start_) { - prefix_start_buf_.SetUserKey(prefix_start_key_); - prefix_start_key_ = prefix_start_buf_.GetUserKey(); + // Updating stats and perf context counters. + if (statistics_ != nullptr) { + // Decrement since we don't want to count this key as skipped + RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); + RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); } + PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); } void DBIter::SeekForPrev(const Slice& target) { PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); StopWatch sw(env_, statistics_, DB_SEEK); + +#ifndef ROCKSDB_LITE + if (db_impl_ != nullptr && cfd_ != nullptr) { + db_impl_->TraceIteratorSeekForPrev(cfd_->GetID(), target); + } +#endif // ROCKSDB_LITE + status_ = Status::OK(); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); - saved_key_.Clear(); - // now saved_key is used to store internal key. - saved_key_.SetInternalKey(target, 0 /* sequence_number */, - kValueTypeForSeekForPrev); - - if (iterate_upper_bound_ != nullptr && - user_comparator_.Compare(saved_key_.GetUserKey(), - *iterate_upper_bound_) >= 0) { - saved_key_.Clear(); - saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber); - } + // Seek the inner iterator based on the target key. 
{ PERF_TIMER_GUARD(seek_internal_seek_time); + SetSavedKeyToSeekForPrevTarget(target); iter_.SeekForPrev(saved_key_.GetInternalKey()); range_del_agg_.InvalidateRangeDelMapPositions(); + RecordTick(statistics_, NUMBER_DB_SEEK); } - -#ifndef ROCKSDB_LITE - if (db_impl_ != nullptr && cfd_ != nullptr) { - db_impl_->TraceIteratorSeekForPrev(cfd_->GetID(), target); + if (!iter_.Valid()) { + valid_ = false; + return; } -#endif // ROCKSDB_LITE + direction_ = kReverse; - RecordTick(statistics_, NUMBER_DB_SEEK); - if (iter_.Valid()) { - if (prefix_extractor_ && prefix_same_as_start_) { - prefix_start_key_ = prefix_extractor_->Transform(target); - } - direction_ = kReverse; - ClearSavedValue(); - PrevInternal(); - if (!valid_) { - prefix_start_key_.clear(); - } - if (statistics_ != nullptr) { - if (valid_) { - RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); - RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); - PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); - } + // Now the inner iterator is placed to the target position. From there, + // we need to find out the first key that is visible to the user in the + // backward direction. + ClearSavedValue(); + if (prefix_same_as_start_) { + // The case where the iterator needs to be invalidated if it has exausted + // keys within the same prefix of the seek key. + assert(prefix_extractor_ != nullptr); + Slice target_prefix; + target_prefix = prefix_extractor_->Transform(target); + PrevInternal(&target_prefix); + if (valid_) { + // Remember the prefix of the seek key for the future Prev() call to + // check. + prefix_.SetUserKey(target_prefix); } } else { - valid_ = false; + PrevInternal(nullptr); } - if (valid_ && prefix_extractor_ && prefix_same_as_start_) { - prefix_start_buf_.SetUserKey(prefix_start_key_); - prefix_start_key_ = prefix_start_buf_.GetUserKey(); + + // Report stats and perf context. 
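With the reworked Seek and SeekForPrev above, prefix_same_as_start is honored only when a prefix extractor is configured, and the iterator remembers the seek key's prefix in prefix_ so it can invalidate itself once iteration leaves that prefix. A hedged usage sketch of that behavior through the public API; the database path, key layout and prefix length are illustrative:

#include <cassert>
#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // 6-byte prefixes so that "userA:" and "userB:" are distinct prefixes.
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(6));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/prefix_demo", &db);
  assert(s.ok());
  db->Put(rocksdb::WriteOptions(), "userA:1", "a1");
  db->Put(rocksdb::WriteOptions(), "userA:2", "a2");
  db->Put(rocksdb::WriteOptions(), "userB:1", "b1");

  rocksdb::ReadOptions ropts;
  ropts.prefix_same_as_start = true;  // remember the seek key's prefix
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ropts));
  int count = 0;
  for (it->Seek("userA:"); it->Valid(); it->Next()) {
    ++count;  // sees userA:1 and userA:2, then becomes !Valid() at userB:1
  }
  assert(count == 2);
  it.reset();
  delete db;
  return 0;
}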
+ if (statistics_ != nullptr && valid_) { + RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); + RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); + PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); } } @@ -1390,6 +1232,7 @@ void DBIter::SeekToFirst() { ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); ClearSavedValue(); + is_key_seqnum_zero_ = false; { PERF_TIMER_GUARD(seek_internal_seek_time); @@ -1402,7 +1245,8 @@ void DBIter::SeekToFirst() { saved_key_.SetUserKey( ExtractUserKey(iter_.key()), !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); - FindNextUserEntry(false /* not skipping */, false /* no prefix check */); + FindNextUserEntry(false /* not skipping saved_key */, + nullptr /* no prefix check */); if (statistics_ != nullptr) { if (valid_) { RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); @@ -1413,10 +1257,9 @@ void DBIter::SeekToFirst() { } else { valid_ = false; } - if (valid_ && prefix_extractor_ && prefix_same_as_start_) { - prefix_start_buf_.SetUserKey( - prefix_extractor_->Transform(saved_key_.GetUserKey())); - prefix_start_key_ = prefix_start_buf_.GetUserKey(); + if (valid_ && prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey())); } } @@ -1426,7 +1269,7 @@ void DBIter::SeekToLast() { SeekForPrev(*iterate_upper_bound_); if (Valid() && user_comparator_.Equal(*iterate_upper_bound_, key())) { ReleaseTempPinnedData(); - PrevInternal(); + PrevInternal(nullptr); } return; } @@ -1442,13 +1285,14 @@ void DBIter::SeekToLast() { ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); ClearSavedValue(); + is_key_seqnum_zero_ = false; { PERF_TIMER_GUARD(seek_internal_seek_time); iter_.SeekToLast(); range_del_agg_.InvalidateRangeDelMapPositions(); } - PrevInternal(); + PrevInternal(nullptr); if (statistics_ != nullptr) { RecordTick(statistics_, NUMBER_DB_SEEK); if (valid_) { @@ -1457,10 +1301,9 @@ void DBIter::SeekToLast() { PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); } } - if (valid_ && prefix_extractor_ && prefix_same_as_start_) { - prefix_start_buf_.SetUserKey( - prefix_extractor_->Transform(saved_key_.GetUserKey())); - prefix_start_key_ = prefix_start_buf_.GetUserKey(); + if (valid_ && prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey())); } } @@ -1480,114 +1323,4 @@ Iterator* NewDBIterator(Env* env, const ReadOptions& read_options, return db_iter; } -ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); } - -ReadRangeDelAggregator* ArenaWrappedDBIter::GetRangeDelAggregator() { - return db_iter_->GetRangeDelAggregator(); -} - -void ArenaWrappedDBIter::SetIterUnderDBIter(InternalIterator* iter) { - static_cast(db_iter_)->SetIter(iter); -} - -inline bool ArenaWrappedDBIter::Valid() const { return db_iter_->Valid(); } -inline void ArenaWrappedDBIter::SeekToFirst() { db_iter_->SeekToFirst(); } -inline void ArenaWrappedDBIter::SeekToLast() { db_iter_->SeekToLast(); } -inline void ArenaWrappedDBIter::Seek(const Slice& target) { - db_iter_->Seek(target); -} -inline void ArenaWrappedDBIter::SeekForPrev(const Slice& target) { - db_iter_->SeekForPrev(target); -} -inline void ArenaWrappedDBIter::Next() { db_iter_->Next(); } -inline void ArenaWrappedDBIter::Prev() { db_iter_->Prev(); } -inline Slice ArenaWrappedDBIter::key() const { return db_iter_->key(); } -inline Slice ArenaWrappedDBIter::value() const { return 
db_iter_->value(); } -inline Status ArenaWrappedDBIter::status() const { return db_iter_->status(); } -bool ArenaWrappedDBIter::IsBlob() const { return db_iter_->IsBlob(); } -inline Status ArenaWrappedDBIter::GetProperty(std::string prop_name, - std::string* prop) { - if (prop_name == "rocksdb.iterator.super-version-number") { - // First try to pass the value returned from inner iterator. - if (!db_iter_->GetProperty(prop_name, prop).ok()) { - *prop = ToString(sv_number_); - } - return Status::OK(); - } - return db_iter_->GetProperty(prop_name, prop); -} - -void ArenaWrappedDBIter::Init(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iteration, - uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool allow_blob, - bool allow_refresh) { - auto mem = arena_.AllocateAligned(sizeof(DBIter)); - db_iter_ = new (mem) DBIter(env, read_options, cf_options, mutable_cf_options, - cf_options.user_comparator, nullptr, sequence, - true, max_sequential_skip_in_iteration, - read_callback, db_impl, cfd, allow_blob); - sv_number_ = version_number; - allow_refresh_ = allow_refresh; -} - -Status ArenaWrappedDBIter::Refresh() { - if (cfd_ == nullptr || db_impl_ == nullptr || !allow_refresh_) { - return Status::NotSupported("Creating renew iterator is not allowed."); - } - assert(db_iter_ != nullptr); - // TODO(yiwu): For last_seq_same_as_publish_seq_==false, this is not the - // correct behavior. Will be corrected automatically when we take a snapshot - // here for the case of WritePreparedTxnDB. - SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber(); - uint64_t cur_sv_number = cfd_->GetSuperVersionNumber(); - if (sv_number_ != cur_sv_number) { - Env* env = db_iter_->env(); - db_iter_->~DBIter(); - arena_.~Arena(); - new (&arena_) Arena(); - - SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_->mutex()); - if (read_callback_) { - read_callback_->Refresh(latest_seq); - } - Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options, - latest_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, - cur_sv_number, read_callback_, db_impl_, cfd_, allow_blob_, - allow_refresh_); - - InternalIterator* internal_iter = db_impl_->NewInternalIterator( - read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator(), - latest_seq); - SetIterUnderDBIter(internal_iter); - } else { - db_iter_->set_sequence(latest_seq); - db_iter_->set_valid(false); - } - return Status::OK(); -} - -ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - bool allow_blob, bool allow_refresh) { - ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); - iter->Init(env, read_options, cf_options, mutable_cf_options, sequence, - max_sequential_skip_in_iterations, version_number, read_callback, - db_impl, cfd, allow_blob, allow_refresh); - if (db_impl != nullptr && cfd != nullptr && allow_refresh) { - iter->StoreRefreshInfo(read_options, db_impl, cfd, read_callback, - allow_blob); - } - - return iter; -} - } // namespace rocksdb diff --git a/db/db_iter.h b/db/db_iter.h index a640f0296e5..e6e072c5051 100644 --- a/db/db_iter.h 
+++ b/db/db_iter.h @@ -10,22 +10,321 @@ #pragma once #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/range_del_aggregator.h" +#include "memory/arena.h" #include "options/cf_options.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" -#include "util/arena.h" +#include "table/iterator_wrapper.h" #include "util/autovector.h" namespace rocksdb { -class Arena; -class DBIter; +// This file declares the factory functions of DBIter, in its original form +// or a wrapped form with class ArenaWrappedDBIter, which is defined here. +// Class DBIter, which is declared and implemented inside db_iter.cc, is +// an iterator that converts internal keys (yielded by an InternalIterator) +// that were live at the specified sequence number into appropriate user +// keys. +// Each internal key consists of a user key, a sequence number, and a value +// type. DBIter deals with multiple key versions, tombstones, merge operands, +// etc., and exposes an Iterator. +// For example, DBIter may wrap the following InternalIterator: +// user key: AAA value: v3 seqno: 100 type: Put +// user key: AAA value: v2 seqno: 97 type: Put +// user key: AAA value: v1 seqno: 95 type: Put +// user key: BBB value: v1 seqno: 90 type: Put +// user key: BBC value: N/A seqno: 98 type: Delete +// user key: BBC value: v1 seqno: 95 type: Put +// If the snapshot passed in is 102, then the DBIter is expected to +// expose the following iterator: +// key: AAA value: v3 +// key: BBB value: v1 +// If the snapshot passed in is 96, then it should expose: +// key: AAA value: v1 +// key: BBB value: v1 +// key: BBC value: v1 +// + +// Memtables and sstables that make up the DB representation contain +// (userkey,seq,type) => uservalue entries. DBIter +// combines multiple entries for the same userkey found in the DB +// representation into a single entry while accounting for sequence +// numbers, deletion markers, overwrites, etc. +class DBIter final : public Iterator { + public: + // The following is grossly complicated. TODO: clean it up + // Which direction is the iterator currently moving? + // (1) When moving forward: + // (1a) if current_entry_is_merged_ = false, the internal iterator is + // positioned at the exact entry that yields this->key(), this->value() + // (1b) if current_entry_is_merged_ = true, the internal iterator is + // positioned immediately after the last entry that contributed to the + // current this->value(). That entry may or may not have key equal to + // this->key(). + // (2) When moving backwards, the internal iterator is positioned + // just before all entries whose user key == this->key(). + enum Direction { kForward, kReverse }; + + // LocalStatistics contains Statistics counters that will be aggregated per + // iterator instance and then will be sent to the global statistics when + // the iterator is destroyed. + // + // The purpose of this approach is to avoid the perf regression that happens + // when multiple threads bump the atomic counters from DBIter::Next(). 
+ struct LocalStatistics { + explicit LocalStatistics() { ResetCounters(); } + + void ResetCounters() { + next_count_ = 0; + next_found_count_ = 0; + prev_count_ = 0; + prev_found_count_ = 0; + bytes_read_ = 0; + skip_count_ = 0; + } + + void BumpGlobalStatistics(Statistics* global_statistics) { + RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_); + RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_); + RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_); + RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_); + RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_); + RecordTick(global_statistics, NUMBER_ITER_SKIP, skip_count_); + PERF_COUNTER_ADD(iter_read_bytes, bytes_read_); + ResetCounters(); + } + + // Map to Tickers::NUMBER_DB_NEXT + uint64_t next_count_; + // Map to Tickers::NUMBER_DB_NEXT_FOUND + uint64_t next_found_count_; + // Map to Tickers::NUMBER_DB_PREV + uint64_t prev_count_; + // Map to Tickers::NUMBER_DB_PREV_FOUND + uint64_t prev_found_count_; + // Map to Tickers::ITER_BYTES_READ + uint64_t bytes_read_; + // Map to Tickers::NUMBER_ITER_SKIP + uint64_t skip_count_; + }; + + DBIter(Env* _env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, const Comparator* cmp, + InternalIterator* iter, SequenceNumber s, bool arena_mode, + uint64_t max_sequential_skip_in_iterations, + ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + bool allow_blob); + + // No copying allowed + DBIter(const DBIter&) = delete; + void operator=(const DBIter&) = delete; + + ~DBIter() override { + // Release pinned data if any + if (pinned_iters_mgr_.PinningEnabled()) { + pinned_iters_mgr_.ReleasePinnedData(); + } + RecordTick(statistics_, NO_ITERATOR_DELETED); + ResetInternalKeysSkippedCounter(); + local_stats_.BumpGlobalStatistics(statistics_); + iter_.DeleteIter(arena_mode_); + } + virtual void SetIter(InternalIterator* iter) { + assert(iter_.iter() == nullptr); + iter_.Set(iter); + iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_); + } + virtual ReadRangeDelAggregator* GetRangeDelAggregator() { + return &range_del_agg_; + } + + bool Valid() const override { return valid_; } + Slice key() const override { + assert(valid_); + if (start_seqnum_ > 0) { + return saved_key_.GetInternalKey(); + } else { + return saved_key_.GetUserKey(); + } + } + Slice value() const override { + assert(valid_); + if (current_entry_is_merged_) { + // If pinned_value_ is set then the result of merge operator is one of + // the merge operands and we should return it. + return pinned_value_.data() ? 
pinned_value_ : saved_value_; + } else if (direction_ == kReverse) { + return pinned_value_; + } else { + return iter_.value(); + } + } + Status status() const override { + if (status_.ok()) { + return iter_.status(); + } else { + assert(!valid_); + return status_; + } + } + bool IsBlob() const { + assert(valid_ && (allow_blob_ || !is_blob_)); + return is_blob_; + } + + Status GetProperty(std::string prop_name, std::string* prop) override; + + void Next() final override; + void Prev() final override; + void Seek(const Slice& target) final override; + void SeekForPrev(const Slice& target) final override; + void SeekToFirst() final override; + void SeekToLast() final override; + Env* env() { return env_; } + void set_sequence(uint64_t s) { + sequence_ = s; + if (read_callback_) { + read_callback_->Refresh(s); + } + } + void set_valid(bool v) { valid_ = v; } + + private: + // For all methods in this block: + // PRE: iter_->Valid() && status_.ok() + // Return false if there was an error; in that case status() is non-ok and + // valid_ is false. Callers would usually stop what they were doing and return. + bool ReverseToForward(); + bool ReverseToBackward(); + // Set saved_key_ to the seek target, with the proper sequence number set. + // It might get adjusted if the seek key is smaller than iterator lower bound. + void SetSavedKeyToSeekTarget(const Slice& /*target*/); + // Set saved_key_ to the seek target, with the proper sequence number set. + // It might get adjusted if the seek key is larger than iterator upper bound. + void SetSavedKeyToSeekForPrevTarget(const Slice& /*target*/); + bool FindValueForCurrentKey(); + bool FindValueForCurrentKeyUsingSeek(); + bool FindUserKeyBeforeSavedKey(); + // If `skipping_saved_key` is true, the function will keep iterating until it + // finds a user key that is larger than `saved_key_`. + // If `prefix` is not null, the iterator needs to stop when all keys for the + // prefix are exhausted and the iterator is set to invalid. + bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix); + // Internal implementation of FindNextUserEntry(). + bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix); + bool ParseKey(ParsedInternalKey* key); + bool MergeValuesNewToOld(); + + // If prefix is not null, we need to set the iterator to invalid if no more + // entries can be found within the prefix. 
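Before the remaining private declarations (PrevInternal() and the other helpers) continue below, note that TooManyInternalKeysSkipped() and the max_skippable_internal_keys_ member declared further down back the ReadOptions::max_skippable_internal_keys knob. A hedged usage sketch follows; the limit value is arbitrary, and the assumption, consistent with these declarations, is that exceeding the limit leaves the iterator invalid with an Incomplete status.

    #include <memory>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    // Sketch: bound how many hidden internal entries (tombstones, superseded
    // versions) a single Seek()/Next() may skip before giving up.
    void BoundedSeek(rocksdb::DB* db, const rocksdb::Slice& target) {
      rocksdb::ReadOptions ro;
      ro.max_skippable_internal_keys = 10000;  // 0 (default) never gives up
      std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
      it->Seek(target);
      if (!it->Valid() && it->status().IsIncomplete()) {
        // The skip budget was exhausted; retry later or fall back to another
        // access path instead of burning CPU on tombstones.
      }
    }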
+ void PrevInternal(const Slice* /*prefix*/); + bool TooManyInternalKeysSkipped(bool increment = true); + bool IsVisible(SequenceNumber sequence); + // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData() + // is called + void TempPinData() { + if (!pin_thru_lifetime_) { + pinned_iters_mgr_.StartPinning(); + } + } + + // Release blocks pinned by TempPinData() + void ReleaseTempPinnedData() { + if (!pin_thru_lifetime_ && pinned_iters_mgr_.PinningEnabled()) { + pinned_iters_mgr_.ReleasePinnedData(); + } + } + + inline void ClearSavedValue() { + if (saved_value_.capacity() > 1048576) { + std::string empty; + swap(empty, saved_value_); + } else { + saved_value_.clear(); + } + } + + inline void ResetInternalKeysSkippedCounter() { + local_stats_.skip_count_ += num_internal_keys_skipped_; + if (valid_) { + local_stats_.skip_count_--; + } + num_internal_keys_skipped_ = 0; + } + + const SliceTransform* prefix_extractor_; + Env* const env_; + Logger* logger_; + UserComparatorWrapper user_comparator_; + const MergeOperator* const merge_operator_; + IteratorWrapper iter_; + ReadCallback* read_callback_; + // Max visible sequence number. It is normally the snapshot seq unless we have + // uncommitted data in db as in WriteUnCommitted. + SequenceNumber sequence_; + + IterKey saved_key_; + // Reusable internal key data structure. This is only used inside one function + // and should not be used across functions. Reusing this object can reduce + // overhead of calling construction of the function if creating it each time. + ParsedInternalKey ikey_; + std::string saved_value_; + Slice pinned_value_; + // for prefix seek mode to support prev() + Statistics* statistics_; + uint64_t max_skip_; + uint64_t max_skippable_internal_keys_; + uint64_t num_internal_keys_skipped_; + const Slice* iterate_lower_bound_; + const Slice* iterate_upper_bound_; + + // The prefix of the seek key. It is only used when prefix_same_as_start_ + // is true and prefix extractor is not null. In Next() or Prev(), current keys + // will be checked against this prefix, so that the iterator can be + // invalidated if the keys in this prefix has been exhausted. Set it using + // SetUserKey() and use it using GetUserKey(). + IterKey prefix_; + + Status status_; + Direction direction_; + bool valid_; + bool current_entry_is_merged_; + // True if we know that the current entry's seqnum is 0. + // This information is used as that the next entry will be for another + // user key. + bool is_key_seqnum_zero_; + const bool prefix_same_as_start_; + // Means that we will pin all data blocks we read as long the Iterator + // is not deleted, will be true if ReadOptions::pin_data is true + const bool pin_thru_lifetime_; + const bool total_order_seek_; + bool allow_blob_; + bool is_blob_; + bool arena_mode_; + // List of operands for merge operator. + MergeContext merge_context_; + ReadRangeDelAggregator range_del_agg_; + LocalStatistics local_stats_; + PinnedIteratorsManager pinned_iters_mgr_; +#ifdef ROCKSDB_LITE + ROCKSDB_FIELD_UNUSED +#endif + DBImpl* db_impl_; +#ifdef ROCKSDB_LITE + ROCKSDB_FIELD_UNUSED +#endif + ColumnFamilyData* cfd_; + // for diff snapshots we want the lower bound on the seqnum; + // if this value > 0 iterator will return internal keys + SequenceNumber start_seqnum_; +}; // Return a new iterator that converts internal keys (yielded by -// "*internal_iter") that were live at the specified "sequence" number +// "*internal_iter") that were live at the specified `sequence` number // into appropriate user keys. 
extern Iterator* NewDBIterator( Env* env, const ReadOptions& read_options, @@ -36,77 +335,4 @@ extern Iterator* NewDBIterator( ReadCallback* read_callback, DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr, bool allow_blob = false); -// A wrapper iterator which wraps DB Iterator and the arena, with which the DB -// iterator is supposed be allocated. This class is used as an entry point of -// a iterator hierarchy whose memory can be allocated inline. In that way, -// accessing the iterator tree can be more cache friendly. It is also faster -// to allocate. -class ArenaWrappedDBIter : public Iterator { - public: - virtual ~ArenaWrappedDBIter(); - - // Get the arena to be used to allocate memory for DBIter to be wrapped, - // as well as child iterators in it. - virtual Arena* GetArena() { return &arena_; } - virtual ReadRangeDelAggregator* GetRangeDelAggregator(); - - // Set the internal iterator wrapped inside the DB Iterator. Usually it is - // a merging iterator. - virtual void SetIterUnderDBIter(InternalIterator* iter); - virtual bool Valid() const override; - virtual void SeekToFirst() override; - virtual void SeekToLast() override; - virtual void Seek(const Slice& target) override; - virtual void SeekForPrev(const Slice& target) override; - virtual void Next() override; - virtual void Prev() override; - virtual Slice key() const override; - virtual Slice value() const override; - virtual Status status() const override; - virtual Status Refresh() override; - bool IsBlob() const; - - virtual Status GetProperty(std::string prop_name, std::string* prop) override; - - void Init(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - bool allow_blob, bool allow_refresh); - - void StoreRefreshInfo(const ReadOptions& read_options, DBImpl* db_impl, - ColumnFamilyData* cfd, ReadCallback* read_callback, - bool allow_blob) { - read_options_ = read_options; - db_impl_ = db_impl; - cfd_ = cfd; - read_callback_ = read_callback; - allow_blob_ = allow_blob; - } - - private: - DBIter* db_iter_; - Arena arena_; - uint64_t sv_number_; - ColumnFamilyData* cfd_ = nullptr; - DBImpl* db_impl_ = nullptr; - ReadOptions read_options_; - ReadCallback* read_callback_; - bool allow_blob_ = false; - bool allow_refresh_ = true; -}; - -// Generate the arena wrapped iterator class. -// `db_impl` and `cfd` are used for reneweal. If left null, renewal will not -// be supported. 
-extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl = nullptr, - ColumnFamilyData* cfd = nullptr, bool allow_blob = false, - bool allow_refresh = true); } // namespace rocksdb diff --git a/db/db_iter_stress_test.cc b/db/db_iter_stress_test.cc index a0f1dfeab45..b864ac4eae1 100644 --- a/db/db_iter_stress_test.cc +++ b/db/db_iter_stress_test.cc @@ -8,9 +8,9 @@ #include "rocksdb/comparator.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "test_util/testharness.h" #include "util/random.h" #include "util/string_util.h" -#include "util/testharness.h" #include "utilities/merge_operators.h" #ifdef GFLAGS diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 29fbd320861..1503886443b 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -17,9 +17,9 @@ #include "rocksdb/statistics.h" #include "table/iterator_wrapper.h" #include "table/merging_iterator.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index ec5fc8006b8..6abe40b276b 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -9,13 +9,14 @@ #include +#include "db/arena_wrapped_db_iter.h" #include "db/db_iter.h" #include "db/db_test_util.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/iostats_context.h" #include "rocksdb/perf_context.h" -#include "table/flush_block_policy.h" +#include "table/block_based/flush_block_policy.h" namespace rocksdb { @@ -182,6 +183,33 @@ TEST_P(DBIteratorTest, IterSeekBeforePrev) { delete iter; } +TEST_P(DBIteratorTest, IterReseekNewUpperBound) { + Random rnd(301); + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + table_options.block_size_deviation = 50; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.compression = kNoCompression; + Reopen(options); + + ASSERT_OK(Put("a", RandomString(&rnd, 400))); + ASSERT_OK(Put("aabb", RandomString(&rnd, 400))); + ASSERT_OK(Put("aaef", RandomString(&rnd, 400))); + ASSERT_OK(Put("b", RandomString(&rnd, 400))); + dbfull()->Flush(FlushOptions()); + ReadOptions opts; + Slice ub = Slice("aa"); + opts.iterate_upper_bound = &ub; + auto iter = NewIterator(opts); + iter->Seek(Slice("a")); + ub = Slice("b"); + iter->Seek(Slice("aabc")); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "aaef"); + delete iter; +} + TEST_P(DBIteratorTest, IterSeekForPrevBeforeNext) { ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); @@ -1049,6 +1077,149 @@ TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) { ASSERT_EQ(upper_bound_hits, 1); } } + +// Enable kBinarySearchWithFirstKey, do some iterator operations and check that +// they don't do unnecessary block reads. 
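The comment above introduces kBinarySearchWithFirstKey, which the test that follows exercises by counting block reads. For reference, a minimal sketch of enabling this index type outside the test harness is shown here; it is not part of the patch, and the surrounding option values are illustrative.

    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    // Sketch: keep each data block's first key in the index so seeks and point
    // lookups can skip blocks that cannot contain the target key.
    rocksdb::Options MakeFirstKeyIndexOptions() {
      rocksdb::BlockBasedTableOptions table_options;
      table_options.index_type =
          rocksdb::BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey;
      rocksdb::Options options;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return options;
    }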
+TEST_P(DBIteratorTest, IndexWithFirstKey) { + for (int tailing = 0; tailing < 2; ++tailing) { + SCOPED_TRACE("tailing = " + std::to_string(tailing)); + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor = nullptr; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.statistics = rocksdb::CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + BlockBasedTableOptions table_options; + table_options.index_type = + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey; + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + table_options.flush_block_policy_factory = + std::make_shared(); + table_options.block_cache = + NewLRUCache(8000); // fits all blocks and their cache metadata overhead + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + ASSERT_OK(Merge("a1", "x1")); + ASSERT_OK(Merge("b1", "y1")); + ASSERT_OK(Merge("c0", "z1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("a2", "x2")); + ASSERT_OK(Merge("b2", "y2")); + ASSERT_OK(Merge("c0", "z2")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("a3", "x3")); + ASSERT_OK(Merge("b3", "y3")); + ASSERT_OK(Merge("c3", "z3")); + ASSERT_OK(Flush()); + + // Block cache is not important for this test. + // We use BLOCK_CACHE_DATA_* counters just because they're the most readily + // available way of counting block accesses. + + ReadOptions ropt; + ropt.tailing = tailing; + std::unique_ptr iter(NewIterator(ropt)); + + iter->Seek("b10"); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("b2", iter->key().ToString()); + EXPECT_EQ("y2", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("b3", iter->key().ToString()); + EXPECT_EQ("y3", iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->Seek("c0"); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("c0", iter->key().ToString()); + EXPECT_EQ("z1,z2", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("c3", iter->key().ToString()); + EXPECT_EQ("z3", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter.reset(); + + // Enable iterate_upper_bound and check that iterator is not trying to read + // blocks that are fully above upper bound. 
+ std::string ub = "b3"; + Slice ub_slice(ub); + ropt.iterate_upper_bound = &ub_slice; + iter.reset(NewIterator(ropt)); + + iter->Seek("b2"); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("b2", iter->key().ToString()); + EXPECT_EQ("y2", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + } +} + +TEST_P(DBIteratorTest, IndexWithFirstKeyGet) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor = nullptr; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.statistics = rocksdb::CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + BlockBasedTableOptions table_options; + table_options.index_type = + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey; + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + table_options.flush_block_policy_factory = + std::make_shared(); + table_options.block_cache = NewLRUCache(1000); // fits all blocks + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + ASSERT_OK(Merge("a", "x1")); + ASSERT_OK(Merge("c", "y1")); + ASSERT_OK(Merge("e", "z1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("c", "y2")); + ASSERT_OK(Merge("e", "z2")); + ASSERT_OK(Flush()); + + // Get() between blocks shouldn't read any blocks. + ASSERT_EQ("NOT_FOUND", Get("b")); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Get() of an existing key shouldn't read any unnecessary blocks when there's + // only one key per block. + + ASSERT_EQ("y1,y2", Get("c")); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + ASSERT_EQ("x1", Get("a")); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + EXPECT_EQ(std::vector({"NOT_FOUND", "z1,z2"}), + MultiGet({"b", "e"})); +} + // TODO(3.13): fix the issue of Seek() + Prev() which might not necessary // return the biggest key which is smaller than the seek key. 
TEST_P(DBIteratorTest, PrevAfterAndNextAfterMerge) { @@ -2450,6 +2621,192 @@ TEST_P(DBIteratorTest, SeekBackwardAfterOutOfUpperBound) { ASSERT_EQ("a", it->key().ToString()); } +TEST_P(DBIteratorTest, AvoidReseekLevelIterator) { + Options options = CurrentOptions(); + options.compression = CompressionType::kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 800; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + + Random rnd(301); + std::string random_str = RandomString(&rnd, 180); + + ASSERT_OK(Put("1", random_str)); + ASSERT_OK(Put("2", random_str)); + ASSERT_OK(Put("3", random_str)); + ASSERT_OK(Put("4", random_str)); + // A new block + ASSERT_OK(Put("5", random_str)); + ASSERT_OK(Put("6", random_str)); + ASSERT_OK(Put("7", random_str)); + ASSERT_OK(Flush()); + ASSERT_OK(Put("8", random_str)); + ASSERT_OK(Put("9", random_str)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + int num_find_file_in_level = 0; + int num_idx_blk_seek = 0; + SyncPoint::GetInstance()->SetCallBack( + "LevelIterator::Seek:BeforeFindFile", + [&](void* /*arg*/) { num_find_file_in_level++; }); + SyncPoint::GetInstance()->SetCallBack( + "IndexBlockIter::Seek:0", [&](void* /*arg*/) { num_idx_blk_seek++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + { + std::unique_ptr iter(NewIterator(ReadOptions())); + iter->Seek("1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(1, num_idx_blk_seek); + + iter->Seek("2"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(1, num_idx_blk_seek); + + iter->Seek("3"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(1, num_idx_blk_seek); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(1, num_idx_blk_seek); + + iter->Seek("5"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(2, num_idx_blk_seek); + + iter->Seek("6"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(2, num_idx_blk_seek); + + iter->Seek("7"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(3, num_idx_blk_seek); + + iter->Seek("8"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(2, num_find_file_in_level); + // Still re-seek because "8" is the boundary key, which has + // the same user key as the seek key. + ASSERT_EQ(4, num_idx_blk_seek); + + iter->Seek("5"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(3, num_find_file_in_level); + ASSERT_EQ(5, num_idx_blk_seek); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(3, num_find_file_in_level); + ASSERT_EQ(5, num_idx_blk_seek); + + // Seek backward never triggers the index block seek to be skipped + iter->Seek("5"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(3, num_find_file_in_level); + ASSERT_EQ(6, num_idx_blk_seek); + } + + SyncPoint::GetInstance()->DisableProcessing(); +} + +// MyRocks may change iterate bounds before seek. Simply test to make sure such +// usage doesn't break iterator. 
+TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) { + Options options = CurrentOptions(); + options.compression = CompressionType::kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 100; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + std::string value(50, 'v'); + Reopen(options); + ASSERT_OK(Put("aaa", value)); + ASSERT_OK(Flush()); + ASSERT_OK(Put("bbb", "v")); + ASSERT_OK(Put("ccc", "v")); + ASSERT_OK(Put("ddd", "v")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("eee", "v")); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + std::string ub1 = "e"; + std::string ub2 = "c"; + Slice ub(ub1); + ReadOptions read_opts1; + read_opts1.iterate_upper_bound = &ub; + Iterator* iter = NewIterator(read_opts1); + // Seek and iterate accross block boundary. + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bbb", iter->key()); + ub = Slice(ub2); + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bbb", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + delete iter; + + std::string lb1 = "a"; + std::string lb2 = "c"; + Slice lb(lb1); + ReadOptions read_opts2; + read_opts2.iterate_lower_bound = &lb; + iter = NewIterator(read_opts2); + iter->SeekForPrev("d"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("ccc", iter->key()); + lb = Slice(lb2); + iter->SeekForPrev("d"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("ccc", iter->key()); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + delete iter; +} + +TEST_P(DBIteratorTest, IterateWithLowerBoundAcrossFileBoundary) { + ASSERT_OK(Put("aaa", "v")); + ASSERT_OK(Put("bbb", "v")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("ccc", "v")); + ASSERT_OK(Put("ddd", "v")); + ASSERT_OK(Flush()); + // Move both files to bottom level. + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + Slice lower_bound("b"); + ReadOptions read_opts; + read_opts.iterate_lower_bound = &lower_bound; + std::unique_ptr iter(NewIterator(read_opts)); + iter->SeekForPrev("d"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("ccc", iter->key()); + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bbb", iter->key()); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); +} + INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest, testing::Values(true, false)); diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc index 294d0f581bc..d9ad649e736 100644 --- a/db/db_memtable_test.cc +++ b/db/db_memtable_test.cc @@ -204,6 +204,76 @@ TEST_F(DBMemTableTest, DuplicateSeq) { delete mem; } +// A simple test to verify that the concurrent merge writes is functional +TEST_F(DBMemTableTest, ConcurrentMergeWrite) { + int num_ops = 1000; + std::string value; + Status s; + MergeContext merge_context; + Options options; + // A merge operator that is not sensitive to concurrent writes since in this + // test we don't order the writes. 
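The ConcurrentMergeWrite test being set up above drives MemTable::Add() from two threads directly. At the public API level the equivalent configuration is concurrent memtable writes combined with an order-insensitive (associative) merge operator; a hedged sketch is below. It is not part of the patch and assumes the default skip-list memtable, which is the representation that supports concurrent writes. The test setup resumes immediately after with exactly this uint64-add operator.

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "utilities/merge_operators.h"

    // Sketch: allow several writer threads to issue Merge() concurrently.
    rocksdb::Options MakeConcurrentMergeOptions() {
      rocksdb::Options options;
      options.create_if_missing = true;
      options.allow_concurrent_memtable_write = true;  // skip-list memtable only
      // A commutative/associative operator (uint64 add) produces the same
      // result regardless of the order concurrent merges reach the memtable.
      options.merge_operator = rocksdb::MergeOperators::CreateUInt64AddOperator();
      return options;
    }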
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + + // Create a MemTable + InternalKeyComparator cmp(BytewiseComparator()); + auto factory = std::make_shared(); + options.memtable_factory = factory; + options.allow_concurrent_memtable_write = true; + ImmutableCFOptions ioptions(options); + WriteBufferManager wb(options.db_write_buffer_size); + MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, + kMaxSequenceNumber, 0 /* column_family_id */); + + // Put 0 as the base + PutFixed64(&value, static_cast(0)); + bool res = mem->Add(0, kTypeValue, "key", value); + ASSERT_TRUE(res); + value.clear(); + + // Write Merge concurrently + rocksdb::port::Thread write_thread1([&]() { + MemTablePostProcessInfo post_process_info1; + std::string v1; + for (int seq = 1; seq < num_ops / 2; seq++) { + PutFixed64(&v1, seq); + bool res1 = + mem->Add(seq, kTypeMerge, "key", v1, true, &post_process_info1); + ASSERT_TRUE(res1); + v1.clear(); + } + }); + rocksdb::port::Thread write_thread2([&]() { + MemTablePostProcessInfo post_process_info2; + std::string v2; + for (int seq = num_ops / 2; seq < num_ops; seq++) { + PutFixed64(&v2, seq); + bool res2 = + mem->Add(seq, kTypeMerge, "key", v2, true, &post_process_info2); + ASSERT_TRUE(res2); + v2.clear(); + } + }); + write_thread1.join(); + write_thread2.join(); + + Status status; + ReadOptions roptions; + SequenceNumber max_covering_tombstone_seq = 0; + LookupKey lkey("key", kMaxSequenceNumber); + res = mem->Get(lkey, &value, &status, &merge_context, + &max_covering_tombstone_seq, roptions); + ASSERT_TRUE(res); + uint64_t ivalue = DecodeFixed64(Slice(value).data()); + uint64_t sum = 0; + for (int seq = 0; seq < num_ops; seq++) { + sum += seq; + } + ASSERT_EQ(ivalue, sum); + + delete mem; +} + TEST_F(DBMemTableTest, InsertWithHint) { Options options; options.allow_concurrent_memtable_write = false; @@ -252,7 +322,7 @@ TEST_F(DBMemTableTest, ColumnFamilyId) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - for (int cf = 0; cf < 2; ++cf) { + for (uint32_t cf = 0; cf < 2; ++cf) { ASSERT_OK(Put(cf, "key", "val")); ASSERT_OK(Flush(cf)); ASSERT_EQ( diff --git a/db/db_merge_operand_test.cc b/db/db_merge_operand_test.cc new file mode 100644 index 00000000000..e6280ad8c79 --- /dev/null +++ b/db/db_merge_operand_test.cc @@ -0,0 +1,240 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
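The new db_merge_operand_test.cc that begins above exercises DB::GetMergeOperands(), which hands back the individual merge operands for a key instead of the fully merged value. A condensed sketch of the call pattern used throughout the test follows; the fixed buffer size is an assumption of this sketch, and a real caller could instead retry with a larger buffer when the call reports Incomplete, as the test checks.

    #include <vector>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    // Sketch: fetch the raw merge operands for `key` rather than merging them.
    rocksdb::Status FetchOperands(rocksdb::DB* db, const rocksdb::Slice& key) {
      constexpr int kMaxOperands = 16;  // assumed upper bound for this sketch
      std::vector<rocksdb::PinnableSlice> operands(kMaxOperands);
      rocksdb::GetMergeOperandsOptions info;
      info.expected_max_number_of_operands = kMaxOperands;
      int count = 0;
      rocksdb::Status s = db->GetMergeOperands(
          rocksdb::ReadOptions(), db->DefaultColumnFamily(), key,
          operands.data(), &info, &count);
      if (s.IsIncomplete()) {
        // More than kMaxOperands operands exist; grow the buffer and call again.
        return s;
      }
      for (int i = 0; s.ok() && i < count; ++i) {
        // operands[i] holds one unmerged operand for `key`.
      }
      return s;
    }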
+ +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/utilities/debug.h" +#include "table/block_based/block_builder.h" +#include "test_util/fault_injection_test_env.h" +#if !defined(ROCKSDB_LITE) +#include "test_util/sync_point.h" +#endif +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/sortlist.h" +#include "utilities/merge_operators/string_append/stringappend2.h" + +namespace rocksdb { + +class DBMergeOperandTest : public DBTestBase { + public: + DBMergeOperandTest() : DBTestBase("/db_merge_operand_test") {} +}; + +TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { + class LimitedStringAppendMergeOp : public StringAppendTESTOperator { + public: + LimitedStringAppendMergeOp(int limit, char delim) + : StringAppendTESTOperator(delim), limit_(limit) {} + + const char* Name() const override { + return "DBMergeOperatorTest::LimitedStringAppendMergeOp"; + } + + bool ShouldMerge(const std::vector& operands) const override { + if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) { + return true; + } + return false; + } + + private: + size_t limit_ = 0; + }; + + Options options; + options.create_if_missing = true; + // Use only the latest two merge operands. + options.merge_operator = std::make_shared(2, ','); + options.env = env_; + Reopen(options); + int num_records = 4; + int number_of_operands = 0; + std::vector values(num_records); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = num_records; + + // k0 value in memtable + Put("k0", "PutARock"); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "PutARock"); + + // k0.1 value in SST + Put("k0.1", "RockInSST"); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "RockInSST"); + + // All k1 values are in memtable. + ASSERT_OK(Merge("k1", "a")); + Put("k1", "x"); + ASSERT_OK(Merge("k1", "b")); + ASSERT_OK(Merge("k1", "c")); + ASSERT_OK(Merge("k1", "d")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "x"); + ASSERT_EQ(values[1], "b"); + ASSERT_EQ(values[2], "c"); + ASSERT_EQ(values[3], "d"); + + // expected_max_number_of_operands is less than number of merge operands so + // status should be Incomplete. + merge_operands_info.expected_max_number_of_operands = num_records - 1; + Status status = db_->GetMergeOperands( + ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), + &merge_operands_info, &number_of_operands); + ASSERT_EQ(status.IsIncomplete(), true); + merge_operands_info.expected_max_number_of_operands = num_records; + + // All k1.1 values are in memtable. + ASSERT_OK(Merge("k1.1", "r")); + Delete("k1.1"); + ASSERT_OK(Merge("k1.1", "c")); + ASSERT_OK(Merge("k1.1", "k")); + ASSERT_OK(Merge("k1.1", "s")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "c"); + ASSERT_EQ(values[1], "k"); + ASSERT_EQ(values[2], "s"); + + // All k2 values are flushed to L0 into a single file. 
+ ASSERT_OK(Merge("k2", "q")); + ASSERT_OK(Merge("k2", "w")); + ASSERT_OK(Merge("k2", "e")); + ASSERT_OK(Merge("k2", "r")); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "q"); + ASSERT_EQ(values[1], "w"); + ASSERT_EQ(values[2], "e"); + ASSERT_EQ(values[3], "r"); + + // All k2.1 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2.1", "m")); + Put("k2.1", "l"); + ASSERT_OK(Merge("k2.1", "n")); + ASSERT_OK(Merge("k2.1", "o")); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "l,n,o"); + + // All k2.2 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2.2", "g")); + Delete("k2.2"); + ASSERT_OK(Merge("k2.2", "o")); + ASSERT_OK(Merge("k2.2", "t")); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.2", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "o,t"); + + // Do some compaction that will make the following tests more predictable + // Slice start("PutARock"); + // Slice end("t"); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + // All k3 values are flushed and are in different files. + ASSERT_OK(Merge("k3", "ab")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "bc")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "de")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "ab"); + ASSERT_EQ(values[1], "bc"); + ASSERT_EQ(values[2], "cd"); + ASSERT_EQ(values[3], "de"); + + // All k3.1 values are flushed and are in different files. + ASSERT_OK(Merge("k3.1", "ab")); + ASSERT_OK(Flush()); + Put("k3.1", "bc"); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.1", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.1", "de")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "bc"); + ASSERT_EQ(values[1], "cd"); + ASSERT_EQ(values[2], "de"); + + // All k3.2 values are flushed and are in different files. 
+ ASSERT_OK(Merge("k3.2", "ab")); + ASSERT_OK(Flush()); + Delete("k3.2"); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.2", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.2", "de")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.2", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "cd"); + ASSERT_EQ(values[1], "de"); + + // All K4 values are in different levels + ASSERT_OK(Merge("k4", "ba")); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + ASSERT_OK(Merge("k4", "cb")); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + ASSERT_OK(Merge("k4", "dc")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Merge("k4", "ed")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k4", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "ba"); + ASSERT_EQ(values[1], "cb"); + ASSERT_EQ(values[2], "dc"); + ASSERT_EQ(values[3], "ed"); + + // First 3 k5 values are in SST and next 4 k5 values are in Immutable Memtable + ASSERT_OK(Merge("k5", "who")); + ASSERT_OK(Merge("k5", "am")); + ASSERT_OK(Merge("k5", "i")); + ASSERT_OK(Flush()); + Put("k5", "remember"); + ASSERT_OK(Merge("k5", "i")); + ASSERT_OK(Merge("k5", "am")); + ASSERT_OK(Merge("k5", "rocks")); + dbfull()->TEST_SwitchMemtable(); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k5", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "remember"); + ASSERT_EQ(values[1], "i"); + ASSERT_EQ(values[2], "am"); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_merge_operator_test.cc b/db/db_merge_operator_test.cc index 2b5e4a445ea..8358ddb56c2 100644 --- a/db/db_merge_operator_test.cc +++ b/db/db_merge_operator_test.cc @@ -46,9 +46,11 @@ class DBMergeOperatorTest : public DBTestBase { ReadOptions read_opt; read_opt.snapshot = snapshot; PinnableSlice value; - Status s = - dbfull()->GetImpl(read_opt, db_->DefaultColumnFamily(), key, &value, - nullptr /*value_found*/, &read_callback); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = db_->DefaultColumnFamily(); + get_impl_options.value = &value; + get_impl_options.callback = &read_callback; + Status s = dbfull()->GetImpl(read_opt, key, get_impl_options); if (!s.ok()) { return s.ToString(); } @@ -275,68 +277,6 @@ TEST_P(MergeOperatorPinningTest, OperandsMultiBlocks) { VerifyDBFromMap(true_data); } -TEST_P(MergeOperatorPinningTest, Randomized) { - do { - Options options = CurrentOptions(); - options.merge_operator = MergeOperators::CreateMaxOperator(); - BlockBasedTableOptions table_options; - table_options.no_block_cache = disable_block_cache_; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - DestroyAndReopen(options); - - Random rnd(301); - std::map true_data; - - const int kTotalMerges = 5000; - // Every key gets ~10 operands - const int kKeyRange = kTotalMerges / 10; - const int kOperandSize = 20; - const int kNumPutBefore = kKeyRange / 10; // 10% value - const int kNumPutAfter = kKeyRange / 10; // 10% overwrite - const int kNumDelete = kKeyRange / 10; // 10% delete - - // kNumPutBefore keys will have base values - for (int i = 0; i < kNumPutBefore; i++) { - std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); - ASSERT_OK(db_->Put(WriteOptions(), key, value)); - - true_data[key] = value; - 
} - - // Do kTotalMerges merges - for (int i = 0; i < kTotalMerges; i++) { - std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); - ASSERT_OK(db_->Merge(WriteOptions(), key, value)); - - if (true_data[key] < value) { - true_data[key] = value; - } - } - - // Overwrite random kNumPutAfter keys - for (int i = 0; i < kNumPutAfter; i++) { - std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); - ASSERT_OK(db_->Put(WriteOptions(), key, value)); - - true_data[key] = value; - } - - // Delete random kNumDelete keys - for (int i = 0; i < kNumDelete; i++) { - std::string key = Key(rnd.Next() % kKeyRange); - ASSERT_OK(db_->Delete(WriteOptions(), key)); - - true_data.erase(key); - } - - VerifyDBFromMap(true_data); - - } while (ChangeOptions(kSkipMergePut)); -} - class MergeOperatorHook : public MergeOperator { public: explicit MergeOperatorHook(std::shared_ptr _merge_op) @@ -637,6 +577,86 @@ TEST_F(DBMergeOperatorTest, SnapshotCheckerAndReadCallback) { db_->ReleaseSnapshot(snapshot2); } +class PerConfigMergeOperatorPinningTest + : public DBMergeOperatorTest, + public testing::WithParamInterface> { + public: + PerConfigMergeOperatorPinningTest() { + std::tie(disable_block_cache_, option_config_) = GetParam(); + } + + bool disable_block_cache_; +}; + +INSTANTIATE_TEST_CASE_P( + MergeOperatorPinningTest, PerConfigMergeOperatorPinningTest, + ::testing::Combine(::testing::Bool(), + ::testing::Range(static_cast(DBTestBase::kDefault), + static_cast(DBTestBase::kEnd)))); + +TEST_P(PerConfigMergeOperatorPinningTest, Randomized) { + if (ShouldSkipOptions(option_config_, kSkipMergePut)) { + return; + } + + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateMaxOperator(); + BlockBasedTableOptions table_options; + table_options.no_block_cache = disable_block_cache_; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + Random rnd(301); + std::map true_data; + + const int kTotalMerges = 5000; + // Every key gets ~10 operands + const int kKeyRange = kTotalMerges / 10; + const int kOperandSize = 20; + const int kNumPutBefore = kKeyRange / 10; // 10% value + const int kNumPutAfter = kKeyRange / 10; // 10% overwrite + const int kNumDelete = kKeyRange / 10; // 10% delete + + // kNumPutBefore keys will have base values + for (int i = 0; i < kNumPutBefore; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + + true_data[key] = value; + } + + // Do kTotalMerges merges + for (int i = 0; i < kTotalMerges; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Merge(WriteOptions(), key, value)); + + if (true_data[key] < value) { + true_data[key] = value; + } + } + + // Overwrite random kNumPutAfter keys + for (int i = 0; i < kNumPutAfter; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + + true_data[key] = value; + } + + // Delete random kNumDelete keys + for (int i = 0; i < kNumDelete; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + ASSERT_OK(db_->Delete(WriteOptions(), key)); + + true_data.erase(key); + } + + VerifyDBFromMap(true_data); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/db_options_test.cc 
b/db/db_options_test.cc index a7ecf12744b..cb031e62eb4 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -11,7 +11,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "options/options_helper.h" #include "port/stack_trace.h" @@ -19,14 +19,12 @@ #include "rocksdb/convenience.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/stats_history.h" +#include "test_util/sync_point.h" +#include "test_util/testutil.h" #include "util/random.h" -#include "util/sync_point.h" -#include "util/testutil.h" namespace rocksdb { -const int kMicrosInSec = 1000000; - class DBOptionsTest : public DBTestBase { public: DBOptionsTest() : DBTestBase("/db_options_test") {} @@ -66,10 +64,10 @@ class DBOptionsTest : public DBTestBase { std::unordered_map GetRandomizedMutableCFOptionsMap( Random* rnd) { - Options options; + Options options = CurrentOptions(); options.env = env_; ImmutableDBOptions db_options(options); - test::RandomInitCFOptions(&options, rnd); + test::RandomInitCFOptions(&options, options, rnd); auto sanitized_options = SanitizeOptions(db_options, options); auto opt_map = GetMutableCFOptionsMap(sanitized_options); delete options.compaction_filter; @@ -507,10 +505,10 @@ TEST_F(DBOptionsTest, SetStatsDumpPeriodSec) { options.stats_dump_period_sec = 5; options.env = env_; Reopen(options); - ASSERT_EQ(5, dbfull()->GetDBOptions().stats_dump_period_sec); + ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec); for (int i = 0; i < 20; i++) { - int num = rand() % 5000 + 1; + unsigned int num = rand() % 5000 + 1; ASSERT_OK( dbfull()->SetDBOptions({{"stats_dump_period_sec", ToString(num)}})); ASSERT_EQ(num, dbfull()->GetDBOptions().stats_dump_period_sec); @@ -518,283 +516,18 @@ TEST_F(DBOptionsTest, SetStatsDumpPeriodSec) { Close(); } -TEST_F(DBOptionsTest, RunStatsDumpPeriodSec) { - Options options; - options.create_if_missing = true; - options.stats_dump_period_sec = 5; - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); - int counter = 0; - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::DumpStats:1", [&](void* /*arg*/) { - counter++; - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Reopen(options); - ASSERT_EQ(5, dbfull()->GetDBOptions().stats_dump_period_sec); - dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(5); }); - ASSERT_GE(counter, 1); - - // Test cacel job through SetOptions - ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "0"}})); - int old_val = counter; - for (int i = 6; i < 20; ++i) { - dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(i); }); - } - ASSERT_EQ(counter, old_val); - Close(); -} - -// Test persistent stats background thread scheduling and cancelling -TEST_F(DBOptionsTest, StatsPersistScheduling) { - Options options; - options.create_if_missing = true; - options.stats_persist_period_sec = 5; - std::unique_ptr mock_env; - 
mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG - int counter = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Reopen(options); - ASSERT_EQ(5, dbfull()->GetDBOptions().stats_persist_period_sec); - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); - ASSERT_GE(counter, 1); - - // Test cacel job through SetOptions - ASSERT_TRUE(dbfull()->TEST_IsPersistentStatsEnabled()); - ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); - ASSERT_FALSE(dbfull()->TEST_IsPersistentStatsEnabled()); - Close(); -} - -// Test enabling persistent stats for the first time -TEST_F(DBOptionsTest, PersistentStatsFreshInstall) { - Options options; - options.create_if_missing = true; - options.stats_persist_period_sec = 0; - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG - int counter = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Reopen(options); - ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "5"}})); - ASSERT_EQ(5, dbfull()->GetDBOptions().stats_persist_period_sec); - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); - ASSERT_GE(counter, 1); - Close(); -} - TEST_F(DBOptionsTest, SetOptionsStatsPersistPeriodSec) { Options options; options.create_if_missing = true; options.stats_persist_period_sec = 5; options.env = env_; Reopen(options); - ASSERT_EQ(5, dbfull()->GetDBOptions().stats_persist_period_sec); + ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_persist_period_sec); ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "12345"}})); - ASSERT_EQ(12345, dbfull()->GetDBOptions().stats_persist_period_sec); + ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec); ASSERT_NOK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "abcde"}})); - ASSERT_EQ(12345, dbfull()->GetDBOptions().stats_persist_period_sec); -} - -TEST_F(DBOptionsTest, GetStatsHistory) { - Options options; - options.create_if_missing = true; - options.stats_persist_period_sec = 5; - options.statistics = rocksdb::CreateDBStatistics(); - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = 
mock_env.get(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); -#endif // OS_MACOSX && !NDEBUG - - CreateColumnFamilies({"pikachu"}, options); - ASSERT_OK(Put("foo", "bar")); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - - int mock_time = 1; - // Wait for stats persist to finish - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); - std::unique_ptr stats_iter; - db_->GetStatsHistory(0, 6 * kMicrosInSec, &stats_iter); - ASSERT_TRUE(stats_iter != nullptr); - // disabled stats snapshots - ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); - size_t stats_count = 0; - for (; stats_iter->Valid(); stats_iter->Next()) { - auto stats_map = stats_iter->GetStatsMap(); - stats_count += stats_map.size(); - } - ASSERT_GT(stats_count, 0); - // Wait a bit and verify no more stats are found - for (mock_time = 6; mock_time < 20; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } - db_->GetStatsHistory(0, 20 * kMicrosInSec, &stats_iter); - ASSERT_TRUE(stats_iter != nullptr); - size_t stats_count_new = 0; - for (; stats_iter->Valid(); stats_iter->Next()) { - stats_count_new += stats_iter->GetStatsMap().size(); - } - ASSERT_EQ(stats_count_new, stats_count); - Close(); -} - -TEST_F(DBOptionsTest, InMemoryStatsHistoryPurging) { - Options options; - options.create_if_missing = true; - options.statistics = rocksdb::CreateDBStatistics(); - options.stats_persist_period_sec = 1; - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); -#endif // OS_MACOSX && !NDEBUG - - CreateColumnFamilies({"pikachu"}, options); - ASSERT_OK(Put("foo", "bar")); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - // some random operation to populate statistics - ASSERT_OK(Delete("foo")); - ASSERT_OK(Put("sol", "sol")); - ASSERT_OK(Put("epic", "epic")); - ASSERT_OK(Put("ltd", "ltd")); - ASSERT_EQ("sol", Get("sol")); - ASSERT_EQ("epic", Get("epic")); - ASSERT_EQ("ltd", Get("ltd")); - Iterator* iterator = db_->NewIterator(ReadOptions()); - for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { - ASSERT_TRUE(iterator->key() == iterator->value()); - } - delete iterator; - ASSERT_OK(Flush()); - ASSERT_OK(Delete("sol")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - int mock_time = 1; - // Wait for stats persist to finish - for (; mock_time < 5; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } - - // 
second round of ops - ASSERT_OK(Put("saigon", "saigon")); - ASSERT_OK(Put("noodle talk", "noodle talk")); - ASSERT_OK(Put("ping bistro", "ping bistro")); - iterator = db_->NewIterator(ReadOptions()); - for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { - ASSERT_TRUE(iterator->key() == iterator->value()); - } - delete iterator; - ASSERT_OK(Flush()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - for (; mock_time < 10; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } - std::unique_ptr stats_iter; - db_->GetStatsHistory(0, 10 * kMicrosInSec, &stats_iter); - ASSERT_TRUE(stats_iter != nullptr); - size_t stats_count = 0; - int slice_count = 0; - for (; stats_iter->Valid(); stats_iter->Next()) { - slice_count++; - auto stats_map = stats_iter->GetStatsMap(); - stats_count += stats_map.size(); - } - size_t stats_history_size = dbfull()->TEST_EstiamteStatsHistorySize(); - ASSERT_GE(slice_count, 9); - ASSERT_GE(stats_history_size, 12000); - // capping memory cost at 12000 bytes since one slice is around 10000~12000 - ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "12000"}})); - ASSERT_EQ(12000, dbfull()->GetDBOptions().stats_history_buffer_size); - // Wait for stats persist to finish - for (; mock_time < 20; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } - db_->GetStatsHistory(0, 20 * kMicrosInSec, &stats_iter); - ASSERT_TRUE(stats_iter != nullptr); - size_t stats_count_reopen = 0; - slice_count = 0; - for (; stats_iter->Valid(); stats_iter->Next()) { - slice_count++; - auto stats_map = stats_iter->GetStatsMap(); - stats_count_reopen += stats_map.size(); - } - size_t stats_history_size_reopen = dbfull()->TEST_EstiamteStatsHistorySize(); - // only one slice can fit under the new stats_history_buffer_size - ASSERT_LT(slice_count, 2); - ASSERT_TRUE(stats_history_size_reopen < 12000 && - stats_history_size_reopen > 0); - ASSERT_TRUE(stats_count_reopen < stats_count && stats_count_reopen > 0); - Close(); + ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec); } static void assert_candidate_files_empty(DBImpl* dbfull, const bool empty) { @@ -873,6 +606,76 @@ TEST_F(DBOptionsTest, SanitizeDelayedWriteRate) { ASSERT_EQ(31 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate); } +TEST_F(DBOptionsTest, SanitizeUniversalTTLCompaction) { + Options options; + options.compaction_style = kCompactionStyleUniversal; + + options.ttl = 0; + options.periodic_compaction_seconds = 0; + Reopen(options); + ASSERT_EQ(0, dbfull()->GetOptions().ttl); + ASSERT_EQ(0, dbfull()->GetOptions().periodic_compaction_seconds); + + options.ttl = 0; + options.periodic_compaction_seconds = 100; + Reopen(options); + ASSERT_EQ(0, dbfull()->GetOptions().ttl); + ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds); + + options.ttl = 100; + options.periodic_compaction_seconds = 0; + Reopen(options); + ASSERT_EQ(100, dbfull()->GetOptions().ttl); + ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds); + + options.ttl = 100; + options.periodic_compaction_seconds = 500; + Reopen(options); + ASSERT_EQ(100, dbfull()->GetOptions().ttl); + ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds); +} + +TEST_F(DBOptionsTest, SanitizeTtlDefault) { + Options options; + Reopen(options); + ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); + + options.compaction_style = kCompactionStyleLevel; + options.ttl = 0; + 
Reopen(options); + ASSERT_EQ(0, dbfull()->GetOptions().ttl); + + options.ttl = 100; + Reopen(options); + ASSERT_EQ(100, dbfull()->GetOptions().ttl); +} + +TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) { + Options options; + options.compaction_style = kCompactionStyleFIFO; + options.ttl = 0; + Reopen(options); + ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); + + options.ttl = 100; + Reopen(options); + ASSERT_EQ(100, dbfull()->GetOptions().ttl); + + options.ttl = 100 * 24 * 60 * 60; + Reopen(options); + ASSERT_EQ(100 * 24 * 60 * 60, dbfull()->GetOptions().ttl); + + options.ttl = 200; + options.periodic_compaction_seconds = 300; + Reopen(options); + ASSERT_EQ(200, dbfull()->GetOptions().ttl); + + options.ttl = 500; + options.periodic_compaction_seconds = 300; + Reopen(options); + ASSERT_EQ(300, dbfull()->GetOptions().ttl); +} + TEST_F(DBOptionsTest, SetFIFOCompactionOptions) { Options options; options.compaction_style = kCompactionStyleFIFO; @@ -1007,6 +810,53 @@ TEST_F(DBOptionsTest, CompactionReadaheadSizeChange) { ASSERT_EQ(256, env_->compaction_readahead_size_); Close(); } + +TEST_F(DBOptionsTest, FIFOTtlBackwardCompatible) { + Options options; + options.compaction_style = kCompactionStyleFIFO; + options.write_buffer_size = 10 << 10; // 10KB + options.create_if_missing = true; + + ASSERT_OK(TryReopen(options)); + + Random rnd(301); + for (int i = 0; i < 10; i++) { + // Generate and flush a file about 10KB. + for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + } + Flush(); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // In release 6.0, ttl was promoted from a secondary level option under + // compaction_options_fifo to a top level option under ColumnFamilyOptions. + // We still need to handle old SetOptions calls but should ignore + // ttl under compaction_options_fifo. + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", + "{allow_compaction=true;max_table_files_size=1024;ttl=731;}"}, + {"ttl", "60"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + true); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 1024); + ASSERT_EQ(dbfull()->GetOptions().ttl, 60); + + // Put ttl as the first option inside compaction_options_fifo. That works as + // it doesn't overwrite any other option. + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", + "{ttl=985;allow_compaction=true;max_table_files_size=1024;}"}, + {"ttl", "191"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + true); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 1024); + ASSERT_EQ(dbfull()->GetOptions().ttl, 191); +} + #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index 1a988f5ea4c..57206f6edb2 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -210,12 +210,11 @@ void VerifySimilar(uint64_t a, uint64_t b, double bias) { } } -void VerifyTableProperties(const TableProperties& base_tp, - const TableProperties& new_tp, - double filter_size_bias = 0.1, - double index_size_bias = 0.1, - double data_size_bias = 0.1, - double num_data_blocks_bias = 0.05) { +void VerifyTableProperties( + const TableProperties& base_tp, const TableProperties& new_tp, + double filter_size_bias = CACHE_LINE_SIZE >= 256 ? 
0.15 : 0.1, + double index_size_bias = 0.1, double data_size_bias = 0.1, + double num_data_blocks_bias = 0.05) { VerifySimilar(base_tp.data_size, new_tp.data_size, data_size_bias); VerifySimilar(base_tp.index_size, new_tp.index_size, index_size_bias); VerifySimilar(base_tp.filter_size, new_tp.filter_size, filter_size_bias); @@ -266,7 +265,8 @@ void GetExpectedTableProperties( // discount 1 byte as value size is not encoded in value delta encoding (value_delta_encoding ? 1 : 0)); expected_tp->filter_size = - kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8); + kTableCount * ((kKeysPerTable * kBloomBitsPerKey + 7) / 8 + + /*average-ish overhead*/ CACHE_LINE_SIZE / 2); } } // anonymous namespace @@ -615,8 +615,9 @@ TEST_F(DBPropertiesTest, NumImmutableMemTable) { writeOpt.disableWAL = true; options.max_write_buffer_number = 4; options.min_write_buffer_number_to_merge = 3; - options.max_write_buffer_number_to_maintain = 4; options.write_buffer_size = 1000000; + options.max_write_buffer_size_to_maintain = + 5 * static_cast(options.write_buffer_size); CreateAndReopenWithCF({"pikachu"}, options); std::string big_value(1000000 * 2, 'x'); @@ -747,7 +748,7 @@ TEST_F(DBPropertiesTest, DISABLED_GetProperty) { options.max_background_flushes = 1; options.max_write_buffer_number = 10; options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 0; + options.max_write_buffer_size_to_maintain = 0; options.write_buffer_size = 1000000; Reopen(options); @@ -997,7 +998,7 @@ TEST_F(DBPropertiesTest, EstimatePendingCompBytes) { options.max_background_flushes = 1; options.max_write_buffer_number = 10; options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 0; + options.max_write_buffer_size_to_maintain = 0; options.write_buffer_size = 1000000; Reopen(options); @@ -1630,7 +1631,11 @@ TEST_F(DBPropertiesTest, BlockCacheProperties) { // Test with empty block cache. constexpr size_t kCapacity = 100; - auto block_cache = NewLRUCache(kCapacity, 0 /*num_shard_bits*/); + LRUCacheOptions co; + co.capacity = kCapacity; + co.num_shard_bits = 0; + co.metadata_charge_policy = kDontChargeCacheMetadata; + auto block_cache = NewLRUCache(co); table_options.block_cache = block_cache; table_options.no_block_cache = false; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index aa63286f60a..ec448b731e0 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -5,7 +5,7 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" -#include "util/testutil.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -491,6 +491,30 @@ TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) { ASSERT_EQ(expected, actual); } +TEST_F(DBRangeDelTest, PutDeleteRangeMergeFlush) { + // Test the sequence of operations: (1) Put, (2) DeleteRange, (3) Merge, (4) + // Flush. The `CompactionIterator` previously had a bug where we forgot to + // check for covering range tombstones when processing the (1) Put, causing + // it to reappear after the flush. 
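For contrast with the PutDeleteRangeMergeFlush test below (where the DeleteRange covers the base Put, so only the merge operand survives and the result is 1), here is a minimal sketch of the same uint64-add encoding without a tombstone, in which the operator sums the base value and the operand. The DB path, asserts, and helper name are illustrative assumptions; `PutFixed64`/`DecodeFixed64` are the in-tree helpers from `util/coding.h`.

```cpp
#include <cassert>
#include <string>
#include "rocksdb/db.h"
#include "util/coding.h"                  // PutFixed64 / DecodeFixed64 (in-tree helpers)
#include "utilities/merge_operators.h"    // MergeOperators::CreateUInt64AddOperator

void UInt64AddSketch(const std::string& path) {
  rocksdb::Options opts;
  opts.create_if_missing = true;
  opts.merge_operator = rocksdb::MergeOperators::CreateUInt64AddOperator();
  rocksdb::DB* db = nullptr;
  assert(rocksdb::DB::Open(opts, path, &db).ok());

  std::string val;
  rocksdb::PutFixed64(&val, 1);                                   // base value: 1
  assert(db->Put(rocksdb::WriteOptions(), "key", val).ok());
  assert(db->Merge(rocksdb::WriteOptions(), "key", val).ok());    // add 1
  assert(db->Flush(rocksdb::FlushOptions()).ok());

  std::string out;
  assert(db->Get(rocksdb::ReadOptions(), "key", &out).ok());
  // Without a covering range tombstone, the base Put and the operand sum to 2.
  assert(rocksdb::DecodeFixed64(out.data()) == 2);
  delete db;
}
```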
+ Options opts = CurrentOptions(); + opts.merge_operator = MergeOperators::CreateUInt64AddOperator(); + Reopen(opts); + + std::string val; + PutFixed64(&val, 1); + ASSERT_OK(db_->Put(WriteOptions(), "key", val)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key", "key_")); + ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); + ASSERT_OK(db_->Flush(FlushOptions())); + + ReadOptions read_opts; + std::string expected, actual; + ASSERT_OK(db_->Get(read_opts, "key", &actual)); + PutFixed64(&expected, 1); + ASSERT_EQ(expected, actual); +} + // NumTableFilesAtLevel() is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) { @@ -1497,6 +1521,84 @@ TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) { ASSERT_EQ(1, num_range_deletions); } +TEST_F(DBRangeDelTest, OverlappedTombstones) { + const int kNumPerFile = 4, kNumFiles = 2; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.max_compaction_bytes = 9 * 1024; + DestroyAndReopen(options); + Random rnd(301); + for (int i = 0; i < kNumFiles; ++i) { + std::vector values; + // Write 12K (4 values, each 3K) + for (int j = 0; j < kNumPerFile; j++) { + values.push_back(RandomString(&rnd, 3 << 10)); + ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j])); + } + } + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + MoveFilesToLevel(2); + ASSERT_EQ(2, NumTableFilesAtLevel(2)); + + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1), + Key((kNumFiles)*kNumPerFile + 1))); + ASSERT_OK(db_->Flush(FlushOptions())); + + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */); + + // The tombstone range is not broken up into multiple SSTs which may incur a + // large compaction with L2. 
+ ASSERT_EQ(1, NumTableFilesAtLevel(1)); + std::vector> files; + dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */); + ASSERT_EQ(1, NumTableFilesAtLevel(2)); + ASSERT_EQ(0, NumTableFilesAtLevel(1)); +} + +TEST_F(DBRangeDelTest, OverlappedKeys) { + const int kNumPerFile = 4, kNumFiles = 2; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.max_compaction_bytes = 9 * 1024; + DestroyAndReopen(options); + Random rnd(301); + for (int i = 0; i < kNumFiles; ++i) { + std::vector values; + // Write 12K (4 values, each 3K) + for (int j = 0; j < kNumPerFile; j++) { + values.push_back(RandomString(&rnd, 3 << 10)); + ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j])); + } + } + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + MoveFilesToLevel(2); + ASSERT_EQ(2, NumTableFilesAtLevel(2)); + + for (int i = 1; i < kNumFiles * kNumPerFile + 1; i++) { + ASSERT_OK(Put(Key(i), "0x123")); + } + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + // The key range is broken up into three SSTs to avoid a future big compaction + // with the grandparent + dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */); + ASSERT_EQ(3, NumTableFilesAtLevel(1)); + + std::vector> files; + dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */); + ASSERT_EQ(1, NumTableFilesAtLevel(2)); + ASSERT_EQ(0, NumTableFilesAtLevel(1)); +} + #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 9003ed6b1ac..37adee46722 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -8,10 +8,10 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/db_test_util.h" +#include "file/sst_file_manager_impl.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_manager.h" -#include "util/sst_file_manager_impl.h" namespace rocksdb { @@ -430,6 +430,7 @@ TEST_F(DBSSTTest, RateLimitedWALDelete) { env_->time_elapse_only_sleep_ = true; Options options = CurrentOptions(); options.disable_auto_compactions = true; + options.compression = kNoCompression; options.env = env_; int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec @@ -439,7 +440,7 @@ TEST_F(DBSSTTest, RateLimitedWALDelete) { ASSERT_OK(s); options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec); auto sfm = static_cast(options.sst_file_manager.get()); - sfm->delete_scheduler()->SetMaxTrashDBRatio(2.1); + sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1); ASSERT_OK(TryReopen(options)); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); @@ -469,6 +470,111 @@ TEST_F(DBSSTTest, RateLimitedWALDelete) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +class DBWALTestWithParam + : public DBSSTTest, + public testing::WithParamInterface> { + public: + DBWALTestWithParam() { + wal_dir_ = std::get<0>(GetParam()); + wal_dir_same_as_dbname_ = std::get<1>(GetParam()); + } + + std::string wal_dir_; + bool wal_dir_same_as_dbname_; +}; + +TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) { + class MyEnv : public EnvWrapper { + public: + MyEnv(Env* t) : EnvWrapper(t), fake_log_delete(false) {} + + Status DeleteFile(const std::string& fname) { + if (fname.find(".log.trash") != std::string::npos && fake_log_delete) { + return Status::OK(); + } + + return target()->DeleteFile(fname); + } + + void set_fake_log_delete(bool fake) { fake_log_delete = fake; } + + private: + bool fake_log_delete; + }; + + std::unique_ptr env(new MyEnv(Env::Default())); + Destroy(last_options_); + + env->set_fake_log_delete(true); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.compression = kNoCompression; + options.env = env.get(); + options.wal_dir = dbname_ + wal_dir_; + + int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec + Status s; + options.sst_file_manager.reset( + NewSstFileManager(env_, nullptr, "", 0, false, &s, 0)); + ASSERT_OK(s); + options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec); + auto sfm = static_cast(options.sst_file_manager.get()); + sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1); + + ASSERT_OK(TryReopen(options)); + + // Create 4 files in L0 + for (char v = 'a'; v <= 'd'; v++) { + ASSERT_OK(Put("Key2", DummyString(1024, v))); + ASSERT_OK(Put("Key3", DummyString(1024, v))); + ASSERT_OK(Put("Key4", DummyString(1024, v))); + ASSERT_OK(Put("Key1", DummyString(1024, v))); + ASSERT_OK(Put("Key4", DummyString(1024, v))); + ASSERT_OK(Flush()); + } + // We created 4 sst files in L0 + ASSERT_EQ("4", FilesPerLevel(0)); + + Close(); + + options.sst_file_manager.reset(); + std::vector filenames; + int trash_log_count = 0; + if (!wal_dir_same_as_dbname_) { + // Forcibly create some trash log files + std::unique_ptr result; + env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result, + EnvOptions()); + result.reset(); + } + env->GetChildren(options.wal_dir, &filenames); + for (const std::string& fname : filenames) { + if (fname.find(".log.trash") != std::string::npos) { + trash_log_count++; + } + } + ASSERT_GE(trash_log_count, 1); + + env->set_fake_log_delete(false); + ASSERT_OK(TryReopen(options)); + + filenames.clear(); + trash_log_count = 0; + 
env->GetChildren(options.wal_dir, &filenames); + for (const std::string& fname : filenames) { + if (fname.find(".log.trash") != std::string::npos) { + trash_log_count++; + } + } + ASSERT_EQ(trash_log_count, 0); + Close(); +} + +INSTANTIATE_TEST_CASE_P(DBWALTestWithParam, DBWALTestWithParam, + ::testing::Values(std::make_tuple("", true), + std::make_tuple("_wal_dir", false))); + TEST_F(DBSSTTest, OpenDBWithExistingTrash) { Options options = CurrentOptions(); diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc index 5a54fd81c05..164042bc277 100644 --- a/db/db_table_properties_test.cc +++ b/db/db_table_properties_test.cc @@ -14,8 +14,8 @@ #include "port/stack_trace.h" #include "rocksdb/db.h" #include "rocksdb/utilities/table_properties_collectors.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #ifndef ROCKSDB_LITE @@ -139,12 +139,12 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) { Options options; options.create_if_missing = true; options.write_buffer_size = 4096; - options.max_write_buffer_number = 3; + options.max_write_buffer_number = 2; options.level0_file_num_compaction_trigger = 2; options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 4; + options.level0_stop_writes_trigger = 2; options.target_file_size_base = 2048; - options.max_bytes_for_level_base = 10240; + options.max_bytes_for_level_base = 40960; options.max_bytes_for_level_multiplier = 4; options.hard_pending_compaction_bytes_limit = 16 * 1024; options.num_levels = 8; @@ -230,7 +230,7 @@ TEST_F(DBTablePropertiesTest, GetColumnFamilyNameProperty) { // Create one table per CF, then verify it was created with the column family // name property. 
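The GetColumnFamilyNameProperty test below checks that each SST file records its column family name in its table properties. As a reference, a minimal sketch of reading that property through the public API; `db` and `handle` are assumed to be an already-open database and one of its column family handles.

```cpp
#include <cassert>
#include <cstdio>
#include "rocksdb/db.h"
#include "rocksdb/table_properties.h"

void PrintCfNameProperty(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* handle) {
  // Maps each live SST file name to its TableProperties.
  rocksdb::TablePropertiesCollection props;
  assert(db->GetPropertiesOfAllTables(handle, &props).ok());
  for (const auto& file_and_props : props) {
    fprintf(stderr, "%s -> column_family_name=%s\n",
            file_and_props.first.c_str(),
            file_and_props.second->column_family_name.c_str());
  }
}
```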
- for (int cf = 0; cf < 2; ++cf) { + for (uint32_t cf = 0; cf < 2; ++cf) { Put(cf, "key", "val"); Flush(cf); diff --git a/db/db_test.cc b/db/db_test.cc index 8a112e48fcd..16d1a4deebe 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -24,13 +24,15 @@ #endif #include "cache/lru_cache.h" -#include "db/db_impl.h" +#include "db/blob_index.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/dbformat.h" #include "db/job_context.h" #include "db/version_set.h" #include "db/write_batch_internal.h" #include "env/mock_env.h" +#include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/thread_status_util.h" #include "port/port.h" @@ -53,19 +55,17 @@ #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/compression.h" -#include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -883,7 +883,7 @@ TEST_F(DBTest, FlushMultipleMemtable) { writeOpt.disableWAL = true; options.max_write_buffer_number = 4; options.min_write_buffer_number_to_merge = 3; - options.max_write_buffer_number_to_maintain = -1; + options.max_write_buffer_size_to_maintain = -1; CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); ASSERT_OK(Flush(1)); @@ -901,7 +901,8 @@ TEST_F(DBTest, FlushSchedule) { options.level0_stop_writes_trigger = 1 << 10; options.level0_slowdown_writes_trigger = 1 << 10; options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 1; + options.max_write_buffer_size_to_maintain = + static_cast(options.write_buffer_size); options.max_write_buffer_number = 2; options.write_buffer_size = 120 * 1024; CreateAndReopenWithCF({"pikachu"}, options); @@ -1019,39 +1020,149 @@ TEST_F(DBTest, FailMoreDbPaths) { ASSERT_TRUE(TryReopen(options).IsNotSupported()); } -void CheckColumnFamilyMeta(const ColumnFamilyMetaData& cf_meta) { +void CheckColumnFamilyMeta( + const ColumnFamilyMetaData& cf_meta, + const std::vector>& files_by_level, + uint64_t start_time, uint64_t end_time) { + ASSERT_EQ(cf_meta.name, kDefaultColumnFamilyName); + ASSERT_EQ(cf_meta.levels.size(), files_by_level.size()); + uint64_t cf_size = 0; - uint64_t cf_csize = 0; size_t file_count = 0; - for (auto level_meta : cf_meta.levels) { + + for (size_t i = 0; i < cf_meta.levels.size(); ++i) { + const auto& level_meta_from_cf = cf_meta.levels[i]; + const auto& level_meta_from_files = files_by_level[i]; + + ASSERT_EQ(level_meta_from_cf.level, i); + ASSERT_EQ(level_meta_from_cf.files.size(), level_meta_from_files.size()); + + file_count += level_meta_from_cf.files.size(); + uint64_t level_size = 0; - uint64_t level_csize = 0; - file_count += level_meta.files.size(); - for (auto file_meta : level_meta.files) { - level_size += file_meta.size; + for (size_t j = 0; j < level_meta_from_cf.files.size(); ++j) { + const auto& 
file_meta_from_cf = level_meta_from_cf.files[j]; + const auto& file_meta_from_files = level_meta_from_files[j]; + + level_size += file_meta_from_cf.size; + + ASSERT_EQ(file_meta_from_cf.file_number, + file_meta_from_files.fd.GetNumber()); + ASSERT_EQ(file_meta_from_cf.file_number, + TableFileNameToNumber(file_meta_from_cf.name)); + ASSERT_EQ(file_meta_from_cf.size, file_meta_from_files.fd.file_size); + ASSERT_EQ(file_meta_from_cf.smallest_seqno, + file_meta_from_files.fd.smallest_seqno); + ASSERT_EQ(file_meta_from_cf.largest_seqno, + file_meta_from_files.fd.largest_seqno); + ASSERT_EQ(file_meta_from_cf.smallestkey, + file_meta_from_files.smallest.user_key().ToString()); + ASSERT_EQ(file_meta_from_cf.largestkey, + file_meta_from_files.largest.user_key().ToString()); + ASSERT_EQ(file_meta_from_cf.oldest_blob_file_number, + file_meta_from_files.oldest_blob_file_number); + ASSERT_EQ(file_meta_from_cf.oldest_ancester_time, + file_meta_from_files.oldest_ancester_time); + ASSERT_EQ(file_meta_from_cf.file_creation_time, + file_meta_from_files.file_creation_time); + ASSERT_GE(file_meta_from_cf.file_creation_time, start_time); + ASSERT_LE(file_meta_from_cf.file_creation_time, end_time); + ASSERT_GE(file_meta_from_cf.oldest_ancester_time, start_time); + ASSERT_LE(file_meta_from_cf.oldest_ancester_time, end_time); } - ASSERT_EQ(level_meta.size, level_size); + + ASSERT_EQ(level_meta_from_cf.size, level_size); cf_size += level_size; - cf_csize += level_csize; } + ASSERT_EQ(cf_meta.file_count, file_count); ASSERT_EQ(cf_meta.size, cf_size); } +void CheckLiveFilesMeta( + const std::vector& live_file_meta, + const std::vector>& files_by_level) { + size_t total_file_count = 0; + for (const auto& f : files_by_level) { + total_file_count += f.size(); + } + + ASSERT_EQ(live_file_meta.size(), total_file_count); + + int level = 0; + int i = 0; + + for (const auto& meta : live_file_meta) { + if (level != meta.level) { + level = meta.level; + i = 0; + } + + ASSERT_LT(i, files_by_level[level].size()); + + const auto& expected_meta = files_by_level[level][i]; + + ASSERT_EQ(meta.column_family_name, kDefaultColumnFamilyName); + ASSERT_EQ(meta.file_number, expected_meta.fd.GetNumber()); + ASSERT_EQ(meta.file_number, TableFileNameToNumber(meta.name)); + ASSERT_EQ(meta.size, expected_meta.fd.file_size); + ASSERT_EQ(meta.smallest_seqno, expected_meta.fd.smallest_seqno); + ASSERT_EQ(meta.largest_seqno, expected_meta.fd.largest_seqno); + ASSERT_EQ(meta.smallestkey, expected_meta.smallest.user_key().ToString()); + ASSERT_EQ(meta.largestkey, expected_meta.largest.user_key().ToString()); + ASSERT_EQ(meta.oldest_blob_file_number, + expected_meta.oldest_blob_file_number); + + ++i; + } +} + #ifndef ROCKSDB_LITE -TEST_F(DBTest, ColumnFamilyMetaDataTest) { +TEST_F(DBTest, MetaDataTest) { Options options = CurrentOptions(); options.create_if_missing = true; + options.disable_auto_compactions = true; + + int64_t temp_time = 0; + options.env->GetCurrentTime(&temp_time); + uint64_t start_time = static_cast(temp_time); + DestroyAndReopen(options); Random rnd(301); int key_index = 0; - ColumnFamilyMetaData cf_meta; for (int i = 0; i < 100; ++i) { - GenerateNewFile(&rnd, &key_index); - db_->GetColumnFamilyMetaData(&cf_meta); - CheckColumnFamilyMeta(cf_meta); + // Add a single blob reference to each file + std::string blob_index; + BlobIndex::EncodeBlob(&blob_index, /* blob_file_number */ i + 1000, + /* offset */ 1234, /* size */ 5678, kNoCompression); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, 
Key(key_index), + blob_index)); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); + + ++key_index; + + // Fill up the rest of the file with random values. + GenerateNewFile(&rnd, &key_index, /* nowait */ true); + + Flush(); } + + std::vector> files_by_level; + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_by_level); + + options.env->GetCurrentTime(&temp_time); + uint64_t end_time = static_cast(temp_time); + + ColumnFamilyMetaData cf_meta; + db_->GetColumnFamilyMetaData(&cf_meta); + CheckColumnFamilyMeta(cf_meta, files_by_level, start_time, end_time); + + std::vector live_file_meta; + db_->GetLiveFilesMetaData(&live_file_meta); + CheckLiveFilesMeta(live_file_meta, files_by_level); } namespace { @@ -1257,6 +1368,7 @@ TEST_F(DBTest, ApproximateSizesMemTable) { options.compression = kNoCompression; options.create_if_missing = true; DestroyAndReopen(options); + auto default_cf = db_->DefaultColumnFamily(); const int N = 128; Random rnd(301); @@ -1268,9 +1380,10 @@ TEST_F(DBTest, ApproximateSizesMemTable) { std::string start = Key(50); std::string end = Key(60); Range r(start, end); - uint8_t include_both = DB::SizeApproximationFlags::INCLUDE_FILES | - DB::SizeApproximationFlags::INCLUDE_MEMTABLES; - db_->GetApproximateSizes(&r, 1, &size, include_both); + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtabtles = true; + size_approx_options.include_files = true; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_GT(size, 6000); ASSERT_LT(size, 204800); // Zero if not including mem table @@ -1280,7 +1393,7 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(500); end = Key(600); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_EQ(size, 0); for (int i = 0; i < N; i++) { @@ -1290,19 +1403,20 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(500); end = Key(600); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_EQ(size, 0); start = Key(100); end = Key(1020); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_GT(size, 6000); options.max_write_buffer_number = 8; options.min_write_buffer_number_to_merge = 5; options.write_buffer_size = 1024 * N; // Not very large DestroyAndReopen(options); + default_cf = db_->DefaultColumnFamily(); int keys[N * 3]; for (int i = 0; i < N; i++) { @@ -1319,26 +1433,27 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(100); end = Key(300); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_EQ(size, 0); start = Key(1050); end = Key(1080); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_GT(size, 6000); start = Key(2100); end = Key(2300); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_EQ(size, 0); start = Key(1050); end = Key(1080); r = Range(start, end); uint64_t size_with_mt, size_without_mt; - db_->GetApproximateSizes(&r, 1, &size_with_mt, include_both); + db_->GetApproximateSizes(size_approx_options, 
default_cf, &r, 1, + &size_with_mt); ASSERT_GT(size_with_mt, 6000); db_->GetApproximateSizes(&r, 1, &size_without_mt); ASSERT_EQ(size_without_mt, 0); @@ -1352,10 +1467,80 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(1050); end = Key(1080); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size_with_mt, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt); db_->GetApproximateSizes(&r, 1, &size_without_mt); ASSERT_GT(size_with_mt, size_without_mt); ASSERT_GT(size_without_mt, 6000); + + // Check that include_memtabtles flag works as expected + size_approx_options.include_memtabtles = false; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_EQ(size, size_without_mt); + + // Check that files_size_error_margin works as expected, when the heuristic + // conditions are not met + start = Key(1); + end = Key(1000 + N - 2); + r = Range(start, end); + size_approx_options.files_size_error_margin = -1.0; // disabled + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + uint64_t size2; + size_approx_options.files_size_error_margin = 0.5; // enabled, but not used + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); + ASSERT_EQ(size, size2); +} + +TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) { + Options options = CurrentOptions(); + options.write_buffer_size = 1024 * 1024; + options.compression = kNoCompression; + options.create_if_missing = true; + options.target_file_size_base = 1024 * 1024; + DestroyAndReopen(options); + const auto default_cf = db_->DefaultColumnFamily(); + + const int N = 64000; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + } + // Flush everything to files + Flush(); + // Compact the entire key space into the next level + db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr); + + // Write more keys + for (int i = N; i < (N + N / 4); i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + } + // Flush everything to files again + Flush(); + + // Wait for compaction to finish + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + const std::string start = Key(0); + const std::string end = Key(2 * N); + const Range r(start, end); + + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtabtles = false; + size_approx_options.include_files = true; + size_approx_options.files_size_error_margin = -1.0; // disabled + + // Get the precise size without any approximation heuristic + uint64_t size; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_NE(size, 0); + + // Get the size with an approximation heuristic + uint64_t size2; + const double error_margin = 0.2; + size_approx_options.files_size_error_margin = error_margin; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); + ASSERT_LT(size2, size * (1 + error_margin)); + ASSERT_GT(size2, size * (1 - error_margin)); } TEST_F(DBTest, GetApproximateMemTableStats) { @@ -2285,6 +2470,7 @@ class MultiThreadedDBTest }; TEST_P(MultiThreadedDBTest, MultiThreaded) { + if (option_config_ == kPipelinedWrite) return; anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; Options options = CurrentOptions(options_override); @@ -2465,6 +2651,15 @@ class ModelDB : public DB { return Status::NotSupported(key); } + using DB::GetMergeOperands; + virtual Status GetMergeOperands( + const ReadOptions& /*options*/, 
ColumnFamilyHandle* /*column_family*/, + const Slice& key, PinnableSlice* /*slice*/, + GetMergeOperandsOptions* /*merge_operands_options*/, + int* /*number_of_operands*/) override { + return Status::NotSupported(key); + } + using DB::MultiGet; std::vector MultiGet( const ReadOptions& /*options*/, @@ -2491,7 +2686,18 @@ class ModelDB : public DB { return Status::NotSupported("Not implemented"); } - Status VerifyChecksum() override { + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not implemented."); + } + + using DB::VerifyChecksum; + Status VerifyChecksum(const ReadOptions&) override { return Status::NotSupported("Not implemented."); } @@ -2587,13 +2793,14 @@ class ModelDB : public DB { return false; } using DB::GetApproximateSizes; - void GetApproximateSizes(ColumnFamilyHandle* /*column_family*/, - const Range* /*range*/, int n, uint64_t* sizes, - uint8_t /*include_flags*/ - = INCLUDE_FILES) override { + Status GetApproximateSizes(const SizeApproximationOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Range* /*range*/, int n, + uint64_t* sizes) override { for (int i = 0; i < n; i++) { sizes[i] = 0; } + return Status::OK(); } using DB::GetApproximateMemTableStats; void GetApproximateMemTableStats(ColumnFamilyHandle* /*column_family*/, @@ -2641,6 +2848,10 @@ class ModelDB : public DB { return Status::NotSupported("Not supported operation."); } + void EnableManualCompaction() override { return; } + + void DisableManualCompaction() override { return; } + using DB::NumberLevels; int NumberLevels(ColumnFamilyHandle* /*column_family*/) override { return 1; } @@ -2693,6 +2904,16 @@ class ModelDB : public DB { return Status::OK(); } + Status GetCurrentWalFile( + std::unique_ptr* /*current_log_file*/) override { + return Status::OK(); + } + + virtual Status GetCreationTimeOfOldestFile( + uint64_t* /*creation_time*/) override { + return Status::NotSupported(); + } + Status DeleteFile(std::string /*name*/) override { return Status::OK(); } Status GetUpdatesSince( @@ -3114,10 +3335,10 @@ TEST_F(DBTest, FIFOCompactionWithTTLAndMaxOpenFilesTest) { options.create_if_missing = true; options.ttl = 600; // seconds - // Check that it is not supported with max_open_files != -1. + // TTL is now supported with max_open_files != -1. options.max_open_files = 100; options = CurrentOptions(options); - ASSERT_TRUE(TryReopen(options).IsNotSupported()); + ASSERT_OK(TryReopen(options)); options.max_open_files = -1; ASSERT_OK(TryReopen(options)); @@ -4624,6 +4845,7 @@ TEST_F(DBTest, DynamicCompactionOptions) { // Even more FIFOCompactionTests are at DBTest.FIFOCompaction* . 
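The FIFO-related tests around here (FIFOTtlBackwardCompatible above and DynamicFIFOCompactionOptions below) rely on changing FIFO compaction parameters at runtime through SetOptions(). A minimal sketch of that call pattern, assuming `db` was opened with `compaction_style = kCompactionStyleFIFO`; nested options travel as one serialized string, and since release 6.0 any `ttl` given inside `compaction_options_fifo` is ignored in favor of the top-level `ttl` option.

```cpp
#include <cassert>
#include "rocksdb/db.h"

void FifoSetOptionsSketch(rocksdb::DB* db) {
  assert(db->SetOptions(
               {{"compaction_options_fifo",
                 "{max_table_files_size=1024;allow_compaction=true;}"},
                {"ttl", "60"}})
             .ok());
  assert(db->GetOptions().compaction_options_fifo.max_table_files_size == 1024);
  assert(db->GetOptions().compaction_options_fifo.allow_compaction == true);
  assert(db->GetOptions().ttl == 60);  // top-level ttl wins
}
```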
TEST_F(DBTest, DynamicFIFOCompactionOptions) { Options options; + options.ttl = 0; options.create_if_missing = true; DestroyAndReopen(options); @@ -4689,15 +4911,15 @@ TEST_F(DBTest, DynamicUniversalCompactionOptions) { DestroyAndReopen(options); // Initial defaults - ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 1); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 1U); ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width, - 2); + 2u); ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width, UINT_MAX); ASSERT_EQ(dbfull() ->GetOptions() .compaction_options_universal.max_size_amplification_percent, - 200); + 200u); ASSERT_EQ(dbfull() ->GetOptions() .compaction_options_universal.compression_size_percent, @@ -4710,15 +4932,15 @@ TEST_F(DBTest, DynamicUniversalCompactionOptions) { ASSERT_OK(dbfull()->SetOptions( {{"compaction_options_universal", "{size_ratio=7;}"}})); - ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u); ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width, - 2); + 2u); ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width, UINT_MAX); ASSERT_EQ(dbfull() ->GetOptions() .compaction_options_universal.max_size_amplification_percent, - 200); + 200u); ASSERT_EQ(dbfull() ->GetOptions() .compaction_options_universal.compression_size_percent, @@ -4731,15 +4953,15 @@ TEST_F(DBTest, DynamicUniversalCompactionOptions) { ASSERT_OK(dbfull()->SetOptions( {{"compaction_options_universal", "{min_merge_width=11;}"}})); - ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u); ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width, - 11); + 11u); ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width, UINT_MAX); ASSERT_EQ(dbfull() ->GetOptions() .compaction_options_universal.max_size_amplification_percent, - 200); + 200u); ASSERT_EQ(dbfull() ->GetOptions() .compaction_options_universal.compression_size_percent, @@ -4884,11 +5106,15 @@ TEST_F(DBTest, DynamicMiscOptions) { ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], &mutable_cf_options)); ASSERT_EQ(CompressionType::kNoCompression, mutable_cf_options.compression); - ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}})); - ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], - &mutable_cf_options)); - ASSERT_EQ(CompressionType::kSnappyCompression, - mutable_cf_options.compression); + + if (Snappy_Supported()) { + ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}})); + ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], + &mutable_cf_options)); + ASSERT_EQ(CompressionType::kSnappyCompression, + mutable_cf_options.compression); + } + // Test paranoid_file_checks already done in db_block_cache_test ASSERT_OK( dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "true"}})); @@ -5973,6 +6199,19 @@ TEST_F(DBTest, FailWhenCompressionNotSupportedTest) { } } +TEST_F(DBTest, CreateColumnFamilyShouldFailOnIncompatibleOptions) { + Options options = CurrentOptions(); + options.max_open_files = 100; + Reopen(options); + + ColumnFamilyOptions cf_options(options); + // ttl is now supported when max_open_files is -1. 
+ cf_options.ttl = 3600; + ColumnFamilyHandle* handle; + ASSERT_OK(db_->CreateColumnFamily(cf_options, "pikachu", &handle)); + delete handle; +} + #ifndef ROCKSDB_LITE TEST_F(DBTest, RowCache) { Options options = CurrentOptions(); @@ -6138,10 +6377,140 @@ TEST_F(DBTest, ThreadLocalPtrDeadlock) { fprintf(stderr, "Done. Flushed %d times, destroyed %d threads\n", flushes_done.load(), threads_destroyed.load()); } + +TEST_F(DBTest, LargeBlockSizeTest) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(Put(0, "foo", "bar")); + BlockBasedTableOptions table_options; + table_options.block_size = 8LL * 1024 * 1024 * 1024LL; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_NOK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); +} + +#ifndef ROCKSDB_LITE + +TEST_F(DBTest, CreationTimeOfOldestFile) { + const int kNumKeysPerFile = 32; + const int kNumLevelFiles = 2; + const int kValueSize = 100; + + Options options = CurrentOptions(); + options.max_open_files = -1; + env_->time_elapse_only_sleep_ = false; + options.env = env_; + + env_->addon_time_.store(0); + DestroyAndReopen(options); + + bool set_file_creation_time_to_zero = true; + int idx = 0; + + int64_t time_1 = 0; + env_->GetCurrentTime(&time_1); + const uint64_t uint_time_1 = static_cast(time_1); + + // Add 50 hours + env_->addon_time_.fetch_add(50 * 60 * 60); + + int64_t time_2 = 0; + env_->GetCurrentTime(&time_2); + const uint64_t uint_time_2 = static_cast(time_2); + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) { + TableProperties* props = reinterpret_cast(arg); + if (set_file_creation_time_to_zero) { + if (idx == 0) { + props->file_creation_time = 0; + idx++; + } else if (idx == 1) { + props->file_creation_time = uint_time_1; + idx = 0; + } + } else { + if (idx == 0) { + props->file_creation_time = uint_time_1; + idx++; + } else if (idx == 1) { + props->file_creation_time = uint_time_2; + } + } + }); + // Set file creation time in manifest all to 0. + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "FileMetaData::FileMetaData", [&](void* arg) { + FileMetaData* meta = static_cast(arg); + meta->file_creation_time = 0; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + } + Flush(); + } + + // At this point there should be 2 files, one with file_creation_time = 0 and + // the other non-zero. GetCreationTimeOfOldestFile API should return 0. + uint64_t creation_time; + Status s1 = dbfull()->GetCreationTimeOfOldestFile(&creation_time); + ASSERT_EQ(0, creation_time); + ASSERT_EQ(s1, Status::OK()); + + // Testing with non-zero file creation time. + set_file_creation_time_to_zero = false; + options = CurrentOptions(); + options.max_open_files = -1; + env_->time_elapse_only_sleep_ = false; + options.env = env_; + + env_->addon_time_.store(0); + DestroyAndReopen(options); + + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + } + Flush(); + } + + // At this point there should be 2 files with non-zero file creation time. + // GetCreationTimeOfOldestFile API should return non-zero value. 
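Before the remaining assertions of this test, a minimal usage sketch of the GetCreationTimeOfOldestFile() API it exercises. Per the test, the call only succeeds with `max_open_files = -1` and otherwise returns Status::NotSupported(); the reporting function below is an illustrative assumption.

```cpp
#include <cinttypes>
#include <cstdio>
#include "rocksdb/db.h"

void ReportOldestFileCreationTime(rocksdb::DB* db) {
  uint64_t creation_time = 0;
  rocksdb::Status s = db->GetCreationTimeOfOldestFile(&creation_time);
  if (s.IsNotSupported()) {
    fprintf(stderr, "requires max_open_files = -1\n");
  } else if (s.ok()) {
    // 0 means at least one live file has no recorded creation time.
    fprintf(stderr, "oldest file created at %" PRIu64 " (unix seconds)\n",
            creation_time);
  }
}
```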
+ uint64_t ctime; + Status s2 = dbfull()->GetCreationTimeOfOldestFile(&ctime); + ASSERT_EQ(uint_time_1, ctime); + ASSERT_EQ(s2, Status::OK()); + + // Testing with max_open_files != -1 + options = CurrentOptions(); + options.max_open_files = 10; + DestroyAndReopen(options); + Status s3 = dbfull()->GetCreationTimeOfOldestFile(&ctime); + ASSERT_EQ(s3, Status::NotSupported()); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +#endif + } // namespace rocksdb +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + int main(int argc, char** argv) { rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff --git a/db/db_test2.cc b/db/db_test2.cc index 75e7fe4abba..6b0ee157e4c 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -16,6 +16,7 @@ #include "port/stack_trace.h" #include "rocksdb/persistent_cache.h" #include "rocksdb/wal_filter.h" +#include "test_util/fault_injection_test_env.h" namespace rocksdb { @@ -1036,8 +1037,7 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { ASSERT_TRUE(index == keys_cf.size()); } -// Temporarily disable it because the test is flaky. -TEST_F(DBTest2, DISABLED_PresetCompressionDict) { +TEST_F(DBTest2, PresetCompressionDict) { // Verifies that compression ratio improves when dictionary is enabled, and // improves even further when the dictionary is trained by ZSTD. const size_t kBlockSizeBytes = 4 << 10; @@ -1046,7 +1046,8 @@ TEST_F(DBTest2, DISABLED_PresetCompressionDict) { const int kNumL0Files = 5; Options options; - options.env = CurrentOptions().env; // Make sure to use any custom env that the test is configured with. + // Make sure to use any custom env that the test is configured with. + options.env = CurrentOptions().env; options.allow_concurrent_memtable_write = false; options.arena_block_size = kBlockSizeBytes; options.create_if_missing = true; @@ -1072,10 +1073,19 @@ TEST_F(DBTest2, DISABLED_PresetCompressionDict) { compression_types.push_back(kZSTD); } + enum DictionaryTypes : int { + kWithoutDict, + kWithDict, + kWithZSTDTrainedDict, + kDictEnd, + }; + for (auto compression_type : compression_types) { options.compression = compression_type; - size_t prev_out_bytes; - for (int i = 0; i < 3; ++i) { + size_t bytes_without_dict = 0; + size_t bytes_with_dict = 0; + size_t bytes_with_zstd_trained_dict = 0; + for (int i = kWithoutDict; i < kDictEnd; i++) { // First iteration: compress without preset dictionary // Second iteration: compress with preset dictionary // Third iteration (zstd only): compress with zstd-trained dictionary @@ -1085,19 +1095,19 @@ TEST_F(DBTest2, DISABLED_PresetCompressionDict) { // the non-first iterations, verify the data we get out is the same data // we put in. 
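The switch that follows cycles PresetCompressionDict through three dictionary modes. As a reference, a sketch of the same three configurations spelled out as plain Options setup; the mode numbering mirrors the test's kWithoutDict/kWithDict/kWithZSTDTrainedDict enum, and the byte budgets stand in for its kBlockSizeBytes/kL0FileBytes constants.

```cpp
#include <cstddef>
#include <cstdint>
#include "rocksdb/options.h"

rocksdb::Options DictOptions(int mode, size_t block_bytes, size_t l0_bytes) {
  rocksdb::Options o;
  o.compression = rocksdb::kZSTD;
  if (mode == 0) {        // kWithoutDict: no preset dictionary
    o.compression_opts.max_dict_bytes = 0;
    o.compression_opts.zstd_max_train_bytes = 0;
  } else if (mode == 1) { // kWithDict: raw (untrained) dictionary samples
    o.compression_opts.max_dict_bytes = static_cast<uint32_t>(block_bytes);
    o.compression_opts.zstd_max_train_bytes = 0;
  } else {                // kWithZSTDTrainedDict: ZSTD-trained dictionary
    o.compression_opts.max_dict_bytes = static_cast<uint32_t>(block_bytes);
    o.compression_opts.zstd_max_train_bytes = static_cast<uint32_t>(l0_bytes);
  }
  return o;
}
```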
switch (i) { - case 0: + case kWithoutDict: options.compression_opts.max_dict_bytes = 0; options.compression_opts.zstd_max_train_bytes = 0; break; - case 1: - options.compression_opts.max_dict_bytes = 4 * kBlockSizeBytes; + case kWithDict: + options.compression_opts.max_dict_bytes = kBlockSizeBytes; options.compression_opts.zstd_max_train_bytes = 0; break; - case 2: + case kWithZSTDTrainedDict: if (compression_type != kZSTD) { continue; } - options.compression_opts.max_dict_bytes = 4 * kBlockSizeBytes; + options.compression_opts.max_dict_bytes = kBlockSizeBytes; options.compression_opts.zstd_max_train_bytes = kL0FileBytes; break; default: @@ -1129,23 +1139,32 @@ TEST_F(DBTest2, DISABLED_PresetCompressionDict) { ASSERT_EQ(0, NumTableFilesAtLevel(0, 1)); ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); - size_t out_bytes = 0; - std::vector files; - GetSstFiles(env_, dbname_, &files); - for (const auto& file : files) { - uint64_t curr_bytes; - env_->GetFileSize(dbname_ + "/" + file, &curr_bytes); - out_bytes += static_cast(curr_bytes); + // Get the live sst files size + size_t total_sst_bytes = TotalSize(1); + if (i == kWithoutDict) { + bytes_without_dict = total_sst_bytes; + } else if (i == kWithDict) { + bytes_with_dict = total_sst_bytes; + } else if (i == kWithZSTDTrainedDict) { + bytes_with_zstd_trained_dict = total_sst_bytes; } for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes); j++) { ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast(j)))); } - if (i) { - ASSERT_GT(prev_out_bytes, out_bytes); + if (i == kWithDict) { + ASSERT_GT(bytes_without_dict, bytes_with_dict); + } else if (i == kWithZSTDTrainedDict) { + // In zstd compression, it is sometimes possible that using a trained + // dictionary does not get as good a compression ratio as without + // training. + // But using a dictionary (with or without training) should always get + // better compression ratio than not using one. + ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict || + bytes_without_dict > bytes_with_zstd_trained_dict); } - prev_out_bytes = out_bytes; + DestroyAndReopen(options); } } @@ -1923,7 +1942,10 @@ TEST_F(DBTest2, TestPerfContextIterCpuTime) { } #endif // OS_LINUX -#ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented +// GetUniqueIdFromFile is not implemented on these platforms. Persistent cache +// breaks when that function is not implemented and no regular block cache is +// provided. +#if !defined(OS_SOLARIS) && !defined(OS_WIN) TEST_F(DBTest2, PersistentCache) { int num_iter = 80; @@ -1987,7 +2009,7 @@ TEST_F(DBTest2, PersistentCache) { } } } -#endif // !OS_SOLARIS +#endif // !defined(OS_SOLARIS) && !defined(OS_WIN) namespace { void CountSyncPoint() { @@ -2384,6 +2406,215 @@ TEST_F(DBTest2, ManualCompactionOverlapManualCompaction) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBTest2, PausingManualCompaction1) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 7; + + DestroyAndReopen(options); + Random rnd(301); + // Generate a file containing 10 keys. 
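The PausingManualCompaction tests that follow exercise the new DisableManualCompaction()/EnableManualCompaction() pair. A minimal sketch of the intended call pattern, under the assumption of an already-open `db`: a CompactRange() issued while manual compaction is disabled returns without compacting (as PausingManualCompaction3 asserts), and re-enabling lets a later CompactRange() run normally.

```cpp
#include "rocksdb/db.h"

void PauseAndResumeManualCompaction(rocksdb::DB* db) {
  db->DisableManualCompaction();
  // Effectively a no-op while manual compaction is disabled.
  db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);

  db->EnableManualCompaction();
  // Now the full-range manual compaction proceeds as usual.
  db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);
}
```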
+ for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + } + ASSERT_OK(Flush()); + + // Generate another file containing same keys + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + } + ASSERT_OK(Flush()); + + int manual_compactions_paused = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run():PausingManualCompaction:1", [&](void* arg) { + auto paused = reinterpret_cast*>(arg); + ASSERT_FALSE(paused->load(std::memory_order_acquire)); + paused->store(true, std::memory_order_release); + manual_compactions_paused += 1; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + std::vector files_before_compact, files_after_compact; + // Remember file name before compaction is triggered + std::vector files_meta; + dbfull()->GetLiveFilesMetaData(&files_meta); + for (auto file : files_meta) { + files_before_compact.push_back(file.name); + } + + // OK, now trigger a manual compaction + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + // Wait for compactions to get scheduled and stopped + dbfull()->TEST_WaitForCompact(true); + + // Get file names after compaction is stopped + files_meta.clear(); + dbfull()->GetLiveFilesMetaData(&files_meta); + for (auto file : files_meta) { + files_after_compact.push_back(file.name); + } + + // Like nothing happened + ASSERT_EQ(files_before_compact, files_after_compact); + ASSERT_EQ(manual_compactions_paused, 1); + + manual_compactions_paused = 0; + // Now make sure CompactFiles also not run + dbfull()->CompactFiles(rocksdb::CompactionOptions(), files_before_compact, 0); + // Wait for manual compaction to get scheduled and finish + dbfull()->TEST_WaitForCompact(true); + + files_meta.clear(); + files_after_compact.clear(); + dbfull()->GetLiveFilesMetaData(&files_meta); + for (auto file : files_meta) { + files_after_compact.push_back(file.name); + } + + ASSERT_EQ(files_before_compact, files_after_compact); + // CompactFiles returns at entry point + ASSERT_EQ(manual_compactions_paused, 0); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +// PausingManualCompaction does not affect auto compaction +TEST_F(DBTest2, PausingManualCompaction2) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.disable_auto_compactions = false; + + DestroyAndReopen(options); + dbfull()->DisableManualCompaction(); + + Random rnd(301); + for (int i = 0; i < 2; i++) { + // Generate a file containing 10 keys. 
+ for (int j = 0; j < 100; j++) { + ASSERT_OK(Put(Key(j), RandomString(&rnd, 50))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + std::vector files_meta; + dbfull()->GetLiveFilesMetaData(&files_meta); + ASSERT_EQ(files_meta.size(), 1); +} + +TEST_F(DBTest2, PausingManualCompaction3) { + CompactRangeOptions compact_options; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 7; + + Random rnd(301); + auto generate_files = [&]() { + for (int i = 0; i < options.num_levels; i++) { + for (int j = 0; j < options.num_levels - i + 1; j++) { + for (int k = 0; k < 1000; k++) { + ASSERT_OK(Put(Key(k + j * 1000), RandomString(&rnd, 50))); + } + Flush(); + } + + for (int l = 1; l < options.num_levels - i; l++) { + MoveFilesToLevel(l); + } + } + }; + + DestroyAndReopen(options); + generate_files(); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + int run_manual_compactions = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run():PausingManualCompaction:1", + [&](void* /*arg*/) { run_manual_compactions++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + dbfull()->DisableManualCompaction(); + dbfull()->CompactRange(compact_options, nullptr, nullptr); + dbfull()->TEST_WaitForCompact(true); + // As manual compaction disabled, not even reach sync point + ASSERT_EQ(run_manual_compactions, 0); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + rocksdb::SyncPoint::GetInstance()->ClearCallBack( + "CompactionJob::Run():PausingManualCompaction:1"); + dbfull()->EnableManualCompaction(); + dbfull()->CompactRange(compact_options, nullptr, nullptr); + dbfull()->TEST_WaitForCompact(true); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, PausingManualCompaction4) { + CompactRangeOptions compact_options; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 7; + + Random rnd(301); + auto generate_files = [&]() { + for (int i = 0; i < options.num_levels; i++) { + for (int j = 0; j < options.num_levels - i + 1; j++) { + for (int k = 0; k < 1000; k++) { + ASSERT_OK(Put(Key(k + j * 1000), RandomString(&rnd, 50))); + } + Flush(); + } + + for (int l = 1; l < options.num_levels - i; l++) { + MoveFilesToLevel(l); + } + } + }; + + DestroyAndReopen(options); + generate_files(); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + int run_manual_compactions = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run():PausingManualCompaction:2", [&](void* arg) { + auto paused = reinterpret_cast*>(arg); + ASSERT_FALSE(paused->load(std::memory_order_acquire)); + paused->store(true, std::memory_order_release); + run_manual_compactions++; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + dbfull()->EnableManualCompaction(); + dbfull()->CompactRange(compact_options, nullptr, nullptr); + dbfull()->TEST_WaitForCompact(true); + ASSERT_EQ(run_manual_compactions, 1); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + rocksdb::SyncPoint::GetInstance()->ClearCallBack( + "CompactionJob::Run():PausingManualCompaction:2"); + dbfull()->EnableManualCompaction(); + dbfull()->CompactRange(compact_options, nullptr, nullptr); + 
+  dbfull()->TEST_WaitForCompact(true);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
 TEST_F(DBTest2, OptimizeForPointLookup) {
   Options options = CurrentOptions();
   Close();
@@ -2779,8 +3010,12 @@ TEST_F(DBTest2, ReadCallbackTest) {
   ReadOptions roptions;
   TestReadCallback callback(seq);
   bool dont_care = true;
-  Status s = dbfull()->GetImpl(roptions, dbfull()->DefaultColumnFamily(), key,
-                               &pinnable_val, &dont_care, &callback);
+  DBImpl::GetImplOptions get_impl_options;
+  get_impl_options.column_family = dbfull()->DefaultColumnFamily();
+  get_impl_options.value = &pinnable_val;
+  get_impl_options.value_found = &dont_care;
+  get_impl_options.callback = &callback;
+  Status s = dbfull()->GetImpl(roptions, key, get_impl_options);
   ASSERT_TRUE(s.ok());
   // Assuming that after each Put the DB increased seq by one, the value and
   // seq number must be equal since we also inc value by 1 after each Put.
@@ -3738,10 +3973,256 @@ TEST_F(DBTest2, OldStatsInterface) {
   ASSERT_GT(dos->num_rt, 0);
   ASSERT_GT(dos->num_mt, 0);
 }
+
+TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
+  const Snapshot* ss = db_->GetSnapshot();
+
+  for (auto h : handles_) {
+    db_->DestroyColumnFamilyHandle(h);
+  }
+  handles_.clear();
+
+  ASSERT_NOK(db_->Close());
+  db_->ReleaseSnapshot(ss);
+  ASSERT_OK(db_->Close());
+  delete db_;
+  db_ = nullptr;
+}
+
+TEST_F(DBTest2, PrefixBloomReseek) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  // Construct two L1 files with keys:
+  // f1:[aaa1 ccc1] f2:[ddd0]
+  ASSERT_OK(Put("aaa1", ""));
+  ASSERT_OK(Put("ccc1", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("ddd0", ""));
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_OK(Put("bbb1", ""));
+
+  Iterator* iter = db_->NewIterator(ReadOptions());
+
+  // Seeking into f1, the iterator checks f1's bloom filter, which reports
+  // that the prefix is absent, so the file iterator is invalidated and the
+  // cursor moves on to f2, whose next key is "ddd0".
+  iter->Seek("bbb1");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bbb1", iter->key().ToString());
+
+  // Reseek ccc1, the L1 iterator needs to go back to f1 and reseek.
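+  // If the level iterator wrongly reused the invalidated f1 state from the
+  // previous seek, it would return "ddd0" here instead of "ccc1".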
+  iter->Seek("ccc1");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("ccc1", iter->key().ToString());
+
+  delete iter;
+}
+
+TEST_F(DBTest2, PrefixBloomFilteredOut) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  // Construct two L1 files with keys:
+  // f1:[aaa1 ccc1] f2:[ddd0]
+  ASSERT_OK(Put("aaa1", ""));
+  ASSERT_OK(Put("ccc1", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("ddd0", ""));
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  Iterator* iter = db_->NewIterator(ReadOptions());
+
+  // The seek key is filtered out by f1's bloom filter.
+  // This is just one of several valid positions following the contract.
+  // Positioning to ccc1 or ddd0 is also valid. This is just to validate
+  // the behavior of the current implementation. If the underlying
+  // implementation changes, the test might fail here.
+  iter->Seek("bbb1");
+  ASSERT_FALSE(iter->Valid());
+
+  delete iter;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RowCacheSnapshot) {
+  Options options = CurrentOptions();
+  options.statistics = rocksdb::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8 * 8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar1"));
+
+  const Snapshot* s1 = db_->GetSnapshot();
+
+  ASSERT_OK(Put("foo", "bar2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("foo2", "bar"));
+  const Snapshot* s2 = db_->GetSnapshot();
+  ASSERT_OK(Put("foo3", "bar"));
+  const Snapshot* s3 = db_->GetSnapshot();
+
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s2), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s3), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+
+  db_->ReleaseSnapshot(s1);
+  db_->ReleaseSnapshot(s2);
+  db_->ReleaseSnapshot(s3);
+}
+#endif // ROCKSDB_LITE
+
+// When the DB is reopened with multiple column families, the manifest file
+// is written after the first CF is flushed, and it is written again after
+// each subsequent flush. If the DB crashes between those flushes, the CF
+// that has already been flushed points past the latest log file; recovery
+// must then not require that log file to be uncorrupted, otherwise it raises
+// a spurious corruption report. The test below exercises this scenario.
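+// It simulates the crash with a FaultInjectionTestEnv that freezes the file
+// system at each of two sync points (before the final memtable flush during
+// log recovery, and before the last version edit is written to the MANIFEST),
+// then checks that a subsequent reopen with a normal Env succeeds.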
+TEST_F(DBTest2, CrashInRecoveryMultipleCF) {
+  const std::vector<std::string> sync_points = {
+      "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable",
+      "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0"};
+  for (const auto& test_sync_point : sync_points) {
+    Options options = CurrentOptions();
+    // First destroy original db to ensure a clean start.
+    DestroyAndReopen(options);
+    options.create_if_missing = true;
+    options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Put(1, "foo", "bar"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Put(1, "foo", "bar"));
+    // The value is large enough to be split across two blocks.
+    std::string large_value(400, ' ');
+    ASSERT_OK(Put("foo1", large_value));
+    ASSERT_OK(Put("foo2", large_value));
+    Close();
+
+    // Corrupt the log file in the middle, so that it is not corrupted
+    // in the tail.
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+    for (const auto& f : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(f, &number, &type) && type == FileType::kLogFile) {
+        std::string fname = dbname_ + "/" + f;
+        std::string file_content;
+        ASSERT_OK(ReadFileToString(env_, fname, &file_content));
+        file_content[400] = 'h';
+        file_content[401] = 'a';
+        ASSERT_OK(WriteStringToFile(env_, file_content, fname));
+        break;
+      }
+    }
+
+    // Reopen and freeze the file system after the first manifest write.
+    FaultInjectionTestEnv fit_env(options.env);
+    options.env = &fit_env;
+    rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+    rocksdb::SyncPoint::GetInstance()->SetCallBack(
+        test_sync_point,
+        [&](void* /*arg*/) { fit_env.SetFilesystemActive(false); });
+    rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+    ASSERT_NOK(TryReopenWithColumnFamilies(
+        {kDefaultColumnFamilyName, "pikachu"}, options));
+    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+    fit_env.SetFilesystemActive(true);
+    // If we kept using the fault injection Env, it would complain when
+    // renaming the CURRENT file, which is not expected. This still needs to
+    // be investigated.
+ options.env = env_; + ASSERT_OK(TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, + options)); + } +} + +TEST_F(DBTest2, SeekFileRangeDeleteTail) { + Options options = CurrentOptions(); + options.prefix_extractor.reset(NewCappedPrefixTransform(1)); + options.num_levels = 3; + DestroyAndReopen(options); + + ASSERT_OK(Put("a", "a")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "f")); + ASSERT_OK(Put("b", "a")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("x", "a")); + ASSERT_OK(Put("z", "a")); + ASSERT_OK(Flush()); + + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + { + ReadOptions ro; + ro.total_order_seek = true; + std::unique_ptr iter(db_->NewIterator(ro)); + iter->Seek("e"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("x", iter->key().ToString()); + } + db_->ReleaseSnapshot(s1); +} } // namespace rocksdb +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + int main(int argc, char** argv) { rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff --git a/db/db_test_util.cc b/db/db_test_util.cc index bee6b81d5dd..88f57275f31 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -10,6 +10,7 @@ #include "db/db_test_util.h" #include "db/forward_iterator.h" #include "rocksdb/env_encryption.h" +#include "rocksdb/utilities/object_registry.h" namespace rocksdb { @@ -47,20 +48,30 @@ ROT13BlockCipher rot13Cipher_(16); #endif // ROCKSDB_LITE DBTestBase::DBTestBase(const std::string path) - : mem_env_(!getenv("MEM_ENV") ? nullptr : new MockEnv(Env::Default())), + : mem_env_(nullptr), encrypted_env_(nullptr), option_config_(kDefault) { + Env* base_env = Env::Default(); #ifndef ROCKSDB_LITE - encrypted_env_( - !getenv("ENCRYPTED_ENV") - ? nullptr - : NewEncryptedEnv(mem_env_ ? mem_env_ : Env::Default(), - new CTREncryptionProvider(rot13Cipher_))), -#else - encrypted_env_(nullptr), -#endif // ROCKSDB_LITE - env_(new SpecialEnv(encrypted_env_ - ? encrypted_env_ - : (mem_env_ ? mem_env_ : Env::Default()))), - option_config_(kDefault) { + const char* test_env_uri = getenv("TEST_ENV_URI"); + if (test_env_uri) { + Status s = ObjectRegistry::NewInstance()->NewSharedObject(test_env_uri, + &env_guard_); + base_env = env_guard_.get(); + EXPECT_OK(s); + EXPECT_NE(Env::Default(), base_env); + } +#endif // !ROCKSDB_LITE + EXPECT_NE(nullptr, base_env); + if (getenv("MEM_ENV")) { + mem_env_ = new MockEnv(base_env); + } +#ifndef ROCKSDB_LITE + if (getenv("ENCRYPTED_ENV")) { + encrypted_env_ = NewEncryptedEnv(mem_env_ ? mem_env_ : base_env, + new CTREncryptionProvider(rot13Cipher_)); + } +#endif // !ROCKSDB_LITE + env_ = new SpecialEnv(encrypted_env_ ? encrypted_env_ + : (mem_env_ ? 
mem_env_ : base_env)); env_->SetBackgroundThreads(1, Env::LOW); env_->SetBackgroundThreads(1, Env::HIGH); dbname_ = test::PerThreadDBPath(env_, path); @@ -341,6 +352,7 @@ Options DBTestBase::GetOptions( options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.memtable_factory.reset(NewHashSkipListRepFactory(16)); options.allow_concurrent_memtable_write = false; + options.unordered_write = false; break; case kPlainTableFirstBytePrefix: options.table_factory.reset(new PlainTableFactory()); @@ -373,12 +385,14 @@ Options DBTestBase::GetOptions( case kVectorRep: options.memtable_factory.reset(new VectorRepFactory(100)); options.allow_concurrent_memtable_write = false; + options.unordered_write = false; break; case kHashLinkList: options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.memtable_factory.reset( NewHashLinkListRepFactory(4, 0, 3, true, 4)); options.allow_concurrent_memtable_write = false; + options.unordered_write = false; break; case kDirectIO: { options.use_direct_reads = true; @@ -540,6 +554,11 @@ Options DBTestBase::GetOptions( options.manual_wal_flush = true; break; } + case kUnorderedWrite: { + options.allow_concurrent_memtable_write = false; + options.unordered_write = false; + break; + } default: break; @@ -565,7 +584,8 @@ void DBTestBase::CreateColumnFamilies(const std::vector& cfs, size_t cfi = handles_.size(); handles_.resize(cfi + cfs.size()); for (auto cf : cfs) { - ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++])); + Status s = db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]); + ASSERT_OK(s); } } @@ -757,7 +777,8 @@ std::string DBTestBase::Get(int cf, const std::string& k, std::vector DBTestBase::MultiGet(std::vector cfs, const std::vector& k, - const Snapshot* snapshot) { + const Snapshot* snapshot, + const bool batched) { ReadOptions options; options.verify_checksums = true; options.snapshot = snapshot; @@ -769,12 +790,30 @@ std::vector DBTestBase::MultiGet(std::vector cfs, handles.push_back(handles_[cfs[i]]); keys.push_back(k[i]); } - std::vector s = db_->MultiGet(options, handles, keys, &result); - for (unsigned int i = 0; i < s.size(); ++i) { - if (s[i].IsNotFound()) { - result[i] = "NOT_FOUND"; - } else if (!s[i].ok()) { - result[i] = s[i].ToString(); + std::vector s; + if (!batched) { + s = db_->MultiGet(options, handles, keys, &result); + for (unsigned int i = 0; i < s.size(); ++i) { + if (s[i].IsNotFound()) { + result[i] = "NOT_FOUND"; + } else if (!s[i].ok()) { + result[i] = s[i].ToString(); + } + } + } else { + std::vector pin_values(cfs.size()); + result.resize(cfs.size()); + s.resize(cfs.size()); + db_->MultiGet(options, cfs.size(), handles.data(), keys.data(), + pin_values.data(), s.data()); + for (unsigned int i = 0; i < s.size(); ++i) { + if (s[i].IsNotFound()) { + result[i] = "NOT_FOUND"; + } else if (!s[i].ok()) { + result[i] = s[i].ToString(); + } else { + result[i].assign(pin_values[i].data(), pin_values[i].size()); + } } } return result; diff --git a/db/db_test_util.h b/db/db_test_util.h index 50109e0a406..c9678ee1c3d 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -8,12 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#pragma once -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif #include -#include +#include #include #include @@ -24,9 +21,10 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "env/mock_env.h" +#include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" @@ -40,19 +38,18 @@ #include "rocksdb/statistics.h" #include "rocksdb/table.h" #include "rocksdb/utilities/checkpoint.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" +#include "test_util/mock_time_env.h" #include "util/compression.h" -#include "util/filename.h" -#include "util/mock_time_env.h" #include "util/mutexlock.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -140,6 +137,11 @@ class SpecialMemTableRep : public MemTableRep { memtable_->Insert(handle); } + void InsertConcurrently(KeyHandle handle) override { + num_entries_++; + memtable_->Insert(handle); + } + // Returns true iff an entry that compares equal to key is in the list. virtual bool Contains(const char* key) const override { return memtable_->Contains(key); @@ -688,6 +690,7 @@ class DBTestBase : public testing::Test { kPartitionedFilterWithNewTableReaderForCompactions, kUniversalSubcompactions, kxxHash64Checksum, + kUnorderedWrite, // This must be the last line kEnd, }; @@ -699,6 +702,7 @@ class DBTestBase : public testing::Test { MockEnv* mem_env_; Env* encrypted_env_; SpecialEnv* env_; + std::shared_ptr env_guard_; DB* db_; std::vector handles_; @@ -846,7 +850,8 @@ class DBTestBase : public testing::Test { std::vector MultiGet(std::vector cfs, const std::vector& k, - const Snapshot* snapshot = nullptr); + const Snapshot* snapshot, + const bool batched); std::vector MultiGet(const std::vector& k, const Snapshot* snapshot = nullptr); @@ -983,6 +988,11 @@ class DBTestBase : public testing::Test { uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { return options.statistics->getTickerCount(ticker_type); } + + uint64_t TestGetAndResetTickerCount(const Options& options, + Tickers ticker_type) { + return options.statistics->getAndResetTickerCount(ticker_type); + } }; } // namespace rocksdb diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc index 2bd8af684e0..522f4a2d8b7 100644 --- a/db/db_universal_compaction_test.cc +++ b/db/db_universal_compaction_test.cc @@ -11,7 +11,7 @@ #include "port/stack_trace.h" #if !defined(ROCKSDB_LITE) #include "rocksdb/utilities/table_properties_collectors.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { @@ -41,10 +41,9 @@ class DBTestUniversalCompaction : public DBTestUniversalCompactionBase { DBTestUniversalCompactionBase("/db_universal_compaction_test") {} }; -class DBTestUniversalDeleteTrigCompaction : public DBTestBase { +class DBTestUniversalCompaction2 : public DBTestBase { public: - DBTestUniversalDeleteTrigCompaction() - : DBTestBase("/db_universal_compaction_test") {} + DBTestUniversalCompaction2() : DBTestBase("/db_universal_compaction_test2") {} 
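+  // Test fixture for additional universal compaction tests, such as the
+  // delete-triggered and periodic compaction cases below.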
}; namespace { @@ -397,7 +396,7 @@ TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionSizeAmplification) { int total_picked_compactions = 0; int total_size_amp_compactions = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( - "UniversalCompactionPicker::PickCompaction:Return", [&](void* arg) { + "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { if (arg) { total_picked_compactions++; Compaction* c = static_cast(arg); @@ -441,18 +440,18 @@ TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionSizeAmplification) { ASSERT_EQ(dbfull() ->GetOptions(handles_[1]) .compaction_options_universal.max_size_amplification_percent, - 200); + 200U); ASSERT_OK(dbfull()->SetOptions(handles_[1], {{"compaction_options_universal", "{max_size_amplification_percent=110;}"}})); ASSERT_EQ(dbfull() ->GetOptions(handles_[1]) .compaction_options_universal.max_size_amplification_percent, - 110); + 110u); ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1], &mutable_cf_options)); - ASSERT_EQ(110, mutable_cf_options.compaction_options_universal - .max_size_amplification_percent); + ASSERT_EQ(110u, mutable_cf_options.compaction_options_universal + .max_size_amplification_percent); dbfull()->TEST_WaitForCompact(); // Verify that size amplification did happen @@ -478,7 +477,7 @@ TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionReadAmplification) { int total_picked_compactions = 0; int total_size_ratio_compactions = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( - "UniversalCompactionPicker::PickCompaction:Return", [&](void* arg) { + "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { if (arg) { total_picked_compactions++; Compaction* c = static_cast(arg); @@ -522,20 +521,22 @@ TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionReadAmplification) { ASSERT_EQ(dbfull() ->GetOptions(handles_[1]) .compaction_options_universal.min_merge_width, - 2); + 2u); ASSERT_EQ(dbfull() ->GetOptions(handles_[1]) .compaction_options_universal.max_merge_width, - 2); + 2u); ASSERT_EQ( dbfull()->GetOptions(handles_[1]).compaction_options_universal.size_ratio, - 100); + 100u); ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1], &mutable_cf_options)); - ASSERT_EQ(mutable_cf_options.compaction_options_universal.size_ratio, 100); - ASSERT_EQ(mutable_cf_options.compaction_options_universal.min_merge_width, 2); - ASSERT_EQ(mutable_cf_options.compaction_options_universal.max_merge_width, 2); + ASSERT_EQ(mutable_cf_options.compaction_options_universal.size_ratio, 100u); + ASSERT_EQ(mutable_cf_options.compaction_options_universal.min_merge_width, + 2u); + ASSERT_EQ(mutable_cf_options.compaction_options_universal.max_merge_width, + 2u); dbfull()->TEST_WaitForCompact(); @@ -837,14 +838,14 @@ TEST_P(DBTestUniversalCompactionParallel, PickByFileNumberBug) { rocksdb::SyncPoint::GetInstance()->LoadDependency( {{"DBTestUniversalCompactionParallel::PickByFileNumberBug:0", "BackgroundCallCompaction:0"}, - {"UniversalCompactionPicker::PickCompaction:Return", + {"UniversalCompactionBuilder::PickCompaction:Return", "DBTestUniversalCompactionParallel::PickByFileNumberBug:1"}, {"DBTestUniversalCompactionParallel::PickByFileNumberBug:2", "CompactionJob::Run():Start"}}); int total_picked_compactions = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( - "UniversalCompactionPicker::PickCompaction:Return", [&](void* arg) { + "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { if (arg) { total_picked_compactions++; } @@ -1913,7 +1914,7 @@ 
INSTANTIATE_TEST_CASE_P(DBTestUniversalManualCompactionOutputPathId, ::testing::Combine(::testing::Values(1, 8), ::testing::Bool())); -TEST_F(DBTestUniversalDeleteTrigCompaction, BasicL0toL1) { +TEST_F(DBTestUniversalCompaction2, BasicL0toL1) { const int kNumKeys = 3000; const int kWindowSize = 100; const int kNumDelsTrigger = 90; @@ -1954,7 +1955,7 @@ TEST_F(DBTestUniversalDeleteTrigCompaction, BasicL0toL1) { ASSERT_GT(NumTableFilesAtLevel(6), 0); } -TEST_F(DBTestUniversalDeleteTrigCompaction, SingleLevel) { +TEST_F(DBTestUniversalCompaction2, SingleLevel) { const int kNumKeys = 3000; const int kWindowSize = 100; const int kNumDelsTrigger = 90; @@ -1993,7 +1994,7 @@ TEST_F(DBTestUniversalDeleteTrigCompaction, SingleLevel) { ASSERT_EQ(1, NumTableFilesAtLevel(0)); } -TEST_F(DBTestUniversalDeleteTrigCompaction, MultipleLevels) { +TEST_F(DBTestUniversalCompaction2, MultipleLevels) { const int kWindowSize = 100; const int kNumDelsTrigger = 90; @@ -2065,7 +2066,7 @@ TEST_F(DBTestUniversalDeleteTrigCompaction, MultipleLevels) { ASSERT_GT(NumTableFilesAtLevel(6), 0); } -TEST_F(DBTestUniversalDeleteTrigCompaction, OverlappingL0) { +TEST_F(DBTestUniversalCompaction2, OverlappingL0) { const int kWindowSize = 100; const int kNumDelsTrigger = 90; @@ -2105,7 +2106,7 @@ TEST_F(DBTestUniversalDeleteTrigCompaction, OverlappingL0) { ASSERT_GT(NumTableFilesAtLevel(6), 0); } -TEST_F(DBTestUniversalDeleteTrigCompaction, IngestBehind) { +TEST_F(DBTestUniversalCompaction2, IngestBehind) { const int kNumKeys = 3000; const int kWindowSize = 100; const int kNumDelsTrigger = 90; @@ -2148,6 +2149,96 @@ TEST_F(DBTestUniversalDeleteTrigCompaction, IngestBehind) { ASSERT_GT(NumTableFilesAtLevel(5), 0); } +TEST_F(DBTestUniversalCompaction2, PeriodicCompactionDefault) { + Options options; + options.compaction_style = kCompactionStyleUniversal; + + KeepFilterFactory* filter = new KeepFilterFactory(true); + options.compaction_filter_factory.reset(filter); + Reopen(options); + ASSERT_EQ(30 * 24 * 60 * 60, + dbfull()->GetOptions().periodic_compaction_seconds); + + KeepFilter df; + options.compaction_filter_factory.reset(); + options.compaction_filter = &df; + Reopen(options); + ASSERT_EQ(30 * 24 * 60 * 60, + dbfull()->GetOptions().periodic_compaction_seconds); + + options.ttl = 60 * 24 * 60 * 60; + options.compaction_filter = nullptr; + Reopen(options); + ASSERT_EQ(60 * 24 * 60 * 60, + dbfull()->GetOptions().periodic_compaction_seconds); +} + +TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) { + Options opts = CurrentOptions(); + opts.env = env_; + opts.compaction_style = kCompactionStyleUniversal; + opts.level0_file_num_compaction_trigger = 10; + opts.max_open_files = -1; + opts.compaction_options_universal.size_ratio = 10; + opts.compaction_options_universal.min_merge_width = 2; + opts.compaction_options_universal.max_size_amplification_percent = 200; + opts.periodic_compaction_seconds = 48 * 60 * 60; // 2 days + opts.num_levels = 5; + env_->addon_time_.store(0); + Reopen(opts); + + int periodic_compactions = 0; + int start_level = -1; + int output_level = -1; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "UniversalCompactionPicker::PickPeriodicCompaction:Return", + [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + ASSERT_TRUE(arg != nullptr); + ASSERT_TRUE(compaction->compaction_reason() == + CompactionReason::kPeriodicCompaction); + start_level = compaction->start_level(); + output_level = compaction->output_level(); + periodic_compactions++; + }); + 
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Case 1: Oldest flushed file exceeds periodic compaction threshold.
+  ASSERT_OK(Put("foo", "bar"));
+  Flush();
+  ASSERT_EQ(0, periodic_compactions);
+  // Move the clock forward so that the flushed file qualifies for periodic
+  // compaction.
+  env_->addon_time_.store(48 * 60 * 60 + 100);
+
+  // Another flush triggers a compaction of the oldest file.
+  ASSERT_OK(Put("foo", "bar2"));
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_EQ(1, periodic_compactions);
+  ASSERT_EQ(0, start_level);
+  ASSERT_EQ(4, output_level);
+
+  // Case 2: Oldest compacted file exceeds periodic compaction threshold
+  periodic_compactions = 0;
+  // A flush doesn't trigger a periodic compaction when the threshold is not hit
+  ASSERT_OK(Put("foo", "bar2"));
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(0, periodic_compactions);
+
+  // After the periodic compaction threshold is hit, a flush will trigger
+  // a compaction
+  ASSERT_OK(Put("foo", "bar2"));
+  env_->addon_time_.fetch_add(48 * 60 * 60 + 100);
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(1, periodic_compactions);
+  ASSERT_EQ(0, start_level);
+  ASSERT_EQ(4, output_level);
+}
+
 } // namespace rocksdb
 
 #endif // !defined(ROCKSDB_LITE)
diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc
index 78f72b4a0e7..4e0b08c9a5c 100644
--- a/db/db_wal_test.cc
+++ b/db/db_wal_test.cc
@@ -11,8 +11,8 @@
 #include "options/options_helper.h"
 #include "port/port.h"
 #include "port/stack_trace.h"
-#include "util/fault_injection_test_env.h"
-#include "util/sync_point.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
 
 namespace rocksdb {
 class DBWALTest : public DBTestBase {
@@ -569,6 +569,56 @@ TEST_F(DBWALTest, GetSortedWalFiles) {
   } while (ChangeWalOptions());
 }
 
+TEST_F(DBWALTest, GetCurrentWalFile) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+    std::unique_ptr<LogFile>* bad_log_file = nullptr;
+    ASSERT_NOK(dbfull()->GetCurrentWalFile(bad_log_file));
+
+    std::unique_ptr<LogFile> log_file;
+    ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+    // nothing has been written to the log yet
+    ASSERT_EQ(log_file->StartSequence(), 0);
+    ASSERT_EQ(log_file->SizeFileBytes(), 0);
+    ASSERT_EQ(log_file->Type(), kAliveLogFile);
+    ASSERT_GT(log_file->LogNumber(), 0);
+
+    // add some data and verify that the file size actually moves forward
+    ASSERT_OK(Put(0, "foo", "v1"));
+    ASSERT_OK(Put(0, "foo2", "v2"));
+    ASSERT_OK(Put(0, "foo3", "v3"));
+
+    ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+    ASSERT_EQ(log_file->StartSequence(), 0);
+    ASSERT_GT(log_file->SizeFileBytes(), 0);
+    ASSERT_EQ(log_file->Type(), kAliveLogFile);
+    ASSERT_GT(log_file->LogNumber(), 0);
+
+    // force log files to cycle and add some more data, then check if
+    // the log number moves forward
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    for (int i = 0; i < 10; i++) {
+      ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    }
+
+    ASSERT_OK(Put(0, "foo4", "v4"));
+    ASSERT_OK(Put(0, "foo5", "v5"));
+    ASSERT_OK(Put(0, "foo6", "v6"));
+
+    ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+    ASSERT_EQ(log_file->StartSequence(), 0);
+    ASSERT_GT(log_file->SizeFileBytes(), 0);
+    ASSERT_EQ(log_file->Type(), kAliveLogFile);
+    ASSERT_GT(log_file->LogNumber(), 0);
+
+  } while (ChangeWalOptions());
+}
+
 TEST_F(DBWALTest, RecoveryWithLogDataForSomeCFs) {
   // Test for regression of WAL cleanup missing files that don't contain data
   // for every column family.
@@ -824,7 +874,9 @@ class RecoveryTestHelper { // Create WAL files with values filled in static void FillData(DBWALTest* test, const Options& options, const size_t wal_count, size_t* count) { - const ImmutableDBOptions db_options(options); + // Calling internal functions requires sanitized options. + Options sanitized_options = SanitizeOptions(test->dbname_, options); + const ImmutableDBOptions db_options(sanitized_options); *count = 0; @@ -838,7 +890,8 @@ class RecoveryTestHelper { versions.reset(new VersionSet(test->dbname_, &db_options, env_options, table_cache.get(), &write_buffer_manager, - &write_controller)); + &write_controller, + /*block_cache_tracer=*/nullptr)); wal_manager.reset(new WalManager(db_options, env_options)); diff --git a/db/db_write_test.cc b/db/db_write_test.cc index e6bab875114..9eca823c2b7 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -12,9 +12,9 @@ #include "db/write_thread.h" #include "port/port.h" #include "port/stack_trace.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { diff --git a/db/dbformat.cc b/db/dbformat.cc index cd2878198c4..a20e2a02d39 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -8,12 +8,8 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/dbformat.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include +#include #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "util/coding.h" @@ -163,9 +159,11 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const { } } -LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s) { +LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s, + const Slice* ts) { size_t usize = _user_key.size(); - size_t needed = usize + 13; // A conservative estimate + size_t ts_sz = (nullptr == ts) ? 0 : ts->size(); + size_t needed = usize + ts_sz + 13; // A conservative estimate char* dst; if (needed <= sizeof(space_)) { dst = space_; @@ -174,10 +172,14 @@ LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s) { } start_ = dst; // NOTE: We don't support users keys of more than 2GB :) - dst = EncodeVarint32(dst, static_cast(usize + 8)); + dst = EncodeVarint32(dst, static_cast(usize + ts_sz + 8)); kstart_ = dst; memcpy(dst, _user_key.data(), usize); dst += usize; + if (nullptr != ts) { + memcpy(dst, ts->data(), ts_sz); + dst += ts_sz; + } EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek)); dst += 8; end_ = dst; diff --git a/db/dbformat.h b/db/dbformat.h index c850adcb01a..090d8c133f3 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -14,6 +14,7 @@ #include #include "db/lookup_key.h" #include "db/merge_context.h" +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" @@ -23,11 +24,16 @@ #include "rocksdb/table.h" #include "rocksdb/types.h" #include "util/coding.h" -#include "util/logging.h" #include "util/user_comparator_wrapper.h" namespace rocksdb { +// The file declares data structures and functions that deal with internal +// keys. +// Each internal key contains a user key, a sequence number (SequenceNumber) +// and a type (ValueType), and they are usually encoded together. +// There are some related helper classes here. + class InternalKey; // Value types encoded as the last component of internal keys. 
@@ -88,6 +94,8 @@ static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1);
 static const SequenceNumber kDisableGlobalSequenceNumber = port::kMaxUint64;
 
+// The data structure that represents an internal key in the form where the
+// user key, sequence number and type are stored as separate fields.
 struct ParsedInternalKey {
   Slice user_key;
   SequenceNumber sequence;
@@ -143,6 +151,17 @@ inline Slice ExtractUserKey(const Slice& internal_key) {
   return Slice(internal_key.data(), internal_key.size() - 8);
 }
 
+inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,
+                                             size_t ts_sz) {
+  assert(internal_key.size() >= 8 + ts_sz);
+  return Slice(internal_key.data(), internal_key.size() - 8 - ts_sz);
+}
+
+inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
+  assert(user_key.size() >= ts_sz);
+  return Slice(user_key.data(), user_key.size() - ts_sz);
+}
+
 inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) {
   assert(internal_key.size() >= 8);
   const size_t n = internal_key.size();
@@ -192,9 +211,7 @@ class InternalKeyComparator
   }
 };
 
-// Modules in this directory should keep internal keys wrapped inside
-// the following class instead of plain strings so that we do not
-// incorrectly use string comparisons instead of an InternalKeyComparator.
+// The class that represents an internal key in its encoded form.
 class InternalKey {
  private:
   std::string rep_;
@@ -295,6 +312,12 @@ inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
   return num >> 8;
 }
 
+// A class to store keys efficiently. It allows:
+// 1. Users to either copy the key into it, or have it point to an unowned
+//    address.
+// 2. For a copied key, a short inline buffer is kept to reduce memory
+//    allocation for smaller keys.
+// 3. It tracks user vs. internal key and allows conversion between them.
 class IterKey {
  public:
   IterKey()
@@ -303,6 +326,9 @@ class IterKey {
         key_size_(0),
         buf_size_(sizeof(space_)),
         is_user_key_(true) {}
+  // No copying allowed
+  IterKey(const IterKey&) = delete;
+  void operator=(const IterKey&) = delete;
 
   ~IterKey() { ResetBuffer(); }
 
@@ -500,12 +526,10 @@ class IterKey {
   }
 
   void EnlargeBuffer(size_t key_size);
-
-  // No copying allowed
-  IterKey(const IterKey&) = delete;
-  void operator=(const IterKey&) = delete;
 };
 
+// Convert a SliceTransform of user keys into a SliceTransform of
+// internal keys.
 class InternalKeySliceTransform : public SliceTransform {
  public:
  explicit InternalKeySliceTransform(const SliceTransform* transform)
@@ -631,6 +655,7 @@ inline int InternalKeyComparator::CompareKeySeq(const Slice& akey,
   return r;
 }
 
+// Wraps InternalKeyComparator as a comparator class for ParsedInternalKey.
 struct ParsedInternalKeyComparator {
   explicit ParsedInternalKeyComparator(const InternalKeyComparator* c)
       : cmp(c) {}
diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc
index 0b16c13f573..9ec1bc34348 100644
--- a/db/dbformat_test.cc
+++ b/db/dbformat_test.cc
@@ -8,8 +8,8 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/dbformat.h" -#include "util/logging.h" -#include "util/testharness.h" +#include "logging/logging.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 3ae464c5842..db6f945a7db 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -13,72 +13,42 @@ #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "file/filename.h" +#include "port/stack_trace.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" -#include "util/filename.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { -class DeleteFileTest : public testing::Test { +class DeleteFileTest : public DBTestBase { public: - std::string dbname_; - Options options_; - DB* db_; - Env* env_; - int numlevels_; - - DeleteFileTest() { - db_ = nullptr; - env_ = Env::Default(); - options_.delete_obsolete_files_period_micros = 0; // always do full purge - options_.enable_thread_tracking = true; - options_.write_buffer_size = 1024*1024*1000; - options_.target_file_size_base = 1024*1024*1000; - options_.max_bytes_for_level_base = 1024*1024*1000; - options_.WAL_ttl_seconds = 300; // Used to test log files - options_.WAL_size_limit_MB = 1024; // Used to test log files - dbname_ = test::PerThreadDBPath("deletefile_test"); - options_.wal_dir = dbname_ + "/wal_files"; - - // clean up all the files that might have been there before - std::vector old_files; - env_->GetChildren(dbname_, &old_files); - for (auto file : old_files) { - env_->DeleteFile(dbname_ + "/" + file); - } - env_->GetChildren(options_.wal_dir, &old_files); - for (auto file : old_files) { - env_->DeleteFile(options_.wal_dir + "/" + file); - } - - DestroyDB(dbname_, options_); - numlevels_ = 7; - EXPECT_OK(ReopenDB(true)); - } - - Status ReopenDB(bool create) { - delete db_; - if (create) { - DestroyDB(dbname_, options_); - } - db_ = nullptr; - options_.create_if_missing = create; - Status s = DB::Open(options_, dbname_, &db_); - assert(db_); - return s; - } - - void CloseDB() { - delete db_; - db_ = nullptr; + const int numlevels_; + const std::string wal_dir_; + + DeleteFileTest() + : DBTestBase("/deletefile_test"), + numlevels_(7), + wal_dir_(dbname_ + "/wal_files") {} + + void SetOptions(Options* options) { + assert(options); + options->delete_obsolete_files_period_micros = 0; // always do full purge + options->enable_thread_tracking = true; + options->write_buffer_size = 1024 * 1024 * 1000; + options->target_file_size_base = 1024 * 1024 * 1000; + options->max_bytes_for_level_base = 1024 * 1024 * 1000; + options->WAL_ttl_seconds = 300; // Used to test log files + options->WAL_size_limit_MB = 1024; // Used to test log files + options->wal_dir = wal_dir_; } void AddKeys(int numkeys, int startkey = 0) { @@ -120,23 +90,20 @@ class DeleteFileTest : public testing::Test { void CreateTwoLevels() { AddKeys(50000, 10000); - DBImpl* dbi = reinterpret_cast(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); - ASSERT_OK(dbi->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); for (int i = 0; i < 2; ++i) { - ASSERT_OK(dbi->TEST_CompactRange(i, nullptr, nullptr)); + 
ASSERT_OK(dbfull()->TEST_CompactRange(i, nullptr, nullptr)); } AddKeys(50000, 10000); - ASSERT_OK(dbi->TEST_FlushMemTable()); - ASSERT_OK(dbi->TEST_WaitForFlushMemTable()); - ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); } - void CheckFileTypeCounts(std::string& dir, - int required_log, - int required_sst, - int required_manifest) { + void CheckFileTypeCounts(const std::string& dir, int required_log, + int required_sst, int required_manifest) { std::vector filenames; env_->GetChildren(dir, &filenames); @@ -167,6 +134,12 @@ class DeleteFileTest : public testing::Test { }; TEST_F(DeleteFileTest, AddKeysAndQueryLevels) { + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); + CreateTwoLevels(); std::vector metadata; db_->GetLiveFilesMetaData(&metadata); @@ -208,15 +181,19 @@ TEST_F(DeleteFileTest, AddKeysAndQueryLevels) { // Lowest level file deletion should succeed. ASSERT_OK(db_->DeleteFile(level2file)); - - CloseDB(); } TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); + CreateTwoLevels(); // there should be only one (empty) log file because CreateTwoLevels() // flushes the memtables to disk - CheckFileTypeCounts(options_.wal_dir, 1, 0, 0); + CheckFileTypeCounts(wal_dir_, 1, 0, 0); // 2 ssts, 1 manifest CheckFileTypeCounts(dbname_, 0, 2, 1); std::string first("0"), last("999999"); @@ -229,7 +206,7 @@ TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { CheckFileTypeCounts(dbname_, 0, 1, 1); // this time, we keep an iterator alive - ReopenDB(true); + Reopen(options); Iterator *itr = nullptr; CreateTwoLevels(); itr = db_->NewIterator(ReadOptions()); @@ -239,11 +216,15 @@ TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { delete itr; // 1 sst after iterator deletion CheckFileTypeCounts(dbname_, 0, 1, 1); - - CloseDB(); } TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) { + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); + std::string first("0"), last("999999"); CompactRangeOptions compact_options; compact_options.change_level = true; @@ -253,9 +234,9 @@ TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) { // We keep an iterator alive Iterator* itr = nullptr; CreateTwoLevels(); - ReadOptions options; - options.background_purge_on_iterator_cleanup = true; - itr = db_->NewIterator(options); + ReadOptions read_options; + read_options.background_purge_on_iterator_cleanup = true; + itr = db_->NewIterator(read_options); db_->CompactRange(compact_options, &first_slice, &last_slice); // 3 sst after compaction with live iterator CheckFileTypeCounts(dbname_, 0, 3, 1); @@ -277,13 +258,19 @@ TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) { sleeping_task_after.WaitUntilDone(); // 1 sst after iterator deletion CheckFileTypeCounts(dbname_, 0, 1, 1); - - CloseDB(); } TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); + auto do_test = [&](bool bg_purge) { ColumnFamilyOptions co; + co.max_write_buffer_size_to_maintain = + static_cast(co.write_buffer_size); WriteOptions wo; FlushOptions fo; 
ColumnFamilyHandle* cfh = nullptr; @@ -305,6 +292,7 @@ TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { &sleeping_task_after, Env::Priority::HIGH); // If background purge is enabled, the file should still be there. CheckFileTypeCounts(dbname_, 0, bg_purge ? 1 : 0, 1); + TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeCFDropTest:1"); // Execute background purges. sleeping_task_after.WakeUp(); @@ -318,19 +306,31 @@ TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { do_test(false); } - options_.avoid_unnecessary_blocking_io = true; - ASSERT_OK(ReopenDB(false)); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency( + {{"DeleteFileTest::BackgroundPurgeCFDropTest:1", + "DBImpl::BGWorkPurge:start"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + options.avoid_unnecessary_blocking_io = true; + options.create_if_missing = false; + Reopen(options); { SCOPED_TRACE("avoid_unnecessary_blocking_io = true"); do_test(true); } - - CloseDB(); } // This test is to reproduce a bug that read invalid ReadOption in iterator // cleanup function TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) { + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); + std::string first("0"), last("999999"); CompactRangeOptions compact_options; compact_options.change_level = true; @@ -340,12 +340,13 @@ TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) { // We keep an iterator alive Iterator* itr = nullptr; CreateTwoLevels(); - ReadOptions* options = new ReadOptions(); - options->background_purge_on_iterator_cleanup = true; - itr = db_->NewIterator(*options); - // ReadOptions is deleted, but iterator cleanup function should not be - // affected - delete options; + { + ReadOptions read_options; + read_options.background_purge_on_iterator_cleanup = true; + itr = db_->NewIterator(read_options); + // ReadOptions is deleted, but iterator cleanup function should not be + // affected + } db_->CompactRange(compact_options, &first_slice, &last_slice); // 3 sst after compaction with live iterator @@ -361,11 +362,15 @@ TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) { sleeping_task_after.WaitUntilDone(); // 1 sst after iterator deletion CheckFileTypeCounts(dbname_, 0, 1, 1); - - CloseDB(); } TEST_F(DeleteFileTest, BackgroundPurgeTestMultipleJobs) { + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); + std::string first("0"), last("999999"); CompactRangeOptions compact_options; compact_options.change_level = true; @@ -374,15 +379,16 @@ TEST_F(DeleteFileTest, BackgroundPurgeTestMultipleJobs) { // We keep an iterator alive CreateTwoLevels(); - ReadOptions options; - options.background_purge_on_iterator_cleanup = true; - Iterator* itr1 = db_->NewIterator(options); + ReadOptions read_options; + read_options.background_purge_on_iterator_cleanup = true; + Iterator* itr1 = db_->NewIterator(read_options); CreateTwoLevels(); - Iterator* itr2 = db_->NewIterator(options); + Iterator* itr2 = db_->NewIterator(read_options); db_->CompactRange(compact_options, &first_slice, &last_slice); // 5 sst files after 2 compactions with 2 live iterators CheckFileTypeCounts(dbname_, 0, 5, 1); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); // ~DBImpl should wait until all BGWorkPurge are finished rocksdb::SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::~DBImpl:WaitJob", 
"DBImpl::BGWorkPurge"}, @@ -394,24 +400,29 @@ TEST_F(DeleteFileTest, BackgroundPurgeTestMultipleJobs) { env_->Schedule(&DeleteFileTest::DoSleep, this, Env::Priority::HIGH); delete itr2; env_->Schedule(&DeleteFileTest::GuardFinish, nullptr, Env::Priority::HIGH); - CloseDB(); + Close(); TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose"); // 1 sst after iterator deletion CheckFileTypeCounts(dbname_, 0, 1, 1); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DeleteFileTest, DeleteFileWithIterator) { + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); + CreateTwoLevels(); - ReadOptions options; - Iterator* it = db_->NewIterator(options); + ReadOptions read_options; + Iterator* it = db_->NewIterator(read_options); std::vector metadata; db_->GetLiveFilesMetaData(&metadata); - std::string level2file = ""; + std::string level2file; - ASSERT_EQ((int)metadata.size(), 2); + ASSERT_EQ(metadata.size(), static_cast(2)); if (metadata[0].level == 1) { level2file = metadata[1].name; } else { @@ -430,10 +441,15 @@ TEST_F(DeleteFileTest, DeleteFileWithIterator) { } ASSERT_EQ(numKeysIterated, 50000); delete it; - CloseDB(); } TEST_F(DeleteFileTest, DeleteLogFiles) { + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); + AddKeys(10, 0); VectorLogPtr logfiles; db_->GetSortedWalFiles(logfiles); @@ -442,11 +458,11 @@ TEST_F(DeleteFileTest, DeleteLogFiles) { // Should not succeed because live logs are not allowed to be deleted std::unique_ptr alive_log = std::move(logfiles.back()); ASSERT_EQ(alive_log->Type(), kAliveLogFile); - ASSERT_OK(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName())); + ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName())); fprintf(stdout, "Deleting alive log file %s\n", alive_log->PathName().c_str()); ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok()); - ASSERT_OK(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName())); + ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName())); logfiles.clear(); // Call Flush to bring about a new working log file and add more keys @@ -460,43 +476,36 @@ TEST_F(DeleteFileTest, DeleteLogFiles) { ASSERT_GT(logfiles.size(), 0UL); std::unique_ptr archived_log = std::move(logfiles.front()); ASSERT_EQ(archived_log->Type(), kArchivedLogFile); - ASSERT_OK( - env_->FileExists(options_.wal_dir + "/" + archived_log->PathName())); + ASSERT_OK(env_->FileExists(wal_dir_ + "/" + archived_log->PathName())); fprintf(stdout, "Deleting archived log file %s\n", archived_log->PathName().c_str()); ASSERT_OK(db_->DeleteFile(archived_log->PathName())); - ASSERT_EQ(Status::NotFound(), env_->FileExists(options_.wal_dir + "/" + - archived_log->PathName())); - CloseDB(); + ASSERT_EQ(Status::NotFound(), + env_->FileExists(wal_dir_ + "/" + archived_log->PathName())); } TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) { - CloseDB(); - DBOptions db_options; - db_options.create_if_missing = true; - db_options.create_missing_column_families = true; - std::vector column_families; - column_families.emplace_back(); - column_families.emplace_back("new_cf", ColumnFamilyOptions()); - - std::vector handles; - rocksdb::DB* db; - ASSERT_OK(DB::Open(db_options, dbname_, column_families, &handles, &db)); + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); 
+ CreateAndReopenWithCF({"new_cf"}, options); Random rnd(5); for (int i = 0; i < 1000; ++i) { - ASSERT_OK(db->Put(WriteOptions(), handles[1], test::RandomKey(&rnd, 10), - test::RandomKey(&rnd, 10))); + ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10), + test::RandomKey(&rnd, 10))); } - ASSERT_OK(db->Flush(FlushOptions(), handles[1])); + ASSERT_OK(db_->Flush(FlushOptions(), handles_[1])); for (int i = 0; i < 1000; ++i) { - ASSERT_OK(db->Put(WriteOptions(), handles[1], test::RandomKey(&rnd, 10), - test::RandomKey(&rnd, 10))); + ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10), + test::RandomKey(&rnd, 10))); } - ASSERT_OK(db->Flush(FlushOptions(), handles[1])); + ASSERT_OK(db_->Flush(FlushOptions(), handles_[1])); std::vector metadata; - db->GetLiveFilesMetaData(&metadata); + db_->GetLiveFilesMetaData(&metadata); ASSERT_EQ(2U, metadata.size()); ASSERT_EQ("new_cf", metadata[0].column_family_name); ASSERT_EQ("new_cf", metadata[1].column_family_name); @@ -506,11 +515,11 @@ TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) { auto new_file = metadata[0].smallest_seqno > metadata[1].smallest_seqno ? metadata[0].name : metadata[1].name; - ASSERT_TRUE(db->DeleteFile(new_file).IsInvalidArgument()); - ASSERT_OK(db->DeleteFile(old_file)); + ASSERT_TRUE(db_->DeleteFile(new_file).IsInvalidArgument()); + ASSERT_OK(db_->DeleteFile(old_file)); { - std::unique_ptr itr(db->NewIterator(ReadOptions(), handles[1])); + std::unique_ptr itr(db_->NewIterator(ReadOptions(), handles_[1])); int count = 0; for (itr->SeekToFirst(); itr->Valid(); itr->Next()) { ASSERT_OK(itr->status()); @@ -519,13 +528,11 @@ TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) { ASSERT_EQ(count, 1000); } - delete handles[0]; - delete handles[1]; - delete db; + Close(); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "new_cf"}, options); - ASSERT_OK(DB::Open(db_options, dbname_, column_families, &handles, &db)); { - std::unique_ptr itr(db->NewIterator(ReadOptions(), handles[1])); + std::unique_ptr itr(db_->NewIterator(ReadOptions(), handles_[1])); int count = 0; for (itr->SeekToFirst(); itr->Valid(); itr->Next()) { ASSERT_OK(itr->status()); @@ -533,16 +540,22 @@ TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) { } ASSERT_EQ(count, 1000); } - - delete handles[0]; - delete handles[1]; - delete db; } } //namespace rocksdb +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff --git a/db/error_handler.cc b/db/error_handler.cc index afec14edcbe..9e1bf5cc107 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -4,9 +4,9 @@ // (found in the LICENSE.Apache file in the root directory). 
// #include "db/error_handler.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/event_helpers.h" -#include "util/sst_file_manager_impl.h" +#include "file/sst_file_manager_impl.h" namespace rocksdb { diff --git a/db/error_handler_test.cc b/db/error_handler_test.cc index d33e19df5d5..c18706fc28e 100644 --- a/db/error_handler_test.cc +++ b/db/error_handler_test.cc @@ -12,9 +12,9 @@ #include "port/stack_trace.h" #include "rocksdb/perf_context.h" #include "rocksdb/sst_file_manager.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) -#include "util/sync_point.h" +#include "test_util/sync_point.h" #endif namespace rocksdb { diff --git a/db/event_helpers.cc b/db/event_helpers.cc index f1b4b6417ed..f5345c75507 100644 --- a/db/event_helpers.cc +++ b/db/event_helpers.cc @@ -70,8 +70,8 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( const std::vector>& listeners, const std::string& db_name, const std::string& cf_name, const std::string& file_path, int job_id, const FileDescriptor& fd, - const TableProperties& table_properties, TableFileCreationReason reason, - const Status& s) { + uint64_t oldest_blob_file_number, const TableProperties& table_properties, + TableFileCreationReason reason, const Status& s) { if (s.ok() && event_logger) { JSONWriter jwriter; AppendCurrentTime(&jwriter); @@ -106,7 +106,7 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( << "num_entries" << table_properties.num_entries << "num_deletions" << table_properties.num_deletions << "num_merge_operands" << table_properties.num_merge_operands - << "num_range_deletions" << table_properties.num_merge_operands + << "num_range_deletions" << table_properties.num_range_deletions << "format_version" << table_properties.format_version << "fixed_key_len" << table_properties.fixed_key_len << "filter_policy" << table_properties.filter_policy_name @@ -129,6 +129,11 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( } jwriter.EndObject(); } + + if (oldest_blob_file_number != kInvalidBlobFileNumber) { + jwriter << "oldest_blob_file_number" << oldest_blob_file_number; + } + jwriter.EndObject(); event_logger->Log(jwriter); diff --git a/db/event_helpers.h b/db/event_helpers.h index ea35b4b5b19..820eb09be4b 100644 --- a/db/event_helpers.h +++ b/db/event_helpers.h @@ -10,9 +10,9 @@ #include "db/column_family.h" #include "db/version_edit.h" +#include "logging/event_logger.h" #include "rocksdb/listener.h" #include "rocksdb/table_properties.h" -#include "util/event_logger.h" namespace rocksdb { @@ -34,8 +34,8 @@ class EventHelpers { const std::vector>& listeners, const std::string& db_name, const std::string& cf_name, const std::string& file_path, int job_id, const FileDescriptor& fd, - const TableProperties& table_properties, TableFileCreationReason reason, - const Status& s); + uint64_t oldest_blob_file_number, const TableProperties& table_properties, + TableFileCreationReason reason, const Status& s); static void LogAndNotifyTableFileDeletion( EventLogger* event_logger, int job_id, uint64_t file_number, const std::string& file_path, diff --git a/db/experimental.cc b/db/experimental.cc index d509a37bf2e..0c3c3335d92 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -5,7 +5,7 @@ #include "rocksdb/experimental.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { namespace experimental { diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index 
256db0728bf..43a003a85cc 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -9,7 +9,8 @@ #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_writer.h" -#include "util/testutil.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/testutil.h" namespace rocksdb { @@ -20,6 +21,7 @@ class ExternalSSTFileBasicTest public: ExternalSSTFileBasicTest() : DBTestBase("/external_sst_file_basic_test") { sst_files_dir_ = dbname_ + "/sst_files/"; + fault_injection_test_env_.reset(new FaultInjectionTestEnv(Env::Default())); DestroyAndRecreateExternalSSTFilesDir(); } @@ -140,6 +142,7 @@ class ExternalSSTFileBasicTest protected: std::string sst_files_dir_; + std::unique_ptr fault_injection_test_env_; }; TEST_F(ExternalSSTFileBasicTest, Basic) { @@ -689,6 +692,110 @@ TEST_F(ExternalSSTFileBasicTest, FadviseTrigger) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(ExternalSSTFileBasicTest, SyncFailure) { + Options options; + options.create_if_missing = true; + options.env = fault_injection_test_env_.get(); + + std::vector> test_cases = { + {"ExternalSstFileIngestionJob::BeforeSyncIngestedFile", + "ExternalSstFileIngestionJob::AfterSyncIngestedFile"}, + {"ExternalSstFileIngestionJob::BeforeSyncDir", + "ExternalSstFileIngestionJob::AfterSyncDir"}, + {"ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno", + "ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"}}; + + for (size_t i = 0; i < test_cases.size(); i++) { + SyncPoint::GetInstance()->SetCallBack(test_cases[i].first, [&](void*) { + fault_injection_test_env_->SetFilesystemActive(false); + }); + SyncPoint::GetInstance()->SetCallBack(test_cases[i].second, [&](void*) { + fault_injection_test_env_->SetFilesystemActive(true); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + if (i == 2) { + ASSERT_OK(Put("foo", "v1")); + } + + Options sst_file_writer_options; + std::unique_ptr sst_file_writer( + new SstFileWriter(EnvOptions(), sst_file_writer_options)); + std::string file_name = + sst_files_dir_ + "sync_failure_test_" + ToString(i) + ".sst"; + ASSERT_OK(sst_file_writer->Open(file_name)); + ASSERT_OK(sst_file_writer->Put("bar", "v2")); + ASSERT_OK(sst_file_writer->Finish()); + + IngestExternalFileOptions ingest_opt; + if (i == 0) { + ingest_opt.move_files = true; + } + const Snapshot* snapshot = db_->GetSnapshot(); + if (i == 2) { + ingest_opt.write_global_seqno = true; + } + ASSERT_FALSE(db_->IngestExternalFile({file_name}, ingest_opt).ok()); + db_->ReleaseSnapshot(snapshot); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Destroy(options); + } +} + +TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) { + Options options; + options.create_if_missing = true; + SpecialEnv senv(Env::Default()); + options.env = &senv; + DestroyAndReopen(options); + + Options sst_file_writer_options; + std::unique_ptr sst_file_writer( + new SstFileWriter(EnvOptions(), sst_file_writer_options)); + std::string file_name = sst_files_dir_ + "verify_checksum_readahead_test.sst"; + ASSERT_OK(sst_file_writer->Open(file_name)); + Random rnd(301); + std::string value = DBTestBase::RandomString(&rnd, 4000); + for (int i = 0; i < 5000; i++) { + ASSERT_OK(sst_file_writer->Put(DBTestBase::Key(i), value)); + } + ASSERT_OK(sst_file_writer->Finish()); + + // Ingest it once without verifying checksums to see the baseline + // preads. 
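+  // The second ingestion below repeats this with checksum verification and a
+  // 2 MB readahead, and should only add a bounded number of extra reads.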
+ IngestExternalFileOptions ingest_opt; + ingest_opt.move_files = false; + senv.count_random_reads_ = true; + senv.random_read_bytes_counter_ = 0; + ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt)); + + auto base_num_reads = senv.random_read_counter_.Read(); + // Make sure the counter is enabled. + ASSERT_GT(base_num_reads, 0); + + // Ingest again and observe the reads made for readahead. + ingest_opt.move_files = false; + ingest_opt.verify_checksums_before_ingest = true; + ingest_opt.verify_checksums_readahead_size = size_t{2 * 1024 * 1024}; + + senv.count_random_reads_ = true; + senv.random_read_bytes_counter_ = 0; + ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt)); + + // Make sure the counter is enabled. + ASSERT_GT(senv.random_read_counter_.Read() - base_num_reads, 0); + + // The SST file is about 20MB. Readahead size is 2MB. + // Give a conservative 15 reads for metadata blocks, the number + // of random reads should be within 20 MB / 2MB + 15 = 25. + ASSERT_LE(senv.random_read_counter_.Read() - base_num_reads, 40); + + Destroy(options); +} + TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) { int kNumLevels = 7; Options options = CurrentOptions(); @@ -779,6 +886,48 @@ TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) { ASSERT_EQ(2, NumTableFilesAtLevel(options.num_levels - 1)); } +TEST_F(ExternalSSTFileBasicTest, AdjacentRangeDeletionTombstones) { + Options options = CurrentOptions(); + SstFileWriter sst_file_writer(EnvOptions(), options); + + // file8.sst (delete 300 => 400) + std::string file8 = sst_files_dir_ + "file8.sst"; + ASSERT_OK(sst_file_writer.Open(file8)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(300), Key(400))); + ExternalSstFileInfo file8_info; + Status s = sst_file_writer.Finish(&file8_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file8_info.file_path, file8); + ASSERT_EQ(file8_info.num_entries, 0); + ASSERT_EQ(file8_info.smallest_key, ""); + ASSERT_EQ(file8_info.largest_key, ""); + ASSERT_EQ(file8_info.num_range_del_entries, 1); + ASSERT_EQ(file8_info.smallest_range_del_key, Key(300)); + ASSERT_EQ(file8_info.largest_range_del_key, Key(400)); + + // file9.sst (delete 400 => 500) + std::string file9 = sst_files_dir_ + "file9.sst"; + ASSERT_OK(sst_file_writer.Open(file9)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500))); + ExternalSstFileInfo file9_info; + s = sst_file_writer.Finish(&file9_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file9_info.file_path, file9); + ASSERT_EQ(file9_info.num_entries, 0); + ASSERT_EQ(file9_info.smallest_key, ""); + ASSERT_EQ(file9_info.largest_key, ""); + ASSERT_EQ(file9_info.num_range_del_entries, 1); + ASSERT_EQ(file9_info.smallest_range_del_key, Key(400)); + ASSERT_EQ(file9_info.largest_range_del_key, Key(500)); + + // Range deletion tombstones are exclusive on their end key, so these SSTs + // should not be considered as overlapping.
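The comment above treats each range tombstone as a half-open interval, so two tombstones that merely share a boundary key do not overlap, which is why the DeprecatedAddFile call below is expected to succeed. A small standalone illustration of that check (plain C++, not RocksDB internals):

    #include <cassert>
    #include <string>

    // Half-open intervals [a_begin, a_end) and [b_begin, b_end) overlap only if
    // each one starts strictly before the other one ends.
    bool Overlap(const std::string& a_begin, const std::string& a_end,
                 const std::string& b_begin, const std::string& b_end) {
      return a_begin < b_end && b_begin < a_end;
    }

    int main() {
      // file8 deletes [300, 400) and file9 deletes [400, 500): they touch at key
      // 400 but do not overlap.
      assert(!Overlap("300", "400", "400", "500"));
      return 0;
    }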
+ s = DeprecatedAddFile({file8, file9}); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + DestroyAndRecreateExternalSSTFilesDir(); +} + TEST_P(ExternalSSTFileBasicTest, IngestFileWithBadBlockChecksum) { bool change_checksum_called = false; const auto& change_checksum = [&](void* arg) { @@ -921,6 +1070,47 @@ TEST_P(ExternalSSTFileBasicTest, IngestExternalFileWithCorruptedPropsBlock) { } while (ChangeOptionsForFileIngestionTest()); } +TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) { + Options options = CurrentOptions(); + + std::vector files; + { + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + ASSERT_OK(sst_file_writer.Put("a", "z")); + ASSERT_OK(sst_file_writer.Put("i", "m")); + ExternalSstFileInfo file1_info; + ASSERT_OK(sst_file_writer.Finish(&file1_info)); + files.push_back(std::move(file1)); + } + { + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file2 = sst_files_dir_ + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + ASSERT_OK(sst_file_writer.Put("i", "k")); + ExternalSstFileInfo file2_info; + ASSERT_OK(sst_file_writer.Finish(&file2_info)); + files.push_back(std::move(file2)); + } + + IngestExternalFileOptions ifo; + ASSERT_OK(db_->IngestExternalFile(files, ifo)); + ASSERT_EQ(Get("a"), "z"); + ASSERT_EQ(Get("i"), "k"); + + int total_keys = 0; + Iterator* iter = db_->NewIterator(ReadOptions()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + total_keys++; + } + delete iter; + ASSERT_EQ(total_keys, 2); + + ASSERT_EQ(2, NumTableFilesAtLevel(0)); +} + INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest, testing::Values(std::make_tuple(true, true), std::make_tuple(true, false), diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 28b481678ab..fd79ff1d0c1 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -7,24 +7,22 @@ #include "db/external_sst_file_ingestion_job.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include +#include #include +#include #include +#include "db/db_impl/db_impl.h" #include "db/version_edit.h" +#include "file/file_util.h" +#include "file/random_access_file_reader.h" #include "table/merging_iterator.h" #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" #include "table/table_builder.h" -#include "util/file_reader_writer.h" -#include "util/file_util.h" +#include "test_util/sync_point.h" #include "util/stop_watch.h" -#include "util/sync_point.h" namespace rocksdb { @@ -66,65 +64,107 @@ Status ExternalSstFileIngestionJob::Prepare( std::sort( sorted_files.begin(), sorted_files.end(), [&ucmp](const IngestedFileInfo* info1, const IngestedFileInfo* info2) { - return ucmp->Compare(info1->smallest_user_key, - info2->smallest_user_key) < 0; + return sstableKeyCompare(ucmp, info1->smallest_internal_key, + info2->smallest_internal_key) < 0; }); for (size_t i = 0; i < num_files - 1; i++) { - if (ucmp->Compare(sorted_files[i]->largest_user_key, - sorted_files[i + 1]->smallest_user_key) >= 0) { - return Status::NotSupported("Files have overlapping ranges"); + if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key, + sorted_files[i + 1]->smallest_internal_key) >= 0) { + files_overlap_ = true; + break; } } } + if (ingestion_options_.ingest_behind && files_overlap_) { + return 
Status::NotSupported("Files have overlapping ranges"); + } + for (IngestedFileInfo& f : files_to_ingest_) { if (f.num_entries == 0 && f.num_range_deletions == 0) { return Status::InvalidArgument("File contain no entries"); } - if (!f.smallest_internal_key().Valid() || - !f.largest_internal_key().Valid()) { + if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) { return Status::Corruption("Generated table have corrupted keys"); } } // Copy/Move external files into DB + std::unordered_set ingestion_path_ids; for (IngestedFileInfo& f : files_to_ingest_) { f.fd = FileDescriptor(next_file_number++, 0, f.file_size); - + f.copy_file = false; const std::string path_outside_db = f.external_file_path; const std::string path_inside_db = TableFileName(cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId()); - if (ingestion_options_.move_files) { status = env_->LinkFile(path_outside_db, path_inside_db); - if (status.IsNotSupported()) { - // Original file is on a different FS, use copy instead of hard linking - status = CopyFile(env_, path_outside_db, path_inside_db, 0, - db_options_.use_fsync); + if (status.ok()) { + // It is unsafe to assume application had sync the file and file + // directory before ingest the file. For integrity of RocksDB we need + // to sync the file. + std::unique_ptr file_to_sync; + status = env_->ReopenWritableFile(path_inside_db, &file_to_sync, + env_options_); + if (status.ok()) { + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::BeforeSyncIngestedFile"); + status = SyncIngestedFile(file_to_sync.get()); + TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncIngestedFile"); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync ingested file %s: %s", + path_inside_db.c_str(), status.ToString().c_str()); + } + } + } else if (status.IsNotSupported() && + ingestion_options_.failed_move_fall_back_to_copy) { + // Original file is on a different FS, use copy instead of hard linking. f.copy_file = true; - } else { - f.copy_file = false; } } else { + f.copy_file = true; + } + + if (f.copy_file) { + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile", + nullptr); + // CopyFile also sync the new file. status = CopyFile(env_, path_outside_db, path_inside_db, 0, db_options_.use_fsync); - f.copy_file = true; } TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded"); if (!status.ok()) { break; } f.internal_file_path = path_inside_db; + ingestion_path_ids.insert(f.fd.GetPathId()); + } + + TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir"); + if (status.ok()) { + for (auto path_id : ingestion_path_ids) { + status = directories_->GetDataDir(path_id)->Fsync(); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync directory %" ROCKSDB_PRIszt + " while ingest file: %s", + path_id, status.ToString().c_str()); + break; + } + } } + TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir"); + // TODO: The following is duplicated with Cleanup(). 
if (!status.ok()) { // We failed, remove all files that we copied into the db for (IngestedFileInfo& f : files_to_ingest_) { if (f.internal_file_path.empty()) { - break; + continue; } Status s = env_->DeleteFile(f.internal_file_path); if (!s.ok()) { @@ -142,8 +182,8 @@ Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed, SuperVersion* super_version) { autovector ranges; for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) { - ranges.emplace_back(file_to_ingest.smallest_user_key, - file_to_ingest.largest_user_key); + ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(), + file_to_ingest.largest_internal_key.user_key()); } Status status = cfd_->RangesOverlapWithMemtables(ranges, super_version, flush_needed); @@ -176,7 +216,7 @@ Status ExternalSstFileIngestionJob::Run() { } // It is safe to use this instead of LastAllocatedSequence since we are // the only active writer, and hence they are equal - const SequenceNumber last_seqno = versions_->LastSequence(); + SequenceNumber last_seqno = versions_->LastSequence(); edit_.SetColumnFamily(cfd_->GetID()); // The levels that the files will be ingested into @@ -186,8 +226,8 @@ Status ExternalSstFileIngestionJob::Run() { status = CheckLevelForIngestedBehindFile(&f); } else { status = AssignLevelAndSeqnoForIngestedFile( - super_version, force_global_seqno, cfd_->ioptions()->compaction_style, - &f, &assigned_seqno); + super_version, force_global_seqno, cfd_->ioptions()->compaction_style, + last_seqno, &f, &assigned_seqno); } if (!status.ok()) { return status; @@ -195,16 +235,30 @@ Status ExternalSstFileIngestionJob::Run() { status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno); TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run", &assigned_seqno); - if (assigned_seqno == last_seqno + 1) { - consumed_seqno_ = true; + if (assigned_seqno > last_seqno) { + assert(assigned_seqno == last_seqno + 1); + last_seqno = assigned_seqno; + ++consumed_seqno_count_; } if (!status.ok()) { return status; } + + // We use the import time as the ancester time. This is the time the data + // is written to the database. 
+ int64_t temp_current_time = 0; + uint64_t current_time = kUnknownFileCreationTime; + uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; + if (env_->GetCurrentTime(&temp_current_time).ok()) { + current_time = oldest_ancester_time = + static_cast(temp_current_time); + } + edit_.AddFile(f.picked_level, f.fd.GetNumber(), f.fd.GetPathId(), - f.fd.GetFileSize(), f.smallest_internal_key(), - f.largest_internal_key(), f.assigned_seqno, f.assigned_seqno, - false); + f.fd.GetFileSize(), f.smallest_internal_key, + f.largest_internal_key, f.assigned_seqno, f.assigned_seqno, + false, kInvalidBlobFileNumber, oldest_ancester_time, + current_time); } return status; } @@ -214,6 +268,13 @@ void ExternalSstFileIngestionJob::UpdateStats() { uint64_t total_keys = 0; uint64_t total_l0_files = 0; uint64_t total_time = env_->NowMicros() - job_start_time_; + + EventLoggerStream stream = event_logger_->Log(); + stream << "event" + << "ingest_finished"; + stream << "files_ingested"; + stream.StartArray(); + for (IngestedFileInfo& f : files_to_ingest_) { InternalStats::CompactionStats stats(CompactionReason::kExternalSstIngestion, 1); stats.micros = total_time; @@ -241,7 +302,18 @@ void ExternalSstFileIngestionJob::UpdateStats() { "(global_seqno=%" PRIu64 ")\n", f.external_file_path.c_str(), f.picked_level, f.internal_file_path.c_str(), f.assigned_seqno); + stream << "file" << f.internal_file_path << "level" << f.picked_level; + } + stream.EndArray(); + + stream << "lsm_state"; + stream.StartArray(); + auto vstorage = cfd_->current()->storage_info(); + for (int level = 0; level < vstorage->num_levels(); ++level) { + stream << vstorage->NumLevelFiles(level); } + stream.EndArray(); + cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_KEYS_TOTAL, total_keys); cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_FILES_TOTAL, @@ -255,6 +327,9 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) { // We failed to add the files to the database // remove all the files we copied for (IngestedFileInfo& f : files_to_ingest_) { + if (f.internal_file_path.empty()) { + continue; + } Status s = env_->DeleteFile(f.internal_file_path); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, @@ -262,7 +337,8 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) { f.internal_file_path.c_str(), s.ToString().c_str()); } } - consumed_seqno_ = false; + consumed_seqno_count_ = 0; + files_overlap_ = false; } else if (status.ok() && ingestion_options_.move_files) { // The files were moved and added successfully, remove original file links for (IngestedFileInfo& f : files_to_ingest_) { @@ -311,7 +387,13 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( } if (ingestion_options_.verify_checksums_before_ingest) { - status = table_reader->VerifyChecksum(); + // If customized readahead size is needed, we can pass a user option + // all the way to here. Right now we just rely on the default readahead + // to keep things simple. + ReadOptions ro; + ro.readahead_size = ingestion_options_.verify_checksums_readahead_size; + status = table_reader->VerifyChecksum( + ro, TableReaderCaller::kExternalSSTIngestion); } if (!status.ok()) { return status; @@ -371,11 +453,16 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( // updating the block cache. 
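The fill_cache handling that continues below is the usual way to keep a large one-off read from displacing hot blocks. A hypothetical application-side sketch of the same idea:

    #include <memory>
    #include "rocksdb/db.h"

    // Sketch: scan the whole DB once without populating the block cache.
    void ScanWithoutCachePollution(rocksdb::DB* db) {
      rocksdb::ReadOptions ro;
      ro.fill_cache = false;  // same trick the ingestion job uses for its bounds scan
      std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
      for (it->SeekToFirst(); it->Valid(); it->Next()) {
        // process it->key() / it->value()
      }
    }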
ro.fill_cache = false; std::unique_ptr iter(table_reader->NewIterator( - ro, sv->mutable_cf_options.prefix_extractor.get())); + ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion)); std::unique_ptr range_del_iter( table_reader->NewRangeTombstoneIterator(ro)); // Get first (smallest) and last (largest) key from file. + file_to_ingest->smallest_internal_key = + InternalKey("", 0, ValueType::kTypeValue); + file_to_ingest->largest_internal_key = + InternalKey("", 0, ValueType::kTypeValue); bool bounds_set = false; iter->SeekToFirst(); if (iter->Valid()) { @@ -385,7 +472,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( if (key.sequence != 0) { return Status::Corruption("external file have non zero sequence number"); } - file_to_ingest->smallest_user_key = key.user_key.ToString(); + file_to_ingest->smallest_internal_key.SetFrom(key); iter->SeekToLast(); if (!ParseInternalKey(iter->key(), &key)) { @@ -394,7 +481,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( if (key.sequence != 0) { return Status::Corruption("external file have non zero sequence number"); } - file_to_ingest->largest_user_key = key.user_key.ToString(); + file_to_ingest->largest_internal_key.SetFrom(key); bounds_set = true; } @@ -410,13 +497,17 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( } RangeTombstone tombstone(key, range_del_iter->value()); - if (!bounds_set || ucmp->Compare(tombstone.start_key_, - file_to_ingest->smallest_user_key) < 0) { - file_to_ingest->smallest_user_key = tombstone.start_key_.ToString(); + InternalKey start_key = tombstone.SerializeKey(); + if (!bounds_set || + sstableKeyCompare(ucmp, start_key, + file_to_ingest->smallest_internal_key) < 0) { + file_to_ingest->smallest_internal_key = start_key; } - if (!bounds_set || ucmp->Compare(tombstone.end_key_, - file_to_ingest->largest_user_key) > 0) { - file_to_ingest->largest_user_key = tombstone.end_key_.ToString(); + InternalKey end_key = tombstone.SerializeEndKey(); + if (!bounds_set || + sstableKeyCompare(ucmp, end_key, + file_to_ingest->largest_internal_key) > 0) { + file_to_ingest->largest_internal_key = end_key; } bounds_set = true; } @@ -431,13 +522,13 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style, - IngestedFileInfo* file_to_ingest, SequenceNumber* assigned_seqno) { + SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest, + SequenceNumber* assigned_seqno) { Status status; *assigned_seqno = 0; - const SequenceNumber last_seqno = versions_->LastSequence(); if (force_global_seqno) { *assigned_seqno = last_seqno + 1; - if (compaction_style == kCompactionStyleUniversal) { + if (compaction_style == kCompactionStyleUniversal || files_overlap_) { file_to_ingest->picked_level = 0; return status; } @@ -457,9 +548,10 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( if (vstorage->NumLevelFiles(lvl) > 0) { bool overlap_with_level = false; - status = sv->current->OverlapWithLevelIterator(ro, env_options_, - file_to_ingest->smallest_user_key, file_to_ingest->largest_user_key, - lvl, &overlap_with_level); + status = sv->current->OverlapWithLevelIterator( + ro, env_options_, file_to_ingest->smallest_internal_key.user_key(), + file_to_ingest->largest_internal_key.user_key(), lvl, + &overlap_with_level); if (!status.ok()) { return status; } @@ -499,6 
+591,12 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( target_level = lvl; } } + // If files overlap, we have to ingest them at level 0 and assign the newest + // sequence number + if (files_overlap_) { + target_level = 0; + *assigned_seqno = last_seqno + 1; + } TEST_SYNC_POINT_CALLBACK( "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile", &overlap_with_db); @@ -560,6 +658,18 @@ Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile( std::string seqno_val; PutFixed64(&seqno_val, seqno); status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val); + if (status.ok()) { + TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno"); + status = SyncIngestedFile(rwfile.get()); + TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync ingested file %s after writing global " + "sequence number: %s", + file_to_ingest->internal_file_path.c_str(), + status.ToString().c_str()); + } + } if (!status.ok()) { return status; } @@ -580,8 +690,9 @@ bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( } auto* vstorage = cfd_->current()->storage_info(); - Slice file_smallest_user_key(file_to_ingest->smallest_user_key); - Slice file_largest_user_key(file_to_ingest->largest_user_key); + Slice file_smallest_user_key( + file_to_ingest->smallest_internal_key.user_key()); + Slice file_largest_user_key(file_to_ingest->largest_internal_key.user_key()); if (vstorage->OverlapInLevel(level, &file_smallest_user_key, &file_largest_user_key)) { @@ -600,6 +711,16 @@ bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( return true; } +template +Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) { + assert(file != nullptr); + if (db_options_.use_fsync) { + return file->Fsync(); + } else { + return file->Sync(); + } +} + } // namespace rocksdb #endif // !ROCKSDB_LITE diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h index baa8e9f0f64..90b8326bbef 100644 --- a/db/external_sst_file_ingestion_job.h +++ b/db/external_sst_file_ingestion_job.h @@ -12,6 +12,7 @@ #include "db/dbformat.h" #include "db/internal_stats.h" #include "db/snapshot_impl.h" +#include "logging/event_logger.h" #include "options/db_options.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -20,13 +21,15 @@ namespace rocksdb { +class Directories; + struct IngestedFileInfo { // External file path std::string external_file_path; - // Smallest user key in external file - std::string smallest_user_key; - // Largest user key in external file - std::string largest_user_key; + // Smallest internal key in external file + InternalKey smallest_internal_key; + // Largest internal key in external file + InternalKey largest_internal_key; // Sequence number for keys in external file SequenceNumber original_seqno; // Offset of the global sequence number field in the file, will @@ -60,15 +63,6 @@ struct IngestedFileInfo { // ingestion_options.move_files is false by default, thus copy_file is true // by default. 
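Taken together with the Prepare() change earlier, the files_overlap_ handling above means mutually overlapping external files are no longer rejected outright (they are only rejected when ingest_behind is set): every such file is placed in L0 and each one is assigned a fresh global sequence number. As a worked example, if LastSequence() is 100 and three overlapping files are ingested in one call, they receive sequence numbers 101, 102 and 103, consumed_seqno_count_ ends up as 3, and all three files land in level 0.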
bool copy_file = true; - - InternalKey smallest_internal_key() const { - return InternalKey(smallest_user_key, assigned_seqno, - ValueType::kTypeValue); - } - - InternalKey largest_internal_key() const { - return InternalKey(largest_user_key, assigned_seqno, ValueType::kTypeValue); - } }; class ExternalSstFileIngestionJob { @@ -77,7 +71,8 @@ class ExternalSstFileIngestionJob { Env* env, VersionSet* versions, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, const EnvOptions& env_options, SnapshotList* db_snapshots, - const IngestExternalFileOptions& ingestion_options) + const IngestExternalFileOptions& ingestion_options, + Directories* directories, EventLogger* event_logger) : env_(env), versions_(versions), cfd_(cfd), @@ -85,8 +80,12 @@ class ExternalSstFileIngestionJob { env_options_(env_options), db_snapshots_(db_snapshots), ingestion_options_(ingestion_options), + directories_(directories), + event_logger_(event_logger), job_start_time_(env_->NowMicros()), - consumed_seqno_(false) {} + consumed_seqno_count_(0) { + assert(directories != nullptr); + } // Prepare the job by copying external files into the DB. Status Prepare(const std::vector& external_files_paths, @@ -119,8 +118,8 @@ class ExternalSstFileIngestionJob { return files_to_ingest_; } - // Whether to increment VersionSet's seqno after this job runs - bool ShouldIncrementLastSequence() const { return consumed_seqno_; } + // How many sequence numbers did we consume as part of the ingest job? + int ConsumedSequenceNumbersCount() const { return consumed_seqno_count_; } private: // Open the external file and populate `file_to_ingest` with all the @@ -135,6 +134,7 @@ class ExternalSstFileIngestionJob { Status AssignLevelAndSeqnoForIngestedFile(SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style, + SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest, SequenceNumber* assigned_seqno); @@ -153,6 +153,10 @@ class ExternalSstFileIngestionJob { bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest, int level); + // Helper method to sync given file. + template + Status SyncIngestedFile(TWritableFile* file); + Env* env_; VersionSet* versions_; ColumnFamilyData* cfd_; @@ -161,9 +165,14 @@ class ExternalSstFileIngestionJob { SnapshotList* db_snapshots_; autovector files_to_ingest_; const IngestExternalFileOptions& ingestion_options_; + Directories* directories_; + EventLogger* event_logger_; VersionEdit edit_; uint64_t job_start_time_; - bool consumed_seqno_; + int consumed_seqno_count_; + // Set in ExternalSstFileIngestionJob::Prepare(), if true all files are + // ingested in L0 + bool files_overlap_{false}; }; } // namespace rocksdb diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index cbbb2fa2627..3a059773f33 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -7,15 +7,63 @@ #include #include "db/db_test_util.h" +#include "file/filename.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_writer.h" -#include "util/fault_injection_test_env.h" -#include "util/filename.h" -#include "util/testutil.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/testutil.h" namespace rocksdb { +// A test environment that can be configured to fail the Link operation. 
+class ExternalSSTTestEnv : public EnvWrapper { + public: + ExternalSSTTestEnv(Env* t, bool fail_link) + : EnvWrapper(t), fail_link_(fail_link) {} + + Status LinkFile(const std::string& s, const std::string& t) override { + if (fail_link_) { + return Status::NotSupported("Link failed"); + } + return target()->LinkFile(s, t); + } + + void set_fail_link(bool fail_link) { fail_link_ = fail_link; } + + private: + bool fail_link_; +}; + +class ExternSSTFileLinkFailFallbackTest + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + ExternSSTFileLinkFailFallbackTest() + : DBTestBase("/external_sst_file_test"), + test_env_(new ExternalSSTTestEnv(env_, true)) { + sst_files_dir_ = dbname_ + "/sst_files/"; + test::DestroyDir(env_, sst_files_dir_); + env_->CreateDir(sst_files_dir_); + options_ = CurrentOptions(); + options_.disable_auto_compactions = true; + options_.env = test_env_; + } + + void TearDown() override { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options_)); + delete test_env_; + test_env_ = nullptr; + } + + protected: + std::string sst_files_dir_; + Options options_; + ExternalSSTTestEnv* test_env_; +}; + class ExternalSSTFileTest : public DBTestBase, public ::testing::WithParamInterface> { @@ -648,10 +696,10 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_EQ(file6_info.smallest_range_del_key, Key(0)); ASSERT_EQ(file6_info.largest_range_del_key, Key(100)); - // file7.sst (delete 100 => 200) + // file7.sst (delete 99 => 201) std::string file7 = sst_files_dir_ + "file7.sst"; ASSERT_OK(sst_file_writer.Open(file7)); - ASSERT_OK(sst_file_writer.DeleteRange(Key(100), Key(200))); + ASSERT_OK(sst_file_writer.DeleteRange(Key(99), Key(201))); ExternalSstFileInfo file7_info; s = sst_file_writer.Finish(&file7_info); ASSERT_TRUE(s.ok()) << s.ToString(); @@ -660,8 +708,8 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_EQ(file7_info.smallest_key, ""); ASSERT_EQ(file7_info.largest_key, ""); ASSERT_EQ(file7_info.num_range_del_entries, 1); - ASSERT_EQ(file7_info.smallest_range_del_key, Key(100)); - ASSERT_EQ(file7_info.largest_range_del_key, Key(200)); + ASSERT_EQ(file7_info.smallest_range_del_key, Key(99)); + ASSERT_EQ(file7_info.largest_range_del_key, Key(201)); // list 1 has internal key range conflict std::vector file_list0({file1, file2}); @@ -676,9 +724,7 @@ TEST_F(ExternalSSTFileTest, AddList) { // These lists of files have key ranges that overlap with each other s = DeprecatedAddFile(file_list1); ASSERT_FALSE(s.ok()) << s.ToString(); - // Both of the following overlap on the end key of a range deletion - // tombstone. This is a limitation because these tombstones have exclusive - // end keys that should not count as overlapping with other keys. + // Both of the following overlap on the range deletion tombstone. s = DeprecatedAddFile(file_list4); ASSERT_FALSE(s.ok()) << s.ToString(); s = DeprecatedAddFile(file_list5); @@ -2014,17 +2060,23 @@ TEST_F(ExternalSSTFileTest, FileWithCFInfo) { } /* - * Test and verify the functionality of ingestion_options.move_files. 
+ * Test and verify the functionality of ingestion_options.move_files and + * ingestion_options.failed_move_fall_back_to_copy */ -TEST_F(ExternalSSTFileTest, LinkExternalSst) { - Options options = CurrentOptions(); - options.disable_auto_compactions = true; - DestroyAndReopen(options); +TEST_P(ExternSSTFileLinkFailFallbackTest, LinkFailFallBackExternalSst) { + const bool fail_link = std::get<0>(GetParam()); + const bool failed_move_fall_back_to_copy = std::get<1>(GetParam()); + test_env_->set_fail_link(fail_link); + const EnvOptions env_options; + DestroyAndReopen(options_); const int kNumKeys = 10000; + IngestExternalFileOptions ifo; + ifo.move_files = true; + ifo.failed_move_fall_back_to_copy = failed_move_fall_back_to_copy; std::string file_path = sst_files_dir_ + "file1.sst"; // Create SstFileWriter for default column family - SstFileWriter sst_file_writer(EnvOptions(), options); + SstFileWriter sst_file_writer(env_options, options_); ASSERT_OK(sst_file_writer.Open(file_path)); for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value")); @@ -2033,9 +2085,13 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) { uint64_t file_size = 0; ASSERT_OK(env_->GetFileSize(file_path, &file_size)); - IngestExternalFileOptions ifo; - ifo.move_files = true; - ASSERT_OK(db_->IngestExternalFile({file_path}, ifo)); + bool copyfile = false; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Prepare:CopyFile", + [&](void* /* arg */) { copyfile = true; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + const Status s = db_->IngestExternalFile({file_path}, ifo); ColumnFamilyHandleImpl* cfh = static_cast(dbfull()->DefaultColumnFamily()); @@ -2049,18 +2105,29 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) { bytes_copied += stats.bytes_written; bytes_moved += stats.bytes_moved; } - // If bytes_moved > 0, it means external sst resides on the same FS - // supporting hard link operation. Therefore, - // 0 bytes should be copied, and the bytes_moved == file_size. - // Otherwise, FS does not support hard link, or external sst file resides on - // a different file system, then the bytes_copied should be equal to - // file_size. - if (bytes_moved > 0) { + + if (!fail_link) { + // Link operation succeeds. External SST should be moved. + ASSERT_OK(s); ASSERT_EQ(0, bytes_copied); ASSERT_EQ(file_size, bytes_moved); + ASSERT_FALSE(copyfile); } else { - ASSERT_EQ(file_size, bytes_copied); + // Link operation fails. + ASSERT_EQ(0, bytes_moved); + if (failed_move_fall_back_to_copy) { + ASSERT_OK(s); + // Copy file is true since a failed link falls back to copy file. + ASSERT_TRUE(copyfile); + ASSERT_EQ(file_size, bytes_copied); + } else { + ASSERT_TRUE(s.IsNotSupported()); + // Copy file is false since a failed link does not fall back to copy file. 
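From the application's point of view, the behavior verified above comes down to one option on IngestExternalFileOptions. A hedged sketch (helper name and path are hypothetical):

    #include <string>
    #include "rocksdb/db.h"

    // Sketch: request a move, but allow RocksDB to fall back to copying when the
    // hard link cannot be created (for example, the file is on another filesystem).
    // With failed_move_fall_back_to_copy set to false, a failed link surfaces as
    // Status::NotSupported instead.
    rocksdb::Status MoveOrCopyIngest(rocksdb::DB* db, const std::string& sst_path) {
      rocksdb::IngestExternalFileOptions opts;
      opts.move_files = true;
      opts.failed_move_fall_back_to_copy = true;
      return db->IngestExternalFile({sst_path}, opts);
    }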
+ ASSERT_FALSE(copyfile); + ASSERT_EQ(0, bytes_copied); + } } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } class TestIngestExternalFileListener : public EventListener { @@ -2300,10 +2367,11 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) { new FaultInjectionTestEnv(env_)); Options options = CurrentOptions(); options.env = fault_injection_env.get(); - CreateAndReopenWithCF({"pikachu"}, options); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); std::vector column_families; column_families.push_back(handles_[0]); column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); std::vector ifos(column_families.size()); for (auto& ifo : ifos) { ifo.allow_global_seqno = true; // Always allow global_seqno @@ -2317,6 +2385,9 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) { {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); data.push_back( {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); + // Resize the true_data vector upon construction to avoid re-alloc std::vector> true_data( column_families.size()); @@ -2324,8 +2395,9 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) { -1, true, true_data); ASSERT_OK(s); Close(); - ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); - ASSERT_EQ(2, handles_.size()); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + ASSERT_EQ(3, handles_.size()); int cf = 0; for (const auto& verify_map : true_data) { for (const auto& elem : verify_map) { @@ -2357,10 +2429,11 @@ TEST_P(ExternalSSTFileTest, Options options = CurrentOptions(); options.env = fault_injection_env.get(); - CreateAndReopenWithCF({"pikachu"}, options); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); const std::vector> data_before_ingestion = {{{"foo1", "fv1_0"}, {"foo2", "fv2_0"}, {"foo3", "fv3_0"}}, - {{"bar1", "bv1_0"}, {"bar2", "bv2_0"}, {"bar3", "bv3_0"}}}; + {{"bar1", "bv1_0"}, {"bar2", "bv2_0"}, {"bar3", "bv3_0"}}, + {{"bar4", "bv4_0"}, {"bar5", "bv5_0"}, {"bar6", "bv6_0"}}}; for (size_t i = 0; i != handles_.size(); ++i) { int cf = static_cast(i); const auto& orig_data = data_before_ingestion[i]; @@ -2373,6 +2446,7 @@ TEST_P(ExternalSSTFileTest, std::vector column_families; column_families.push_back(handles_[0]); column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); std::vector ifos(column_families.size()); for (auto& ifo : ifos) { ifo.allow_global_seqno = true; // Always allow global_seqno @@ -2386,6 +2460,8 @@ TEST_P(ExternalSSTFileTest, {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); data.push_back( {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); // Resize the true_data vector upon construction to avoid re-alloc std::vector> true_data( column_families.size()); @@ -2439,10 +2515,11 @@ TEST_P(ExternalSSTFileTest, dbfull()->ReleaseSnapshot(read_opts.snapshot); Close(); - ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); // Should see consistent state after ingestion for all column families even // without snapshot. 
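The multi-column-family tests above and below drive DBImpl::IngestExternalFiles, the batched, all-or-nothing entry point. A sketch of calling it from application code, assuming the IngestExternalFileArg struct declared in include/rocksdb/db.h (file paths hypothetical):

    #include <string>
    #include <vector>
    #include "rocksdb/db.h"

    // Sketch: ingest one file into each of two column families atomically;
    // either both column families get their file or neither does.
    rocksdb::Status IngestIntoTwoColumnFamilies(rocksdb::DB* db,
                                                rocksdb::ColumnFamilyHandle* cf0,
                                                rocksdb::ColumnFamilyHandle* cf1) {
      std::vector<rocksdb::IngestExternalFileArg> args(2);
      args[0].column_family = cf0;
      args[0].external_files = {"/tmp/cf0.sst"};
      args[0].options.allow_global_seqno = true;
      args[1].column_family = cf1;
      args[1].external_files = {"/tmp/cf1.sst"};
      args[1].options.allow_global_seqno = true;
      return db->IngestExternalFiles(args);
    }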
- ASSERT_EQ(2, handles_.size()); + ASSERT_EQ(3, handles_.size()); int cf = 0; for (const auto& verify_map : true_data) { for (const auto& elem : verify_map) { @@ -2472,10 +2549,11 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_PrepareFail) { "DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1"}, }); SyncPoint::GetInstance()->EnableProcessing(); - CreateAndReopenWithCF({"pikachu"}, options); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); std::vector column_families; column_families.push_back(handles_[0]); column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); std::vector ifos(column_families.size()); for (auto& ifo : ifos) { ifo.allow_global_seqno = true; // Always allow global_seqno @@ -2489,6 +2567,9 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_PrepareFail) { {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); data.push_back( {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); + // Resize the true_data vector upon construction to avoid re-alloc std::vector> true_data( column_families.size()); @@ -2508,8 +2589,9 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_PrepareFail) { fault_injection_env->SetFilesystemActive(true); Close(); - ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); - ASSERT_EQ(2, handles_.size()); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + ASSERT_EQ(3, handles_.size()); int cf = 0; for (const auto& verify_map : true_data) { for (const auto& elem : verify_map) { @@ -2538,10 +2620,11 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_CommitFail) { "DBImpl::IngestExternalFiles:BeforeJobsRun:1"}, }); SyncPoint::GetInstance()->EnableProcessing(); - CreateAndReopenWithCF({"pikachu"}, options); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); std::vector column_families; column_families.push_back(handles_[0]); column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); std::vector ifos(column_families.size()); for (auto& ifo : ifos) { ifo.allow_global_seqno = true; // Always allow global_seqno @@ -2555,6 +2638,8 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_CommitFail) { {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); data.push_back( {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); // Resize the true_data vector upon construction to avoid re-alloc std::vector> true_data( column_families.size()); @@ -2574,8 +2659,9 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_CommitFail) { fault_injection_env->SetFilesystemActive(true); Close(); - ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); - ASSERT_EQ(2, handles_.size()); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + ASSERT_EQ(3, handles_.size()); int cf = 0; for (const auto& verify_map : true_data) { for (const auto& elem : verify_map) { @@ -2595,7 +2681,7 @@ TEST_P(ExternalSSTFileTest, Options options = CurrentOptions(); options.env = fault_injection_env.get(); - CreateAndReopenWithCF({"pikachu"}, options); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); SyncPoint::GetInstance()->ClearTrace(); SyncPoint::GetInstance()->DisableProcessing(); @@ -2613,6 +2699,7 @@ 
TEST_P(ExternalSSTFileTest, std::vector column_families; column_families.push_back(handles_[0]); column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); std::vector ifos(column_families.size()); for (auto& ifo : ifos) { ifo.allow_global_seqno = true; // Always allow global_seqno @@ -2626,6 +2713,8 @@ TEST_P(ExternalSSTFileTest, {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); data.push_back( {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); // Resize the true_data vector upon construction to avoid re-alloc std::vector> true_data( column_families.size()); @@ -2646,8 +2735,9 @@ TEST_P(ExternalSSTFileTest, fault_injection_env->DropUnsyncedFileData(); fault_injection_env->SetFilesystemActive(true); Close(); - ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); - ASSERT_EQ(2, handles_.size()); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + ASSERT_EQ(3, handles_.size()); int cf = 0; for (const auto& verify_map : true_data) { for (const auto& elem : verify_map) { @@ -2660,12 +2750,38 @@ TEST_P(ExternalSSTFileTest, Destroy(options, true /* delete_cf_paths */); } +TEST_P(ExternalSSTFileTest, IngestFilesTriggerFlushingWithTwoWriteQueue) { + Options options = CurrentOptions(); + // Use large buffer to avoid memtable flush + options.write_buffer_size = 1024 * 1024; + options.two_write_queues = true; + DestroyAndReopen(options); + + ASSERT_OK(dbfull()->Put(WriteOptions(), "1000", "v1")); + ASSERT_OK(dbfull()->Put(WriteOptions(), "1001", "v1")); + ASSERT_OK(dbfull()->Put(WriteOptions(), "9999", "v1")); + + // Put one key which is overlap with keys in memtable. + // It will trigger flushing memtable and require this thread is + // currently at the front of the 2nd writer queue. We must make + // sure that it won't enter the 2nd writer queue for the second time. + std::vector> data; + data.push_back(std::make_pair("1001", "v2")); + GenerateAndAddExternalFile(options, data); +} + INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest, testing::Values(std::make_tuple(false, false), std::make_tuple(false, true), std::make_tuple(true, false), std::make_tuple(true, true))); +INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest, + ExternSSTFileLinkFailFallbackTest, + testing::Values(std::make_tuple(true, false), + std::make_tuple(true, true), + std::make_tuple(false, false))); + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 53de312c017..1d18569f2f4 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -11,22 +11,22 @@ // the last "sync". It then checks for data loss errors by purposely dropping // file data (or entire files) not protected by a "sync". 
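The header comment above summarizes the crash-simulation approach; the same FaultInjectionTestEnv calls appear in the ingestion tests earlier in this patch. A condensed sketch of the pattern (this is a test-only helper from test_util/, not a public API):

    #include "test_util/fault_injection_test_env.h"

    // Sketch: simulate a crash by dropping everything that was never synced,
    // then bring the "filesystem" back so the DB can be reopened and checked.
    void SimulateCrash(rocksdb::FaultInjectionTestEnv* fault_env) {
      fault_env->SetFilesystemActive(false);  // start failing writes
      fault_env->DropUnsyncedFileData();      // lose all unsynced file contents
      fault_env->SetFilesystemActive(true);   // power comes back
      // ...reopen the DB (with Options::env pointing at fault_env) and verify...
    }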
-#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/log_format.h" #include "db/version_set.h" #include "env/mock_env.h" +#include "file/filename.h" +#include "logging/logging.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" -#include "util/fault_injection_test_env.h" -#include "util/filename.h" -#include "util/logging.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/mutexlock.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { diff --git a/db/file_indexer.h b/db/file_indexer.h index 1bef3aab0ca..2091f80292b 100644 --- a/db/file_indexer.h +++ b/db/file_indexer.h @@ -12,8 +12,8 @@ #include #include #include +#include "memory/arena.h" #include "port/port.h" -#include "util/arena.h" #include "util/autovector.h" namespace rocksdb { diff --git a/db/file_indexer_test.cc b/db/file_indexer_test.cc index 935a01ef8dd..6942aa682d6 100644 --- a/db/file_indexer_test.cc +++ b/db/file_indexer_test.cc @@ -7,14 +7,14 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include #include "db/file_indexer.h" +#include #include "db/dbformat.h" #include "db/version_edit.h" #include "port/stack_trace.h" #include "rocksdb/comparator.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/filename_test.cc b/db/filename_test.cc index d6bde52834e..bc52e0eae64 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -7,12 +7,12 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "util/filename.h" +#include "file/filename.h" #include "db/dbformat.h" +#include "logging/logging.h" #include "port/port.h" -#include "util/logging.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/flush_job.cc b/db/flush_job.cc index 4226589e79d..bdb4c179bd8 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -9,11 +9,7 @@ #include "db/flush_job.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include @@ -29,6 +25,11 @@ #include "db/merge_context.h" #include "db/range_tombstone_fragmenter.h" #include "db/version_set.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "logging/event_logger.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" @@ -38,20 +39,15 @@ #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" +#include "test_util/sync_point.h" #include "util/coding.h" -#include "util/event_logger.h" -#include "util/file_util.h" -#include "util/filename.h" -#include "util/log_buffer.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/stop_watch.h" -#include "util/sync_point.h" namespace rocksdb { @@ -229,10 +225,12 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, // This will release and re-acquire the mutex. Status s = WriteLevel0Table(); - if (s.ok() && - (shutting_down_->load(std::memory_order_acquire) || cfd_->IsDropped())) { - s = Status::ShutdownInProgress( - "Database shutdown or Column family drop during flush"); + if (s.ok() && cfd_->IsDropped()) { + s = Status::ColumnFamilyDropped("Column family dropped during compaction"); + } + if ((s.ok() || s.IsColumnFamilyDropped()) && + shutting_down_->load(std::memory_order_acquire)) { + s = Status::ShutdownInProgress("Database shutdown"); } if (!s.ok()) { @@ -243,7 +241,7 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, s = cfd_->imm()->TryInstallMemtableFlushResults( cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_, meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_, - log_buffer_); + log_buffer_, &committed_flush_jobs_info_); } if (s.ok() && file_meta != nullptr) { @@ -367,6 +365,11 @@ Status FlushJob::WriteLevel0Table() { uint64_t oldest_key_time = mems_.front()->ApproximateOldestKeyTime(); + // It's not clear whether oldest_key_time is always available. In case + // it is not available, use current_time. 
+ meta_.oldest_ancester_time = std::min(current_time, oldest_key_time); + meta_.file_creation_time = current_time; + s = BuildTable( dbname_, db_options_.env, *cfd_->ioptions(), mutable_cf_options_, env_options_, cfd_->table_cache(), iter.get(), @@ -394,7 +397,7 @@ Status FlushJob::WriteLevel0Table() { if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) { s = output_file_directory_->Fsync(); } - TEST_SYNC_POINT("FlushJob::WriteLevel0Table"); + TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_); db_mutex_->Lock(); } base_->Unref(); @@ -410,8 +413,13 @@ Status FlushJob::WriteLevel0Table() { edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(), meta_.fd.GetFileSize(), meta_.smallest, meta_.largest, meta_.fd.smallest_seqno, meta_.fd.largest_seqno, - meta_.marked_for_compaction); + meta_.marked_for_compaction, meta_.oldest_blob_file_number, + meta_.oldest_ancester_time, meta_.file_creation_time); } +#ifndef ROCKSDB_LITE + // Piggyback FlushJobInfo on the first first flushed memtable. + mems_[0]->SetFlushJobInfo(GetFlushJobInfo()); +#endif // !ROCKSDB_LITE // Note that here we treat flush as level 0 compaction in internal stats InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); @@ -426,4 +434,26 @@ Status FlushJob::WriteLevel0Table() { return s; } +#ifndef ROCKSDB_LITE +std::unique_ptr FlushJob::GetFlushJobInfo() const { + db_mutex_->AssertHeld(); + std::unique_ptr info(new FlushJobInfo{}); + info->cf_id = cfd_->GetID(); + info->cf_name = cfd_->GetName(); + + const uint64_t file_number = meta_.fd.GetNumber(); + info->file_path = + MakeTableFileName(cfd_->ioptions()->cf_paths[0].path, file_number); + info->file_number = file_number; + info->oldest_blob_file_number = meta_.oldest_blob_file_number; + info->thread_id = db_options_.env->GetThreadID(); + info->job_id = job_context_->job_id; + info->smallest_seqno = meta_.fd.smallest_seqno; + info->largest_seqno = meta_.fd.largest_seqno; + info->table_properties = table_properties_; + info->flush_reason = cfd_->GetFlushReason(); + return info; +} +#endif // !ROCKSDB_LITE + } // namespace rocksdb diff --git a/db/flush_job.h b/db/flush_job.h index c4081945623..b25aca3529c 100644 --- a/db/flush_job.h +++ b/db/flush_job.h @@ -11,10 +11,11 @@ #include #include #include +#include #include +#include #include #include -#include #include "db/column_family.h" #include "db/dbformat.h" @@ -28,16 +29,17 @@ #include "db/version_edit.h" #include "db/write_controller.h" #include "db/write_thread.h" +#include "logging/event_logger.h" #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/listener.h" #include "rocksdb/memtablerep.h" #include "rocksdb/transaction_log.h" #include "table/scoped_arena_iterator.h" #include "util/autovector.h" -#include "util/event_logger.h" #include "util/stop_watch.h" #include "util/thread_local.h" @@ -79,14 +81,22 @@ class FlushJob { Status Run(LogsWithPrepTracker* prep_tracker = nullptr, FileMetaData* file_meta = nullptr); void Cancel(); - TableProperties GetTableProperties() const { return table_properties_; } const autovector& GetMemTables() const { return mems_; } +#ifndef ROCKSDB_LITE + std::list>* GetCommittedFlushJobsInfo() { + return &committed_flush_jobs_info_; + } +#endif // !ROCKSDB_LITE + private: void ReportStartedFlush(); void ReportFlushInputSize(const autovector& mems); void RecordFlushIOStats(); Status WriteLevel0Table(); +#ifndef ROCKSDB_LITE 
+ std::unique_ptr GetFlushJobInfo() const; +#endif // !ROCKSDB_LITE const std::string& dbname_; ColumnFamilyData* cfd_; @@ -131,6 +141,10 @@ class FlushJob { // In this case, only after all flush jobs succeed in flush can RocksDB // commit to the MANIFEST. const bool write_manifest_; + // The current flush job can commit flush result of a concurrent flush job. + // We collect FlushJobInfo of all jobs committed by current job and fire + // OnFlushCompleted for them. + std::list> committed_flush_jobs_info_; // Variables below are set by PickMemTable(): FileMetaData meta_; diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 199ed29cacc..fec7379427a 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -4,19 +4,22 @@ // (found in the LICENSE.Apache file in the root directory). #include +#include #include #include +#include "db/blob_index.h" #include "db/column_family.h" +#include "db/db_impl/db_impl.h" #include "db/flush_job.h" #include "db/version_set.h" +#include "file/writable_file_writer.h" #include "rocksdb/cache.h" #include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" -#include "util/file_reader_writer.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -35,7 +38,8 @@ class FlushJobTest : public testing::Test { write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)), + &write_controller_, + /*block_cache_tracer=*/nullptr)), shutting_down_(false), mock_table_factory_(new mock::MockTableFactory()) { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); @@ -54,7 +58,14 @@ class FlushJobTest : public testing::Test { } void NewDB() { + SetIdentityFile(env_, dbname_); VersionEdit new_db; + if (db_options_.write_dbid_to_manifest) { + DBImpl* impl = new DBImpl(DBOptions(), dbname_); + std::string db_id; + impl->GetDbIdentityFromIdentityFile(&db_id); + new_db.SetDBId(db_id); + } new_db.SetLogNumber(0); new_db.SetNextFile(2); new_db.SetLastSequence(0); @@ -145,6 +156,7 @@ TEST_F(FlushJobTest, NonEmpty) { // seqno [ 1, 2 ... 8998, 8999, 9000, 9001, 9002 ... 9999 ] // key [ 1001, 1002 ... 9998, 9999, 0, 1, 2 ... 999 ] // range-delete "9995" -> "9999" at seqno 10000 + // blob references with seqnos 10001..10006 for (int i = 1; i < 10000; ++i) { std::string key(ToString((i + 1000) % 10000)); std::string value("value" + key); @@ -154,9 +166,43 @@ TEST_F(FlushJobTest, NonEmpty) { inserted_keys.insert({internal_key.Encode().ToString(), value}); } } - new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", "9999a"); - InternalKey internal_key("9995", SequenceNumber(10000), kTypeRangeDeletion); - inserted_keys.insert({internal_key.Encode().ToString(), "9999a"}); + + { + new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", "9999a"); + InternalKey internal_key("9995", SequenceNumber(10000), kTypeRangeDeletion); + inserted_keys.insert({internal_key.Encode().ToString(), "9999a"}); + } + +#ifndef ROCKSDB_LITE + // Note: the first two blob references will not be considered when resolving + // the oldest blob file referenced (the first one is inlined TTL, while the + // second one is TTL and thus points to a TTL blob file). 
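The note above is easier to follow against the numbers used just below: the test emits blob references with blob file numbers {kInvalidBlobFileNumber, 5, 103, 17, 102, 101}. The first entry is an inlined TTL reference and the second a TTL blob reference, so only {103, 17, 102, 101} are considered when resolving the oldest referenced blob file; their minimum is 17, which is exactly what the test later asserts for file_meta.oldest_blob_file_number.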
+ constexpr std::array blob_file_numbers{ + kInvalidBlobFileNumber, 5, 103, 17, 102, 101}; + for (size_t i = 0; i < blob_file_numbers.size(); ++i) { + std::string key(ToString(i + 10001)); + std::string blob_index; + if (i == 0) { + BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 1234567890ULL, + "foo"); + } else if (i == 1) { + BlobIndex::EncodeBlobTTL(&blob_index, /* expiration */ 1234567890ULL, + blob_file_numbers[i], /* offset */ i << 10, + /* size */ i << 20, kNoCompression); + } else { + BlobIndex::EncodeBlob(&blob_index, blob_file_numbers[i], + /* offset */ i << 10, /* size */ i << 20, + kNoCompression); + } + + const SequenceNumber seq(i + 10001); + new_mem->Add(seq, kTypeBlobIndex, key, blob_index); + + InternalKey internal_key(key, seq, kTypeBlobIndex); + inserted_keys.emplace_hint(inserted_keys.end(), + internal_key.Encode().ToString(), blob_index); + } +#endif autovector to_delete; cfd->imm()->Add(new_mem, &to_delete); @@ -185,11 +231,14 @@ TEST_F(FlushJobTest, NonEmpty) { ASSERT_GT(hist.average, 0.0); ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString()); - ASSERT_EQ( - "9999a", - file_meta.largest.user_key().ToString()); // range tombstone end key + ASSERT_EQ("9999a", file_meta.largest.user_key().ToString()); ASSERT_EQ(1, file_meta.fd.smallest_seqno); - ASSERT_EQ(10000, file_meta.fd.largest_seqno); // range tombstone seqnum 10000 +#ifndef ROCKSDB_LITE + ASSERT_EQ(10006, file_meta.fd.largest_seqno); + ASSERT_EQ(17, file_meta.oldest_blob_file_number); +#else + ASSERT_EQ(10000, file_meta.fd.largest_seqno); +#endif mock_table_factory_->AssertSingleFile(inserted_keys); job_context.Clean(); } @@ -252,6 +301,7 @@ TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) { ASSERT_EQ(0, file_meta.fd.smallest_seqno); ASSERT_EQ(SequenceNumber(num_mems_to_flush * num_keys_per_table - 1), file_meta.fd.largest_seqno); + ASSERT_EQ(kInvalidBlobFileNumber, file_meta.oldest_blob_file_number); for (auto m : to_delete) { delete m; @@ -297,18 +347,18 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { EventLogger event_logger(db_options_.info_log.get()); SnapshotChecker* snapshot_checker = nullptr; // not relevant - std::vector flush_jobs; + std::vector> flush_jobs; k = 0; for (auto cfd : all_cfds) { std::vector snapshot_seqs; - flush_jobs.emplace_back( + flush_jobs.emplace_back(new FlushJob( dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), &memtable_ids[k], env_options_, versions_.get(), &mutex_, &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, true, false /* sync_output_directory */, false /* write_manifest */, - Env::Priority::USER); + Env::Priority::USER)); k++; } HistogramData hist; @@ -317,12 +367,12 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { file_metas.reserve(flush_jobs.size()); mutex_.Lock(); for (auto& job : flush_jobs) { - job.PickMemTable(); + job->PickMemTable(); } for (auto& job : flush_jobs) { FileMetaData meta; // Run will release and re-acquire mutex - ASSERT_OK(job.Run(nullptr /**/, &meta)); + ASSERT_OK(job->Run(nullptr /**/, &meta)); file_metas.emplace_back(meta); } autovector file_meta_ptrs; @@ -331,7 +381,7 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { } autovector*> mems_list; for (size_t i = 0; i != all_cfds.size(); ++i) { - const auto& mems = flush_jobs[i].GetMemTables(); + const auto& mems = flush_jobs[i]->GetMemTables(); mems_list.push_back(&mems); } autovector 
mutable_cf_options_list; diff --git a/db/flush_scheduler.cc b/db/flush_scheduler.cc index 8735a6b369b..cbcb5ce49f9 100644 --- a/db/flush_scheduler.cc +++ b/db/flush_scheduler.cc @@ -11,11 +11,13 @@ namespace rocksdb { -void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) { +void FlushScheduler::ScheduleWork(ColumnFamilyData* cfd) { #ifndef NDEBUG - std::lock_guard lock(checking_mutex_); - assert(checking_set_.count(cfd) == 0); - checking_set_.insert(cfd); + { + std::lock_guard lock(checking_mutex_); + assert(checking_set_.count(cfd) == 0); + checking_set_.insert(cfd); + } #endif // NDEBUG cfd->Ref(); // Suppress false positive clang analyzer warnings. @@ -32,9 +34,6 @@ void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) { } ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() { -#ifndef NDEBUG - std::lock_guard lock(checking_mutex_); -#endif // NDEBUG while (true) { if (head_.load(std::memory_order_relaxed) == nullptr) { return nullptr; } @@ -47,9 +46,12 @@ ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() { delete node; #ifndef NDEBUG - auto iter = checking_set_.find(cfd); - assert(iter != checking_set_.end()); - checking_set_.erase(iter); + { + std::lock_guard lock(checking_mutex_); + auto iter = checking_set_.find(cfd); + assert(iter != checking_set_.end()); + checking_set_.erase(iter); + } #endif // NDEBUG if (!cfd->IsDropped()) { @@ -65,12 +67,12 @@ ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() { } bool FlushScheduler::Empty() { -#ifndef NDEBUG - std::lock_guard lock(checking_mutex_); -#endif // NDEBUG auto rv = head_.load(std::memory_order_relaxed) == nullptr; #ifndef NDEBUG - assert(rv == checking_set_.empty()); + std::lock_guard lock(checking_mutex_); + // Empty is allowed to be called concurrently with ScheduleWork. It would + // only miss the recent schedules. + assert((rv == checking_set_.empty()) || rv); #endif // NDEBUG return rv; } diff --git a/db/flush_scheduler.h b/db/flush_scheduler.h index cd3575861a8..5ca85e88bcf 100644 --- a/db/flush_scheduler.h +++ b/db/flush_scheduler.h @@ -9,25 +9,31 @@ #include #include #include +#include "util/autovector.h" namespace rocksdb { class ColumnFamilyData; -// Unless otherwise noted, all methods on FlushScheduler should be called -// only with the DB mutex held or from a single-threaded recovery context. +// FlushScheduler keeps track of all column families whose memtable may +// be full and require flushing. Unless otherwise noted, all methods on +// FlushScheduler should be called only with the DB mutex held or from +// a single-threaded recovery context. class FlushScheduler { public: FlushScheduler() : head_(nullptr) {} // May be called from multiple threads at once, but not concurrent with // any other method calls on this instance - void ScheduleFlush(ColumnFamilyData* cfd); + void ScheduleWork(ColumnFamilyData* cfd); // Removes and returns Ref()-ed column family. Client needs to Unref(). // Filters column families that have been dropped. ColumnFamilyData* TakeNextColumnFamily(); + // This can be called concurrently with ScheduleWork but it would miss all + // the scheduled flushes after the last synchronization. This would result + // in less precise enforcement of memtable sizes but should not matter much.
bool Empty(); void Clear(); diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 94e448ee97d..c875008c769 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -11,7 +11,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" #include "db/job_context.h" @@ -21,8 +21,8 @@ #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "table/merging_iterator.h" +#include "test_util/sync_point.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { @@ -79,7 +79,11 @@ class ForwardLevelIterator : public InternalIterator { read_options_, *(cfd_->soptions()), cfd_->internal_comparator(), *files_[file_index_], read_options_.ignore_range_deletions ? nullptr : &range_del_agg, - prefix_extractor_, nullptr /* table_reader_ptr */, nullptr, false); + prefix_extractor_, /*table_reader_ptr=*/nullptr, + /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator, + /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); file_iter_->SetPinnedItersMgr(pinned_iters_mgr_); valid_ = false; if (!range_del_agg.IsEmpty()) { @@ -642,7 +646,12 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { l0_iters_.push_back(cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0, read_options_.ignore_range_deletions ? nullptr : &range_del_agg, - sv_->mutable_cf_options.prefix_extractor.get())); + sv_->mutable_cf_options.prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); } BuildLevelIterators(vstorage); current_ = nullptr; @@ -714,7 +723,12 @@ void ForwardIterator::RenewIterators() { read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0_files_new[inew], read_options_.ignore_range_deletions ? 
nullptr : &range_del_agg, - svnew->mutable_cf_options.prefix_extractor.get())); + svnew->mutable_cf_options.prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); } for (auto* f : l0_iters_) { @@ -772,8 +786,13 @@ void ForwardIterator::ResetIncompleteIterators() { DeleteIterator(l0_iters_[i]); l0_iters_[i] = cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), - *l0_files[i], nullptr /* range_del_agg */, - sv_->mutable_cf_options.prefix_extractor.get()); + *l0_files[i], /*range_del_agg=*/nullptr, + sv_->mutable_cf_options.prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_); } diff --git a/db/forward_iterator.h b/db/forward_iterator.h index 146588d961c..fb73f458edd 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -10,12 +10,12 @@ #include #include +#include "db/dbformat.h" +#include "memory/arena.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" -#include "db/dbformat.h" #include "table/internal_iterator.h" -#include "util/arena.h" namespace rocksdb { diff --git a/db/forward_iterator_bench.cc b/db/forward_iterator_bench.cc index 113ded94b69..174a258a682 100644 --- a/db/forward_iterator_bench.cc +++ b/db/forward_iterator_bench.cc @@ -3,10 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #if !defined(GFLAGS) || defined(ROCKSDB_LITE) #include int main() { @@ -34,8 +30,8 @@ int main() { return 0; } #include "rocksdb/db.h" #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "test_util/testharness.h" #include "util/gflags_compat.h" -#include "util/testharness.h" const int MAX_SHARDS = 100000; diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc new file mode 100644 index 00000000000..f52418a0781 --- /dev/null +++ b/db/import_column_family_job.cc @@ -0,0 +1,269 @@ +#ifndef ROCKSDB_LITE + +#include "db/import_column_family_job.h" + +#include +#include +#include +#include + +#include "db/version_edit.h" +#include "file/file_util.h" +#include "file/random_access_file_reader.h" +#include "table/merging_iterator.h" +#include "table/scoped_arena_iterator.h" +#include "table/sst_file_writer_collectors.h" +#include "table/table_builder.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number, + SuperVersion* sv) { + Status status; + + // Read the information of files we are importing + for (const auto& file_metadata : metadata_) { + const auto file_path = file_metadata.db_path + "/" + file_metadata.name; + IngestedFileInfo file_to_import; + status = GetIngestedFileInfo(file_path, &file_to_import, sv); + if (!status.ok()) { + return status; + } + files_to_import_.push_back(file_to_import); + } + + const auto ucmp = cfd_->internal_comparator().user_comparator(); + auto num_files = files_to_import_.size(); + if (num_files == 0) { + return Status::InvalidArgument("The list of files is empty"); + } else if (num_files > 1) { + // Verify that passed files don't have overlapping ranges in any particular + // level. + int min_level = 1; // Check for overlaps in Level 1 and above. 
+ int max_level = -1; + for (const auto& file_metadata : metadata_) { + if (file_metadata.level > max_level) { + max_level = file_metadata.level; + } + } + for (int level = min_level; level <= max_level; ++level) { + autovector sorted_files; + for (size_t i = 0; i < num_files; i++) { + if (metadata_[i].level == level) { + sorted_files.push_back(&files_to_import_[i]); + } + } + + std::sort(sorted_files.begin(), sorted_files.end(), + [&ucmp](const IngestedFileInfo* info1, + const IngestedFileInfo* info2) { + return sstableKeyCompare(ucmp, info1->smallest_internal_key, + info2->smallest_internal_key) < 0; + }); + + for (size_t i = 0; i < sorted_files.size() - 1; i++) { + if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key, + sorted_files[i + 1]->smallest_internal_key) >= + 0) { + return Status::InvalidArgument("Files have overlapping ranges"); + } + } + } + } + + for (const auto& f : files_to_import_) { + if (f.num_entries == 0) { + return Status::InvalidArgument("File contain no entries"); + } + + if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) { + return Status::Corruption("File has corrupted keys"); + } + } + + // Copy/Move external files into DB + auto hardlink_files = import_options_.move_files; + for (auto& f : files_to_import_) { + f.fd = FileDescriptor(next_file_number++, 0, f.file_size); + + const auto path_outside_db = f.external_file_path; + const auto path_inside_db = TableFileName( + cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId()); + + if (hardlink_files) { + status = env_->LinkFile(path_outside_db, path_inside_db); + if (status.IsNotSupported()) { + // Original file is on a different FS, use copy instead of hard linking + hardlink_files = false; + } + } + if (!hardlink_files) { + status = CopyFile(env_, path_outside_db, path_inside_db, 0, + db_options_.use_fsync); + } + if (!status.ok()) { + break; + } + f.copy_file = !hardlink_files; + f.internal_file_path = path_inside_db; + } + + if (!status.ok()) { + // We failed, remove all files that we copied into the db + for (const auto& f : files_to_import_) { + if (f.internal_file_path.empty()) { + break; + } + const auto s = env_->DeleteFile(f.internal_file_path); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "AddFile() clean up for file %s failed : %s", + f.internal_file_path.c_str(), s.ToString().c_str()); + } + } + } + + return status; +} + +// REQUIRES: we have become the only writer by entering both write_thread_ and +// nonmem_write_thread_ +Status ImportColumnFamilyJob::Run() { + Status status; + edit_.SetColumnFamily(cfd_->GetID()); + + // We use the import time as the ancester time. This is the time the data + // is written to the database. + int64_t temp_current_time = 0; + uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; + uint64_t current_time = kUnknownOldestAncesterTime; + if (env_->GetCurrentTime(&temp_current_time).ok()) { + current_time = oldest_ancester_time = + static_cast(temp_current_time); + } + + for (size_t i = 0; i < files_to_import_.size(); ++i) { + const auto& f = files_to_import_[i]; + const auto& file_metadata = metadata_[i]; + + edit_.AddFile(file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(), + f.fd.GetFileSize(), f.smallest_internal_key, + f.largest_internal_key, file_metadata.smallest_seqno, + file_metadata.largest_seqno, false, kInvalidBlobFileNumber, + oldest_ancester_time, current_time); + + // If incoming sequence number is higher, update local sequence number. 
+ if (file_metadata.largest_seqno > versions_->LastSequence()) { + versions_->SetLastAllocatedSequence(file_metadata.largest_seqno); + versions_->SetLastPublishedSequence(file_metadata.largest_seqno); + versions_->SetLastSequence(file_metadata.largest_seqno); + } + } + + return status; +} + +void ImportColumnFamilyJob::Cleanup(const Status& status) { + if (!status.ok()) { + // We failed to add files to the database, so remove all the files we copied. + for (const auto& f : files_to_import_) { + const auto s = env_->DeleteFile(f.internal_file_path); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "AddFile() clean up for file %s failed : %s", + f.internal_file_path.c_str(), s.ToString().c_str()); + } + } + } else if (status.ok() && import_options_.move_files) { + // The files were moved and added successfully; remove the original file links + for (IngestedFileInfo& f : files_to_import_) { + const auto s = env_->DeleteFile(f.external_file_path); + if (!s.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "%s was added to DB successfully but failed to remove original " + "file link : %s", + f.external_file_path.c_str(), s.ToString().c_str()); + } + } + } +} + +Status ImportColumnFamilyJob::GetIngestedFileInfo( + const std::string& external_file, IngestedFileInfo* file_to_import, + SuperVersion* sv) { + file_to_import->external_file_path = external_file; + + // Get external file size + auto status = env_->GetFileSize(external_file, &file_to_import->file_size); + if (!status.ok()) { + return status; + } + + // Create TableReader for external file + std::unique_ptr table_reader; + std::unique_ptr sst_file; + std::unique_ptr sst_file_reader; + + status = env_->NewRandomAccessFile(external_file, &sst_file, env_options_); + if (!status.ok()) { + return status; + } + sst_file_reader.reset( + new RandomAccessFileReader(std::move(sst_file), external_file)); + + status = cfd_->ioptions()->table_factory->NewTableReader( + TableReaderOptions(*cfd_->ioptions(), + sv->mutable_cf_options.prefix_extractor.get(), + env_options_, cfd_->internal_comparator()), + std::move(sst_file_reader), file_to_import->file_size, &table_reader); + if (!status.ok()) { + return status; + } + + // Get the external file properties + auto props = table_reader->GetTableProperties(); + + // Set original_seqno to 0. + file_to_import->original_seqno = 0; + + // Get number of entries in table + file_to_import->num_entries = props->num_entries; + + ParsedInternalKey key; + ReadOptions ro; + // While reading the external file we can cache the blocks we read in the + // block cache. If we later changed the global seqno of this file, we would + // end up with cached blocks that include keys with the wrong seqno. + // We need to disable fill_cache so that we read from the file without + // updating the block cache.
+ ro.fill_cache = false; + std::unique_ptr iter(table_reader->NewIterator( + ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion)); + + // Get first (smallest) key from file + iter->SeekToFirst(); + if (!ParseInternalKey(iter->key(), &key)) { + return Status::Corruption("external file have corrupted keys"); + } + file_to_import->smallest_internal_key.SetFrom(key); + + // Get last (largest) key from file + iter->SeekToLast(); + if (!ParseInternalKey(iter->key(), &key)) { + return Status::Corruption("external file have corrupted keys"); + } + file_to_import->largest_internal_key.SetFrom(key); + + file_to_import->cf_id = static_cast(props->column_family_id); + + file_to_import->table_properties = *props; + + return status; +} + +} // namespace rocksdb + +#endif // !ROCKSDB_LITE diff --git a/db/import_column_family_job.h b/db/import_column_family_job.h new file mode 100644 index 00000000000..05796590b61 --- /dev/null +++ b/db/import_column_family_job.h @@ -0,0 +1,70 @@ +#pragma once +#include +#include +#include + +#include "db/column_family.h" +#include "db/dbformat.h" +#include "db/external_sst_file_ingestion_job.h" +#include "db/snapshot_impl.h" +#include "options/db_options.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/metadata.h" +#include "rocksdb/sst_file_writer.h" +#include "util/autovector.h" + +namespace rocksdb { + +// Imports a set of sst files as is into a new column family. Logic is similar +// to ExternalSstFileIngestionJob. +class ImportColumnFamilyJob { + public: + ImportColumnFamilyJob(Env* env, VersionSet* versions, ColumnFamilyData* cfd, + const ImmutableDBOptions& db_options, + const EnvOptions& env_options, + const ImportColumnFamilyOptions& import_options, + const std::vector& metadata) + : env_(env), + versions_(versions), + cfd_(cfd), + db_options_(db_options), + env_options_(env_options), + import_options_(import_options), + metadata_(metadata) {} + + // Prepare the job by copying external files into the DB. + Status Prepare(uint64_t next_file_number, SuperVersion* sv); + + // Will execute the import job and prepare edit() to be applied. + // REQUIRES: Mutex held + Status Run(); + + // Cleanup after successful/failed job + void Cleanup(const Status& status); + + VersionEdit* edit() { return &edit_; } + + const autovector& files_to_import() const { + return files_to_import_; + } + + private: + // Open the external file and populate `file_to_import` with all the + // external information we need to import this file. 
+ Status GetIngestedFileInfo(const std::string& external_file, + IngestedFileInfo* file_to_import, + SuperVersion* sv); + + Env* env_; + VersionSet* versions_; + ColumnFamilyData* cfd_; + const ImmutableDBOptions& db_options_; + const EnvOptions& env_options_; + autovector files_to_import_; + VersionEdit edit_; + const ImportColumnFamilyOptions& import_options_; + std::vector metadata_; +}; + +} // namespace rocksdb diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc new file mode 100644 index 00000000000..1138b16bae5 --- /dev/null +++ b/db/import_column_family_test.cc @@ -0,0 +1,567 @@ +#ifndef ROCKSDB_LITE + +#include +#include "db/db_test_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/sst_file_writer.h" +#include "test_util/testutil.h" + +namespace rocksdb { + +class ImportColumnFamilyTest : public DBTestBase { + public: + ImportColumnFamilyTest() : DBTestBase("/import_column_family_test") { + sst_files_dir_ = dbname_ + "/sst_files/"; + DestroyAndRecreateExternalSSTFilesDir(); + export_files_dir_ = test::TmpDir(env_) + "/export"; + import_cfh_ = nullptr; + import_cfh2_ = nullptr; + metadata_ptr_ = nullptr; + } + + ~ImportColumnFamilyTest() { + if (import_cfh_) { + db_->DropColumnFamily(import_cfh_); + db_->DestroyColumnFamilyHandle(import_cfh_); + import_cfh_ = nullptr; + } + if (import_cfh2_) { + db_->DropColumnFamily(import_cfh2_); + db_->DestroyColumnFamilyHandle(import_cfh2_); + import_cfh2_ = nullptr; + } + if (metadata_ptr_) { + delete metadata_ptr_; + metadata_ptr_ = nullptr; + } + test::DestroyDir(env_, sst_files_dir_); + test::DestroyDir(env_, export_files_dir_); + } + + void DestroyAndRecreateExternalSSTFilesDir() { + test::DestroyDir(env_, sst_files_dir_); + env_->CreateDir(sst_files_dir_); + test::DestroyDir(env_, export_files_dir_); + } + + LiveFileMetaData LiveFileMetaDataInit(std::string name, std::string path, + int level, + SequenceNumber smallest_seqno, + SequenceNumber largest_seqno) { + LiveFileMetaData metadata; + metadata.name = name; + metadata.db_path = path; + metadata.smallest_seqno = smallest_seqno; + metadata.largest_seqno = largest_seqno; + metadata.level = level; + return metadata; + } + + protected: + std::string sst_files_dir_; + std::string export_files_dir_; + ColumnFamilyHandle* import_cfh_; + ColumnFamilyHandle* import_cfh2_; + ExportImportFilesMetaData* metadata_ptr_; +}; + +TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + SstFileWriter sfw_unknown(EnvOptions(), options); + + // cf1.sst + const std::string cf1_sst_name = "cf1.sst"; + const std::string cf1_sst = sst_files_dir_ + cf1_sst_name; + ASSERT_OK(sfw_cf1.Open(cf1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + + // cf_unknown.sst + const std::string unknown_sst_name = "cf_unknown.sst"; + const std::string unknown_sst = sst_files_dir_ + unknown_sst_name; + ASSERT_OK(sfw_unknown.Open(unknown_sst)); + ASSERT_OK(sfw_unknown.Put("K3", "V1")); + ASSERT_OK(sfw_unknown.Put("K4", "V2")); + ASSERT_OK(sfw_unknown.Finish()); + + { + // Import sst file corresponding to cf1 onto a new cf and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(cf1_sst_name, sst_files_dir_, 0, 10, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + 
ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + std::string value; + db_->Get(ReadOptions(), import_cfh_, "K1", &value); + ASSERT_EQ(value, "V1"); + db_->Get(ReadOptions(), import_cfh_, "K2", &value); + ASSERT_EQ(value, "V2"); + ASSERT_OK(db_->DropColumnFamily(import_cfh_)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(import_cfh_)); + import_cfh_ = nullptr; + } + + { + // Import sst file corresponding to unknown cf onto a new cf and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(unknown_sst_name, sst_files_dir_, 0, 20, 29)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "yoyo", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + std::string value; + db_->Get(ReadOptions(), import_cfh_, "K3", &value); + ASSERT_EQ(value, "V1"); + db_->Get(ReadOptions(), import_cfh_, "K4", &value); + ASSERT_EQ(value, "V2"); + } +} + +TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + + // file3.sst + const std::string file3_sst_name = "file3.sst"; + const std::string file3_sst = sst_files_dir_ + file3_sst_name; + ASSERT_OK(sfw_cf1.Open(file3_sst)); + for (int i = 0; i < 100; ++i) { + sfw_cf1.Put(Key(i), Key(i) + "_val"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file2.sst + const std::string file2_sst_name = "file2.sst"; + const std::string file2_sst = sst_files_dir_ + file2_sst_name; + ASSERT_OK(sfw_cf1.Open(file2_sst)); + for (int i = 0; i < 100; i += 2) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite1"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file1a.sst + const std::string file1a_sst_name = "file1a.sst"; + const std::string file1a_sst = sst_files_dir_ + file1a_sst_name; + ASSERT_OK(sfw_cf1.Open(file1a_sst)); + for (int i = 0; i < 52; i += 4) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file1b.sst + const std::string file1b_sst_name = "file1b.sst"; + const std::string file1b_sst = sst_files_dir_ + file1b_sst_name; + ASSERT_OK(sfw_cf1.Open(file1b_sst)); + for (int i = 52; i < 100; i += 4) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file0a.sst + const std::string file0a_sst_name = "file0a.sst"; + const std::string file0a_sst = sst_files_dir_ + file0a_sst_name; + ASSERT_OK(sfw_cf1.Open(file0a_sst)); + for (int i = 0; i < 100; i += 16) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite3"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file0b.sst + const std::string file0b_sst_name = "file0b.sst"; + const std::string file0b_sst = sst_files_dir_ + file0b_sst_name; + ASSERT_OK(sfw_cf1.Open(file0b_sst)); + for (int i = 0; i < 100; i += 16) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite4"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // Import sst files and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 3, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 2, 20, 29)); + metadata.files.push_back( + LiveFileMetaDataInit(file1a_sst_name, sst_files_dir_, 1, 30, 34)); + metadata.files.push_back( + LiveFileMetaDataInit(file1b_sst_name, sst_files_dir_, 1, 35, 39)); + metadata.files.push_back( + 
LiveFileMetaDataInit(file0a_sst_name, sst_files_dir_, 0, 40, 49)); + metadata.files.push_back( + LiveFileMetaDataInit(file0b_sst_name, sst_files_dir_, 0, 50, 59)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + for (int i = 0; i < 100; i++) { + std::string value; + db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } + + for (int i = 0; i < 100; i += 5) { + ASSERT_OK( + db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite5")); + } + + // Flush and check again + ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_)); + for (int i = 0; i < 100; i++) { + std::string value; + db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + if (i % 5 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite5"); + } else if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } + + // Compact and check again. + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr)); + for (int i = 0; i < 100; i++) { + std::string value; + db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + if (i % 5 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite5"); + } else if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } +} + +TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_val"); + } + ASSERT_OK(Flush(1)); + + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); + + // Overwrite the value in the same set of keys. + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_overwrite"); + } + + // Flush to create L0 file. + ASSERT_OK(Flush(1)); + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_overwrite2"); + } + + // Flush again to create another L0 file. It should have higher sequencer. 
+ ASSERT_OK(Flush(1)); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, + &metadata_ptr_)); + ASSERT_NE(metadata_ptr_, nullptr); + delete checkpoint; + + ImportColumnFamilyOptions import_options; + import_options.move_files = false; + ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "toto", import_options, + *metadata_ptr_, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + import_options.move_files = true; + ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "yoyo", import_options, + *metadata_ptr_, &import_cfh2_)); + ASSERT_NE(import_cfh2_, nullptr); + delete metadata_ptr_; + metadata_ptr_ = NULL; + + std::string value1, value2; + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Get(1, Key(i)), value1); + } + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_EQ(Get(1, Key(i)), value2); + } + + // Modify keys in cf1 and verify. + for (int i = 0; i < 25; i++) { + ASSERT_OK(db_->Delete(WriteOptions(), import_cfh_, Key(i))); + } + for (int i = 25; i < 50; i++) { + ASSERT_OK( + db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite3")); + } + for (int i = 0; i < 25; ++i) { + ASSERT_TRUE( + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); + } + for (int i = 25; i < 50; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite3", value1); + } + for (int i = 50; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite2", value1); + } + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_EQ(Get(1, Key(i)), value2); + } + + // Compact and check again. + ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_)); + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr)); + + for (int i = 0; i < 25; ++i) { + ASSERT_TRUE( + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); + } + for (int i = 25; i < 50; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite3", value1); + } + for (int i = 50; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite2", value1); + } + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_EQ(Get(1, Key(i)), value2); + } +} + +TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_val"); + } + ASSERT_OK(Flush(1)); + + // Compact to create a L1 file. + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); + + // Overwrite the value in the same set of keys. + for (int i = 0; i < 50; ++i) { + Put(1, Key(i), Key(i) + "_overwrite"); + } + + // Flush to create L0 file. + ASSERT_OK(Flush(1)); + + for (int i = 0; i < 25; ++i) { + Put(1, Key(i), Key(i) + "_overwrite2"); + } + + // Flush again to create another L0 file. It should have higher sequencer. 
+ ASSERT_OK(Flush(1)); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, + &metadata_ptr_)); + ASSERT_NE(metadata_ptr_, nullptr); + delete checkpoint; + + // Create a new db and import the files. + DB* db_copy; + test::DestroyDir(env_, dbname_ + "/db_copy"); + ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy)); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + *metadata_ptr_, &cfh)); + ASSERT_NE(cfh, nullptr); + + for (int i = 0; i < 100; ++i) { + std::string value; + db_copy->Get(ReadOptions(), cfh, Key(i), &value); + ASSERT_EQ(Get(1, Key(i)), value); + } + db_copy->DropColumnFamily(cfh); + db_copy->DestroyColumnFamilyHandle(cfh); + delete db_copy; + test::DestroyDir(env_, dbname_ + "/db_copy"); +} + +TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + { + // Create column family with existing cf name. + ExportImportFilesMetaData metadata; + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "koko", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Column family already exists")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with no files specified. + ExportImportFilesMetaData metadata; + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("The list of files is empty")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with overlapping keys in sst files. + ExportImportFilesMetaData metadata; + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + const std::string file2_sst_name = "file2.sst"; + const std::string file2_sst = sst_files_dir_ + file2_sst_name; + ASSERT_OK(sfw_cf1.Open(file2_sst)); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Put("K3", "V3")); + ASSERT_OK(sfw_cf1.Finish()); + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Files have overlapping ranges")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with a mismatching comparator, should fail with appropriate error. 
+ ExportImportFilesMetaData metadata; + Options mismatch_options = CurrentOptions(); + mismatch_options.comparator = ReverseBytewiseComparator(); + SstFileWriter sfw_cf1(EnvOptions(), mismatch_options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Finish()); + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = mismatch_options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "coco", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Comparator name mismatch")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with non existent sst file should fail with appropriate error + ExportImportFilesMetaData metadata; + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + const std::string file3_sst_name = "file3.sst"; + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::IOError("No such file or directory")); + ASSERT_EQ(import_cfh_, nullptr); + + // Test successful import after a failure with the same CF name. 
Ensures + // there is no side effect with CF when there is a failed import + metadata.files.pop_back(); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as External SST File Writer and Import are not supported " + "in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 57c7427e801..94d9cd8ac5b 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -10,20 +10,16 @@ #include "db/internal_stats.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include +#include #include #include #include #include #include "db/column_family.h" -#include "db/db_impl.h" -#include "table/block_based_table_factory.h" +#include "db/db_impl/db_impl.h" +#include "table/block_based/block_based_table_factory.h" #include "util/string_util.h" namespace rocksdb { @@ -958,14 +954,17 @@ void InternalStats::DumpDBStats(std::string* value) { seconds_up, interval_seconds_up); value->append(buf); // Cumulative - uint64_t user_bytes_written = GetDBStats(InternalStats::BYTES_WRITTEN); - uint64_t num_keys_written = GetDBStats(InternalStats::NUMBER_KEYS_WRITTEN); - uint64_t write_other = GetDBStats(InternalStats::WRITE_DONE_BY_OTHER); - uint64_t write_self = GetDBStats(InternalStats::WRITE_DONE_BY_SELF); - uint64_t wal_bytes = GetDBStats(InternalStats::WAL_FILE_BYTES); - uint64_t wal_synced = GetDBStats(InternalStats::WAL_FILE_SYNCED); - uint64_t write_with_wal = GetDBStats(InternalStats::WRITE_WITH_WAL); - uint64_t write_stall_micros = GetDBStats(InternalStats::WRITE_STALL_MICROS); + uint64_t user_bytes_written = + GetDBStats(InternalStats::kIntStatsBytesWritten); + uint64_t num_keys_written = + GetDBStats(InternalStats::kIntStatsNumKeysWritten); + uint64_t write_other = GetDBStats(InternalStats::kIntStatsWriteDoneByOther); + uint64_t write_self = GetDBStats(InternalStats::kIntStatsWriteDoneBySelf); + uint64_t wal_bytes = GetDBStats(InternalStats::kIntStatsWalFileBytes); + uint64_t wal_synced = GetDBStats(InternalStats::kIntStatsWalFileSynced); + uint64_t write_with_wal = GetDBStats(InternalStats::kIntStatsWriteWithWal); + uint64_t write_stall_micros = + GetDBStats(InternalStats::kIntStatsWriteStallMicros); const int kHumanMicrosLen = 32; char human_micros[kHumanMicrosLen]; diff --git a/db/internal_stats.h b/db/internal_stats.h index 20fb07f4853..24a8d98e6db 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -109,15 +109,15 @@ class InternalStats { }; enum InternalDBStatsType { - WAL_FILE_BYTES, - WAL_FILE_SYNCED, - BYTES_WRITTEN, - NUMBER_KEYS_WRITTEN, - WRITE_DONE_BY_OTHER, - WRITE_DONE_BY_SELF, - WRITE_WITH_WAL, - WRITE_STALL_MICROS, - INTERNAL_DB_STATS_ENUM_MAX, + kIntStatsWalFileBytes, + kIntStatsWalFileSynced, + kIntStatsBytesWritten, + kIntStatsNumKeysWritten, + kIntStatsWriteDoneByOther, + kIntStatsWriteDoneBySelf, + kIntStatsWriteWithWal, + kIntStatsWriteStallMicros, + kIntStatsNumMax, }; InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd) @@ -237,6 +237,28 @@ class InternalStats { } } + CompactionStats& operator=(const 
CompactionStats& c) { + micros = c.micros; + cpu_micros = c.cpu_micros; + bytes_read_non_output_levels = c.bytes_read_non_output_levels; + bytes_read_output_level = c.bytes_read_output_level; + bytes_written = c.bytes_written; + bytes_moved = c.bytes_moved; + num_input_files_in_non_output_levels = + c.num_input_files_in_non_output_levels; + num_input_files_in_output_level = c.num_input_files_in_output_level; + num_output_files = c.num_output_files; + num_input_records = c.num_input_records; + num_dropped_records = c.num_dropped_records; + count = c.count; + + int num_of_reasons = static_cast(CompactionReason::kNumOfReasons); + for (int i = 0; i < num_of_reasons; i++) { + counts[i] = c.counts[i]; + } + return *this; + } + void Clear() { this->micros = 0; this->cpu_micros = 0; @@ -300,7 +322,7 @@ class InternalStats { }; void Clear() { - for (int i = 0; i < INTERNAL_DB_STATS_ENUM_MAX; i++) { + for (int i = 0; i < kIntStatsNumMax; i++) { db_stats_[i].store(0); } for (int i = 0; i < INTERNAL_CF_STATS_ENUM_MAX; i++) { @@ -394,7 +416,7 @@ class InternalStats { bool HandleBlockCacheStat(Cache** block_cache); // Per-DB stats - std::atomic db_stats_[INTERNAL_DB_STATS_ENUM_MAX]; + std::atomic db_stats_[kIntStatsNumMax]; // Per-ColumnFamily stats uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX]; uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX]; @@ -593,15 +615,15 @@ class InternalStats { }; enum InternalDBStatsType { - WAL_FILE_BYTES, - WAL_FILE_SYNCED, - BYTES_WRITTEN, - NUMBER_KEYS_WRITTEN, - WRITE_DONE_BY_OTHER, - WRITE_DONE_BY_SELF, - WRITE_WITH_WAL, - WRITE_STALL_MICROS, - INTERNAL_DB_STATS_ENUM_MAX, + kIntStatsWalFileBytes, + kIntStatsWalFileSynced, + kIntStatsBytesWritten, + kIntStatsNumKeysWritten, + kIntStatsWriteDoneByOther, + kIntStatsWriteDoneBySelf, + kIntStatsWriteWithWal, + kIntStatsWriteStallMicros, + kIntStatsNumMax, }; InternalStats(int /*num_levels*/, Env* /*env*/, ColumnFamilyData* /*cfd*/) {} diff --git a/db/listener_test.cc b/db/listener_test.cc index 56968d8f803..0e8bae40785 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -3,11 +3,14 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include "db/db_impl.h" +#include "db/blob_index.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/dbformat.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "file/filename.h" +#include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "rocksdb/cache.h" @@ -21,17 +24,15 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" -#include "table/block_based_table_factory.h" -#include "table/plain_table_factory.h" -#include "util/filename.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/plain/plain_table_factory.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/hash.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" #ifndef ROCKSDB_LITE @@ -42,6 +43,14 @@ class EventListenerTest : public DBTestBase { public: EventListenerTest() : DBTestBase("/listener_test") {} + static std::string BlobStr(uint64_t blob_file_number, uint64_t offset, + uint64_t size) { + std::string blob_index; + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + return blob_index; + } + const size_t k110KB = 110 << 10; }; @@ -79,11 +88,47 @@ class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory { class TestCompactionListener : public EventListener { public: + explicit TestCompactionListener(EventListenerTest* test) : test_(test) {} + void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) override { std::lock_guard lock(mutex_); compacted_dbs_.push_back(db); ASSERT_GT(ci.input_files.size(), 0U); + ASSERT_EQ(ci.input_files.size(), ci.input_file_infos.size()); + + for (size_t i = 0; i < ci.input_file_infos.size(); ++i) { + ASSERT_EQ(ci.input_file_infos[i].level, ci.base_input_level); + ASSERT_EQ(ci.input_file_infos[i].file_number, + TableFileNameToNumber(ci.input_files[i])); + } + ASSERT_GT(ci.output_files.size(), 0U); + ASSERT_EQ(ci.output_files.size(), ci.output_file_infos.size()); + + ASSERT_TRUE(test_); + ASSERT_EQ(test_->db_, db); + + std::vector> files_by_level; + test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[ci.cf_id], + &files_by_level); + ASSERT_GT(files_by_level.size(), ci.output_level); + + for (size_t i = 0; i < ci.output_file_infos.size(); ++i) { + ASSERT_EQ(ci.output_file_infos[i].level, ci.output_level); + ASSERT_EQ(ci.output_file_infos[i].file_number, + TableFileNameToNumber(ci.output_files[i])); + + auto it = std::find_if( + files_by_level[ci.output_level].begin(), + files_by_level[ci.output_level].end(), [&](const FileMetaData& meta) { + return meta.fd.GetNumber() == ci.output_file_infos[i].file_number; + }); + ASSERT_NE(it, files_by_level[ci.output_level].end()); + + ASSERT_EQ(ci.output_file_infos[i].oldest_blob_file_number, + it->oldest_blob_file_number); + } + ASSERT_EQ(db->GetEnv()->GetThreadID(), ci.thread_id); ASSERT_GT(ci.thread_id, 0U); @@ -98,6 +143,7 @@ class TestCompactionListener : public EventListener { } } + EventListenerTest* test_; std::vector compacted_dbs_; std::mutex mutex_; }; @@ -125,13 +171,19 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) { options.table_properties_collector_factories.push_back( std::make_shared()); - TestCompactionListener* listener = new 
TestCompactionListener(); + TestCompactionListener* listener = new TestCompactionListener(this); options.listeners.emplace_back(listener); std::vector cf_names = { "pikachu", "ilya", "muromec", "dobrynia", "nikitich", "alyosha", "popovich"}; CreateAndReopenWithCF(cf_names, options); ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto", + BlobStr(123, 0, 1 << 10))); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); + ASSERT_OK(Put(2, "ilya", std::string(90000, 'i'))); ASSERT_OK(Put(3, "muromec", std::string(90000, 'm'))); ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd'))); @@ -140,11 +192,9 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) { ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); for (int i = 1; i < 8; ++i) { ASSERT_OK(Flush(i)); - const Slice kRangeStart = "a"; - const Slice kRangeEnd = "z"; - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[i], - &kRangeStart, &kRangeEnd)); dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[i], + nullptr, nullptr)); dbfull()->TEST_WaitForCompact(); } @@ -157,8 +207,8 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) { // This simple Listener can only handle one flush at a time. class TestFlushListener : public EventListener { public: - explicit TestFlushListener(Env* env) - : slowdown_count(0), stop_count(0), db_closed(), env_(env) { + TestFlushListener(Env* env, EventListenerTest* test) + : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) { db_closed = false; } void OnTableFileCreated( @@ -210,6 +260,27 @@ class TestFlushListener : public EventListener { ASSERT_EQ(prev_fc_info_.cf_name, info.cf_name); ASSERT_EQ(prev_fc_info_.job_id, info.job_id); ASSERT_EQ(prev_fc_info_.file_path, info.file_path); + ASSERT_EQ(TableFileNameToNumber(info.file_path), info.file_number); + + // Note: the following chunk relies on the notification pertaining to the + // database pointed to by DBTestBase::db_, and is thus bypassed when + // that assumption does not hold (see the test case MultiDBMultiListeners + // below). 
+ ASSERT_TRUE(test_); + if (db == test_->db_) { + std::vector> files_by_level; + test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[info.cf_id], + &files_by_level); + + ASSERT_FALSE(files_by_level.empty()); + auto it = std::find_if(files_by_level[0].begin(), files_by_level[0].end(), + [&](const FileMetaData& meta) { + return meta.fd.GetNumber() == info.file_number; + }); + ASSERT_NE(it, files_by_level[0].end()); + ASSERT_EQ(info.oldest_blob_file_number, it->oldest_blob_file_number); + } + ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id); ASSERT_GT(info.thread_id, 0U); ASSERT_EQ(info.table_properties.user_collected_properties.find("0")->second, @@ -226,6 +297,7 @@ class TestFlushListener : public EventListener { protected: Env* env_; + EventListenerTest* test_; }; TEST_F(EventListenerTest, OnSingleDBFlushTest) { @@ -235,7 +307,7 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) { #ifdef ROCKSDB_USING_THREAD_STATUS options.enable_thread_tracking = true; #endif // ROCKSDB_USING_THREAD_STATUS - TestFlushListener* listener = new TestFlushListener(options.env); + TestFlushListener* listener = new TestFlushListener(options.env, this); options.listeners.emplace_back(listener); std::vector cf_names = { "pikachu", "ilya", "muromec", "dobrynia", @@ -245,6 +317,12 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) { CreateAndReopenWithCF(cf_names, options); ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto", + BlobStr(456, 0, 1 << 10))); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); + ASSERT_OK(Put(2, "ilya", std::string(90000, 'i'))); ASSERT_OK(Put(3, "muromec", std::string(90000, 'm'))); ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd'))); @@ -272,7 +350,7 @@ TEST_F(EventListenerTest, MultiCF) { #ifdef ROCKSDB_USING_THREAD_STATUS options.enable_thread_tracking = true; #endif // ROCKSDB_USING_THREAD_STATUS - TestFlushListener* listener = new TestFlushListener(options.env); + TestFlushListener* listener = new TestFlushListener(options.env, this); options.listeners.emplace_back(listener); options.table_properties_collector_factories.push_back( std::make_shared()); @@ -313,7 +391,7 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) { const int kNumDBs = 5; const int kNumListeners = 10; for (int i = 0; i < kNumListeners; ++i) { - listeners.emplace_back(new TestFlushListener(options.env)); + listeners.emplace_back(new TestFlushListener(options.env, this)); } std::vector cf_names = { @@ -390,7 +468,7 @@ TEST_F(EventListenerTest, DisableBGCompaction) { #ifdef ROCKSDB_USING_THREAD_STATUS options.enable_thread_tracking = true; #endif // ROCKSDB_USING_THREAD_STATUS - TestFlushListener* listener = new TestFlushListener(options.env); + TestFlushListener* listener = new TestFlushListener(options.env, this); const int kCompactionTrigger = 1; const int kSlowdownTrigger = 5; const int kStopTrigger = 100; diff --git a/db/log_reader.cc b/db/log_reader.cc index e734e9d6c88..3a71cbc4291 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -10,10 +10,11 @@ #include "db/log_reader.h" #include +#include "file/sequence_file_reader.h" #include "rocksdb/env.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/crc32c.h" -#include "util/file_reader_writer.h" #include "util/util.h" namespace rocksdb { diff --git a/db/log_reader.h b/db/log_reader.h index bda9ac8bb35..5f9cb981dba 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -12,13 +12,12 @@ #include #include "db/log_format.h" +#include 
"file/sequence_file_reader.h" +#include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" -#include "rocksdb/options.h" namespace rocksdb { - -class SequentialFileReader; class Logger; namespace log { @@ -53,6 +52,9 @@ class Reader { // @lint-ignore TXT2 T25377293 Grandfathered in std::unique_ptr&& file, Reporter* reporter, bool checksum, uint64_t log_num); + // No copying allowed + Reader(const Reader&) = delete; + void operator=(const Reader&) = delete; virtual ~Reader(); @@ -148,11 +150,6 @@ class Reader { // buffer_ must be updated to remove the dropped bytes prior to invocation. void ReportCorruption(size_t bytes, const char* reason); void ReportDrop(size_t bytes, const Status& reason); - - private: - // No copying allowed - Reader(const Reader&); - void operator=(const Reader&); }; class FragmentBufferedReader : public Reader { diff --git a/db/log_test.cc b/db/log_test.cc index fd237b030e7..ecfae3e2db3 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -9,13 +9,14 @@ #include "db/log_reader.h" #include "db/log_writer.h" +#include "file/sequence_file_reader.h" +#include "file/writable_file_writer.h" #include "rocksdb/env.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/crc32c.h" -#include "util/file_reader_writer.h" #include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { namespace log { diff --git a/db/log_writer.cc b/db/log_writer.cc index 6ee39198184..53efc6c15b3 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -10,10 +10,10 @@ #include "db/log_writer.h" #include +#include "file/writable_file_writer.h" #include "rocksdb/env.h" #include "util/coding.h" #include "util/crc32c.h" -#include "util/file_reader_writer.h" namespace rocksdb { namespace log { @@ -102,6 +102,13 @@ Status Writer::AddRecord(const Slice& slice) { left -= fragment_length; begin = false; } while (s.ok() && left > 0); + + if (s.ok()) { + if (!manual_flush_) { + s = dest_->Flush(); + } + } + return s; } @@ -146,11 +153,6 @@ Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { Status s = dest_->Append(Slice(buf, header_size)); if (s.ok()) { s = dest_->Append(Slice(ptr, n)); - if (s.ok()) { - if (!manual_flush_) { - s = dest_->Flush(); - } - } } block_offset_ += header_size + n; return s; diff --git a/db/log_writer.h b/db/log_writer.h index 116d033584a..e5ed71a764d 100644 --- a/db/log_writer.h +++ b/db/log_writer.h @@ -73,6 +73,10 @@ class Writer { explicit Writer(std::unique_ptr&& dest, uint64_t log_number, bool recycle_log_files, bool manual_flush = false); + // No copying allowed + Writer(const Writer&) = delete; + void operator=(const Writer&) = delete; + ~Writer(); Status AddRecord(const Slice& slice); @@ -104,10 +108,6 @@ class Writer { // If true, it does not flush after each write. Instead it relies on the upper // layer to manually does the flush by calling ::WriteBuffer() bool manual_flush_; - - // No copying allowed - Writer(const Writer&); - void operator=(const Writer&); }; } // namespace log diff --git a/db/lookup_key.h b/db/lookup_key.h index ddf4ff0e942..1b0f6f56290 100644 --- a/db/lookup_key.h +++ b/db/lookup_key.h @@ -21,7 +21,8 @@ class LookupKey { public: // Initialize *this for looking up user_key at a snapshot with // the specified sequence number. 
- LookupKey(const Slice& _user_key, SequenceNumber sequence); + LookupKey(const Slice& _user_key, SequenceNumber sequence, + const Slice* ts = nullptr); ~LookupKey(); diff --git a/db/malloc_stats.cc b/db/malloc_stats.cc index bcee5c3fbfe..1dfe0d55b43 100644 --- a/db/malloc_stats.cc +++ b/db/malloc_stats.cc @@ -20,10 +20,6 @@ namespace rocksdb { #ifdef ROCKSDB_JEMALLOC -#ifdef JEMALLOC_NO_RENAME -#define malloc_stats_print je_malloc_stats_print -#endif - typedef struct { char* cur; char* end; diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc index 02732a55583..1a69a89dea0 100644 --- a/db/manual_compaction_test.cc +++ b/db/manual_compaction_test.cc @@ -8,12 +8,12 @@ #include #include -#include "rocksdb/db.h" +#include "port/port.h" #include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" #include "rocksdb/slice.h" #include "rocksdb/write_batch.h" -#include "util/testharness.h" -#include "port/port.h" +#include "test_util/testharness.h" using namespace rocksdb; diff --git a/db/memtable.cc b/db/memtable.cc index 0c706115de0..e3c531c316e 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -10,15 +10,17 @@ #include "db/memtable.h" #include +#include #include #include - #include "db/dbformat.h" #include "db/merge_context.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/range_tombstone_fragmenter.h" #include "db/read_callback.h" +#include "memory/arena.h" +#include "memory/memory_usage.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "port/port.h" @@ -31,10 +33,8 @@ #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" #include "table/merging_iterator.h" -#include "util/arena.h" #include "util/autovector.h" #include "util/coding.h" -#include "util/memory_usage.h" #include "util/mutexlock.h" #include "util/util.h" @@ -105,7 +105,8 @@ MemTable::MemTable(const InternalKeyComparator& cmp, insert_with_hint_prefix_extractor_( ioptions.memtable_insert_with_hint_prefix_extractor), oldest_key_time_(std::numeric_limits::max()), - atomic_flush_seqno_(kMaxSequenceNumber) { + atomic_flush_seqno_(kMaxSequenceNumber), + approximate_memory_usage_(0) { UpdateFlushState(); // something went wrong if we need to flush before inserting anything assert(!ShouldScheduleFlush()); @@ -115,7 +116,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp, moptions_.memtable_prefix_bloom_bits > 0) { bloom_filter_.reset( new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits, - ioptions.bloom_locality, 6 /* hard coded 6 probes */, + 6 /* hard coded 6 probes */, moptions_.memtable_huge_page_size, ioptions.info_log)); } } @@ -139,11 +140,12 @@ size_t MemTable::ApproximateMemoryUsage() { } total_usage += usage; } + approximate_memory_usage_.store(total_usage, std::memory_order_relaxed); // otherwise, return the actual usage return total_usage; } -bool MemTable::ShouldFlushNow() const { +bool MemTable::ShouldFlushNow() { size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed); // In a lot of times, we cannot allocate arena blocks that exactly matches the // buffer size. Thus we have to decide if we should over-allocate or @@ -159,6 +161,8 @@ bool MemTable::ShouldFlushNow() const { range_del_table_->ApproximateMemoryUsage() + arena_.MemoryAllocatedBytes(); + approximate_memory_usage_.store(allocated_memory, std::memory_order_relaxed); + // if we can still allocate one more block without exceeding the // over-allocation ratio, then we should not flush. 
if (allocated_memory + kArenaBlockSize < @@ -291,6 +295,9 @@ class MemTableIterator : public InternalIterator { iter_ = mem.table_->GetIterator(arena); } } + // No copying allowed + MemTableIterator(const MemTableIterator&) = delete; + void operator=(const MemTableIterator&) = delete; ~MemTableIterator() override { #ifndef NDEBUG @@ -404,10 +411,6 @@ class MemTableIterator : public InternalIterator { bool valid_; bool arena_mode_; bool value_pinned_; - - // No copying allowed - MemTableIterator(const MemTableIterator&); - void operator=(const MemTableIterator&); }; InternalIterator* MemTable::NewIterator(const ReadOptions& read_options, @@ -439,7 +442,7 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator( } port::RWMutex* MemTable::GetLock(const Slice& key) { - return &locks_[static_cast(GetSliceNPHash64(key)) % locks_.size()]; + return &locks_[fastrange64(GetSliceNPHash64(key), locks_.size())]; } MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey, @@ -466,7 +469,7 @@ MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey, bool MemTable::Add(SequenceNumber s, ValueType type, const Slice& key, /* user key */ const Slice& value, bool allow_concurrent, - MemTablePostProcessInfo* post_process_info) { + MemTablePostProcessInfo* post_process_info, void** hint) { // Format of an entry is concatenation of: // key_size : varint32 of internal_key.size() // key bytes : char[internal_key.size()] @@ -493,6 +496,8 @@ bool MemTable::Add(SequenceNumber s, ValueType type, p = EncodeVarint32(p, val_size); memcpy(p, value.data(), val_size); assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + if (!allow_concurrent) { // Extract prefix for insert with hint. if (insert_with_hint_prefix_extractor_ != nullptr && @@ -525,7 +530,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type, bloom_filter_->Add(prefix_extractor_->Transform(key)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->Add(key); + bloom_filter_->Add(StripTimestampFromUserKey(key, ts_sz)); } // The first sequence number inserted into the memtable @@ -542,7 +547,9 @@ bool MemTable::Add(SequenceNumber s, ValueType type, assert(post_process_info == nullptr); UpdateFlushState(); } else { - bool res = table->InsertKeyConcurrently(handle); + bool res = (hint == nullptr) + ? table->InsertKeyConcurrently(handle) + : table->InsertKeyWithHintConcurrently(handle, hint); if (UNLIKELY(!res)) { return res; } @@ -559,7 +566,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type, bloom_filter_->AddConcurrently(prefix_extractor_->Transform(key)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->AddConcurrently(key); + bloom_filter_->AddConcurrently(StripTimestampFromUserKey(key, ts_sz)); } // atomically update first_seqno_ and earliest_seqno_. @@ -599,6 +606,7 @@ struct Saver { Logger* logger; Statistics* statistics; bool inplace_update_support; + bool do_merge; Env* env_; ReadCallback* callback_; bool* is_blob_index; @@ -625,15 +633,17 @@ static bool SaveValue(void* arg, const char* entry) { // klength varint32 // userkey char[klength-8] // tag uint64 - // vlength varint32 + // vlength varint32f // value char[vlength] // Check that it belongs to same user key. We do not check the // sequence number since the Seek() call above should have skipped // all entries with overly large sequence numbers. 
uint32_t key_length; const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); - if (s->mem->GetInternalKeyComparator().user_comparator()->Equal( - Slice(key_ptr, key_length - 8), s->key->user_key())) { + Slice user_key_slice = Slice(key_ptr, key_length - 8); + if (s->mem->GetInternalKeyComparator() + .user_comparator() + ->CompareWithoutTimestamp(user_key_slice, s->key->user_key()) == 0) { // Correct user key const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); ValueType type; @@ -673,12 +683,24 @@ static bool SaveValue(void* arg, const char* entry) { Slice v = GetLengthPrefixedSlice(key_ptr + key_length); *(s->status) = Status::OK(); if (*(s->merge_in_progress)) { - if (s->value != nullptr) { - *(s->status) = MergeHelper::TimedFullMerge( - merge_operator, s->key->user_key(), &v, - merge_context->GetOperands(), s->value, s->logger, - s->statistics, s->env_, nullptr /* result_operand */, true); + if (s->do_merge) { + if (s->value != nullptr) { + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), &v, + merge_context->GetOperands(), s->value, s->logger, + s->statistics, s->env_, nullptr /* result_operand */, true); + } + } else { + // Preserve the value with the goal of returning it as part of + // raw merge operands to the user + merge_context->PushOperand( + v, s->inplace_update_support == false /* operand_pinned */); } + } else if (!s->do_merge) { + // Preserve the value with the goal of returning it as part of + // raw merge operands to the user + merge_context->PushOperand( + v, s->inplace_update_support == false /* operand_pinned */); } else if (s->value != nullptr) { s->value->assign(v.data(), v.size()); } @@ -722,7 +744,8 @@ static bool SaveValue(void* arg, const char* entry) { *(s->merge_in_progress) = true; merge_context->PushOperand( v, s->inplace_update_support == false /* operand_pinned */); - if (merge_operator->ShouldMerge(merge_context->GetOperandsDirectionBackward())) { + if (s->do_merge && merge_operator->ShouldMerge( + merge_context->GetOperandsDirectionBackward())) { *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), nullptr, merge_context->GetOperands(), s->value, s->logger, s->statistics, @@ -746,7 +769,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, - ReadCallback* callback, bool* is_blob_index) { + ReadCallback* callback, bool* is_blob_index, bool do_merge) { // The sequence number is updated synchronously in version_set.h if (IsEmpty()) { // Avoiding recording stats for speed. 
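The GetLock() change above swaps the `% locks_.size()` mapping for fastrange64(). As a hedged illustration (a minimal sketch of the usual multiply-and-shift reduction, not necessarily the exact RocksDB implementation), mapping a 64-bit hash onto [0, n) without an integer division can look like this:

#include <cstddef>
#include <cstdint>

// Minimal sketch: scale a 64-bit hash into [0, num_buckets) with a
// multiply-and-shift instead of a modulo. Assumes the compiler provides
// unsigned __int128 (gcc/clang do); the real fastrange64 may differ in detail.
inline size_t FastRange64Sketch(uint64_t hash, size_t num_buckets) {
  return static_cast<size_t>(
      (static_cast<unsigned __int128>(hash) * num_buckets) >> 64);
}

// Hypothetical use mirroring the GetLock() change:
//   return &locks_[FastRange64Sketch(GetSliceNPHash64(key), locks_.size())];

The result stays proportional to the hash, so keys remain spread across the lock stripes while the division is avoided.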
@@ -767,11 +790,13 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, bool found_final_value = false; bool merge_in_progress = s->IsMergeInProgress(); bool may_contain = true; + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); if (bloom_filter_) { // when both memtable_whole_key_filtering and prefix_extractor_ are set, // only do whole key filtering for Get() to save CPU if (moptions_.memtable_whole_key_filtering) { - may_contain = bloom_filter_->MayContain(user_key); + may_contain = + bloom_filter_->MayContain(StripTimestampFromUserKey(user_key, ts_sz)); } else { assert(prefix_extractor_); may_contain = @@ -779,6 +804,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, bloom_filter_->MayContain(prefix_extractor_->Transform(user_key)); } } + if (bloom_filter_ && !may_contain) { // iter is null if prefix bloom says the key does not exist PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); @@ -787,26 +813,9 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, if (bloom_filter_) { PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); } - Saver saver; - saver.status = s; - saver.found_final_value = &found_final_value; - saver.merge_in_progress = &merge_in_progress; - saver.key = &key; - saver.value = value; - saver.seq = kMaxSequenceNumber; - saver.mem = this; - saver.merge_context = merge_context; - saver.max_covering_tombstone_seq = *max_covering_tombstone_seq; - saver.merge_operator = moptions_.merge_operator; - saver.logger = moptions_.info_log; - saver.inplace_update_support = moptions_.inplace_update_support; - saver.statistics = moptions_.statistics; - saver.env_ = env_; - saver.callback_ = callback; - saver.is_blob_index = is_blob_index; - table_->Get(key, &saver, SaveValue); - - *seq = saver.seq; + GetFromTable(key, *max_covering_tombstone_seq, do_merge, callback, + is_blob_index, value, s, merge_context, seq, + &found_final_value, &merge_in_progress); } // No change to value, since we have not yet found a Put/Delete @@ -817,6 +826,103 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, return found_final_value; } +void MemTable::GetFromTable(const LookupKey& key, + SequenceNumber max_covering_tombstone_seq, + bool do_merge, ReadCallback* callback, + bool* is_blob_index, std::string* value, Status* s, + MergeContext* merge_context, SequenceNumber* seq, + bool* found_final_value, bool* merge_in_progress) { + Saver saver; + saver.status = s; + saver.found_final_value = found_final_value; + saver.merge_in_progress = merge_in_progress; + saver.key = &key; + saver.value = value; + saver.seq = kMaxSequenceNumber; + saver.mem = this; + saver.merge_context = merge_context; + saver.max_covering_tombstone_seq = max_covering_tombstone_seq; + saver.merge_operator = moptions_.merge_operator; + saver.logger = moptions_.info_log; + saver.inplace_update_support = moptions_.inplace_update_support; + saver.statistics = moptions_.statistics; + saver.env_ = env_; + saver.callback_ = callback; + saver.is_blob_index = is_blob_index; + saver.do_merge = do_merge; + table_->Get(key, &saver, SaveValue); + *seq = saver.seq; +} + +void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, + ReadCallback* callback, bool* is_blob) { + // The sequence number is updated synchronously in version_set.h + if (IsEmpty()) { + // Avoiding recording stats for speed. 
+ return; + } + PERF_TIMER_GUARD(get_from_memtable_time); + + MultiGetRange temp_range(*range, range->begin(), range->end()); + if (bloom_filter_) { + std::array keys; + std::array may_match = {{true}}; + autovector prefixes; + int num_keys = 0; + for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { + if (!prefix_extractor_) { + keys[num_keys++] = &iter->ukey; + } else if (prefix_extractor_->InDomain(iter->ukey)) { + prefixes.emplace_back(prefix_extractor_->Transform(iter->ukey)); + keys[num_keys++] = &prefixes.back(); + } + } + bloom_filter_->MayContain(num_keys, &keys[0], &may_match[0]); + int idx = 0; + for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { + if (prefix_extractor_ && !prefix_extractor_->InDomain(iter->ukey)) { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + continue; + } + if (!may_match[idx]) { + temp_range.SkipKey(iter); + PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); + } else { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + } + idx++; + } + } + for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { + SequenceNumber seq = kMaxSequenceNumber; + bool found_final_value{false}; + bool merge_in_progress = iter->s->IsMergeInProgress(); + std::unique_ptr range_del_iter( + NewRangeTombstoneIterator( + read_options, GetInternalKeySeqno(iter->lkey->internal_key()))); + if (range_del_iter != nullptr) { + iter->max_covering_tombstone_seq = std::max( + iter->max_covering_tombstone_seq, + range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key())); + } + GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true, + callback, is_blob, iter->value->GetSelf(), iter->s, + &(iter->merge_context), &seq, &found_final_value, + &merge_in_progress); + + if (!found_final_value && merge_in_progress) { + *(iter->s) = Status::MergeInProgress(); + } + + if (found_final_value) { + iter->value->PinSelf(); + range->MarkKeyDone(iter); + RecordTick(moptions_.statistics, MEMTABLE_HIT); + } + } + PERF_COUNTER_ADD(get_from_memtable_count, 1); +} + void MemTable::Update(SequenceNumber seq, const Slice& key, const Slice& value) { diff --git a/db/memtable.h b/db/memtable.h index 709e2061e5b..0aeadce80c8 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -19,18 +19,20 @@ #include "db/range_tombstone_fragmenter.h" #include "db/read_callback.h" #include "db/version_edit.h" +#include "memory/allocator.h" +#include "memory/concurrent_arena.h" #include "monitoring/instrumented_mutex.h" #include "options/cf_options.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" -#include "util/allocator.h" -#include "util/concurrent_arena.h" +#include "table/multiget_context.h" #include "util/dynamic_bloom.h" #include "util/hash.h" namespace rocksdb { +struct FlushJobInfo; class Mutex; class MemTableIterator; class MergeContext; @@ -63,6 +65,7 @@ struct MemTablePostProcessInfo { uint64_t num_deletes = 0; }; +using MultiGetRange = MultiGetContext::Range; // Note: Many of the methods in this class have comments indicating that // external synchronization is required as these methods are not thread-safe. 
// It is up to higher layers of code to decide how to prevent concurrent @@ -101,6 +104,9 @@ class MemTable { const MutableCFOptions& mutable_cf_options, WriteBufferManager* write_buffer_manager, SequenceNumber earliest_seq, uint32_t column_family_id); + // No copying allowed + MemTable(const MemTable&) = delete; + MemTable& operator=(const MemTable&) = delete; // Do not delete this MemTable unless Unref() indicates it not in use. ~MemTable(); @@ -130,6 +136,12 @@ class MemTable { // operations on the same MemTable (unless this Memtable is immutable). size_t ApproximateMemoryUsage(); + // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't + // require external synchronization. The value may be less accurate, though. + size_t ApproximateMemoryUsageFast() { + return approximate_memory_usage_.load(std::memory_order_relaxed); + } + // This method heuristically determines if the memtable should continue to // host more data. bool ShouldScheduleFlush() const { @@ -173,8 +185,13 @@ class MemTable { // the already exists. bool Add(SequenceNumber seq, ValueType type, const Slice& key, const Slice& value, bool allow_concurrent = false, - MemTablePostProcessInfo* post_process_info = nullptr); + MemTablePostProcessInfo* post_process_info = nullptr, + void** hint = nullptr); + // Used to get the value associated with a key, or to get the Merge Operands + // associated with a key. + // If do_merge = true, the default behavior, which is to get the value for + // the key, is executed. Expected behavior is described right below. // If memtable contains a value for key, store it in *value and return true. // If memtable contains a deletion for key, store a NotFound() error // in *status and return true. @@ -188,22 +205,28 @@ class MemTable { // returned). Otherwise, *seq will be set to kMaxSequenceNumber. // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other // status returned indicates a corruption or other unexpected error. + // If do_merge = false, then any Merge Operands encountered for the key are + // simply stored in merge_context.operands_list and never actually merged to + // get a final value. The raw Merge Operands are eventually returned to the + // user.
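To make the do_merge contract above concrete, here is a hedged, self-contained sketch (simplified types and names, not the actual RocksDB call path) of the two code paths: folding operands into a final value versus preserving them so the raw operands can be handed back to the caller.

#include <string>
#include <vector>

// Stand-in for MergeContext: just collects raw operands.
struct MergeContextSketch {
  std::vector<std::string> operands;
};

// do_merge == true  -> fold the operand into *value (placeholder for
//                      MergeHelper::TimedFullMerge()).
// do_merge == false -> keep the operand untouched so it can be returned
//                      to the user as-is.
void HandleMergeOperandSketch(const std::string& operand, bool do_merge,
                              std::string* value, MergeContextSketch* ctx) {
  if (do_merge) {
    value->append(operand);  // simplified "merge" for illustration only
  } else {
    ctx->operands.push_back(operand);
  }
}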
bool Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, - bool* is_blob_index = nullptr); + bool* is_blob_index = nullptr, bool do_merge = true); bool Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, - bool* is_blob_index = nullptr) { + bool* is_blob_index = nullptr, bool do_merge = true) { SequenceNumber seq; return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq, - read_opts, callback, is_blob_index); + read_opts, callback, is_blob_index, do_merge); } + void MultiGet(const ReadOptions& read_options, MultiGetRange* range, + ReadCallback* callback, bool* is_blob); + // Attempts to update the new_value inplace, else does normal Add // Pseudocode // if key exists in current memtable && prev_value is of type kTypeValue @@ -401,6 +424,16 @@ class MemTable { flush_in_progress_ = in_progress; } +#ifndef ROCKSDB_LITE + void SetFlushJobInfo(std::unique_ptr&& info) { + flush_job_info_ = std::move(info); + } + + std::unique_ptr ReleaseFlushJobInfo() { + return std::move(flush_job_info_); + } +#endif // !ROCKSDB_LITE + private: enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED }; @@ -479,17 +512,29 @@ class MemTable { // writes with sequence number smaller than seq are flushed. SequenceNumber atomic_flush_seqno_; + // Keeps track of memory usage in table_, arena_, and range_del_table_. + // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`. + std::atomic approximate_memory_usage_; + +#ifndef ROCKSDB_LITE + // Flush job info of the current memtable.
+ std::unique_ptr flush_job_info_; +#endif // !ROCKSDB_LITE + // Returns a heuristic flush decision - bool ShouldFlushNow() const; + bool ShouldFlushNow(); // Updates flush_state_ using ShouldFlushNow() void UpdateFlushState(); void UpdateOldestKeyTime(); - // No copying allowed - MemTable(const MemTable&); - MemTable& operator=(const MemTable&); + void GetFromTable(const LookupKey& key, + SequenceNumber max_covering_tombstone_seq, bool do_merge, + ReadCallback* callback, bool* is_blob_index, + std::string* value, Status* s, MergeContext* merge_context, + SequenceNumber* seq, bool* found_final_value, + bool* merge_in_progress); }; extern const char* EncodeKey(std::string* scratch, const Slice& target); diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 5abe59b3632..d9159b7937f 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -5,26 +5,22 @@ // #include "db/memtable_list.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/memtable.h" #include "db/range_tombstone_fragmenter.h" #include "db/version_set.h" +#include "logging/log_buffer.h" #include "monitoring/thread_status_util.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "table/merging_iterator.h" +#include "test_util/sync_point.h" #include "util/coding.h" -#include "util/log_buffer.h" -#include "util/sync_point.h" namespace rocksdb { @@ -50,6 +46,8 @@ MemTableListVersion::MemTableListVersion( size_t* parent_memtable_list_memory_usage, MemTableListVersion* old) : max_write_buffer_number_to_maintain_( old->max_write_buffer_number_to_maintain_), + max_write_buffer_size_to_maintain_( + old->max_write_buffer_size_to_maintain_), parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) { if (old != nullptr) { memlist_ = old->memlist_; @@ -66,8 +64,10 @@ MemTableListVersion::MemTableListVersion( MemTableListVersion::MemTableListVersion( size_t* parent_memtable_list_memory_usage, - int max_write_buffer_number_to_maintain) + int max_write_buffer_number_to_maintain, + int64_t max_write_buffer_size_to_maintain) : max_write_buffer_number_to_maintain_(max_write_buffer_number_to_maintain), + max_write_buffer_size_to_maintain_(max_write_buffer_size_to_maintain), parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {} void MemTableListVersion::Ref() { ++refs_; } @@ -113,6 +113,31 @@ bool MemTableListVersion::Get(const LookupKey& key, std::string* value, is_blob_index); } +void MemTableListVersion::MultiGet(const ReadOptions& read_options, + MultiGetRange* range, ReadCallback* callback, + bool* is_blob) { + for (auto memtable : memlist_) { + memtable->MultiGet(read_options, range, callback, is_blob); + if (range->empty()) { + return; + } + } +} + +bool MemTableListVersion::GetMergeOperands( + const LookupKey& key, Status* s, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts) { + for (MemTable* memtable : memlist_) { + bool done = memtable->Get(key, nullptr, s, merge_context, + max_covering_tombstone_seq, read_opts, nullptr, + nullptr, false); + if (done) { + return true; + } + } + return false; +} + bool MemTableListVersion::GetFromHistory( const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, @@ -230,7 +255,7 @@ void MemTableListVersion::Add(MemTable* m, autovector* to_delete) { assert(refs_ == 1); // 
only when refs_ == 1 is MemTableListVersion mutable AddMemTable(m); - TrimHistory(to_delete); + TrimHistory(to_delete, m->ApproximateMemoryUsage()); } // Removes m from list of memtables not flushed. Caller should NOT Unref m. @@ -240,19 +265,51 @@ void MemTableListVersion::Remove(MemTable* m, memlist_.remove(m); m->MarkFlushed(); - if (max_write_buffer_number_to_maintain_ > 0) { + if (max_write_buffer_size_to_maintain_ > 0 || + max_write_buffer_number_to_maintain_ > 0) { memlist_history_.push_front(m); - TrimHistory(to_delete); + // Unable to get size of mutable memtable at this point, pass 0 to + // TrimHistory as a best effort. + TrimHistory(to_delete, 0); } else { UnrefMemTable(to_delete, m); } } +// return the total memory usage assuming the oldest flushed memtable is dropped +size_t MemTableListVersion::ApproximateMemoryUsageExcludingLast() { + size_t total_memtable_size = 0; + for (auto& memtable : memlist_) { + total_memtable_size += memtable->ApproximateMemoryUsage(); + } + for (auto& memtable : memlist_history_) { + total_memtable_size += memtable->ApproximateMemoryUsage(); + } + if (!memlist_history_.empty()) { + total_memtable_size -= memlist_history_.back()->ApproximateMemoryUsage(); + } + return total_memtable_size; +} + +bool MemTableListVersion::MemtableLimitExceeded(size_t usage) { + if (max_write_buffer_size_to_maintain_ > 0) { + // calculate the total memory usage after dropping the oldest flushed + // memtable, compare with max_write_buffer_size_to_maintain_ to decide + // whether to trim history + return ApproximateMemoryUsageExcludingLast() + usage >= + static_cast(max_write_buffer_size_to_maintain_); + } else if (max_write_buffer_number_to_maintain_ > 0) { + return memlist_.size() + memlist_history_.size() > + static_cast(max_write_buffer_number_to_maintain_); + } else { + return false; + } +} + // Make sure we don't use up too much space in history -void MemTableListVersion::TrimHistory(autovector* to_delete) { - while (memlist_.size() + memlist_history_.size() > - static_cast(max_write_buffer_number_to_maintain_) && - !memlist_history_.empty()) { +void MemTableListVersion::TrimHistory(autovector* to_delete, + size_t usage) { + while (MemtableLimitExceeded(usage) && !memlist_history_.empty()) { MemTable* x = memlist_history_.back(); memlist_history_.pop_back(); @@ -277,8 +334,12 @@ void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id, AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH); const auto& memlist = current_->memlist_; + bool atomic_flush = false; for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { MemTable* m = *it; + if (!atomic_flush && m->atomic_flush_seqno_ != kMaxSequenceNumber) { + atomic_flush = true; + } if (max_memtable_id != nullptr && m->GetID() > *max_memtable_id) { break; } @@ -292,7 +353,9 @@ void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id, ret->push_back(m); } } - flush_requested_ = false; // start-flush request is complete + if (!atomic_flush || num_flush_not_started_ == 0) { + flush_requested_ = false; // start-flush request is complete + } } void MemTableList::RollbackMemtableFlush(const autovector& mems, @@ -322,7 +385,8 @@ Status MemTableList::TryInstallMemtableFlushResults( const autovector& mems, LogsWithPrepTracker* prep_tracker, VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number, autovector* to_delete, Directory* db_directory, - LogBuffer* log_buffer) { + LogBuffer* log_buffer, + std::list>* committed_flush_jobs_info) { 
AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); mu->AssertHeld(); @@ -380,6 +444,14 @@ Status MemTableList::TryInstallMemtableFlushResults( cfd->GetName().c_str(), m->file_number_); edit_list.push_back(&m->edit_); memtables_to_flush.push_back(m); +#ifndef ROCKSDB_LITE + std::unique_ptr info = m->ReleaseFlushJobInfo(); + if (info != nullptr) { + committed_flush_jobs_info->push_back(std::move(info)); + } +#else + (void)committed_flush_jobs_info; +#endif // !ROCKSDB_LITE } batch_count++; } @@ -428,10 +500,12 @@ Status MemTableList::TryInstallMemtableFlushResults( cfd->GetName().c_str(), m->file_number_, mem_id); assert(m->file_number_ > 0); current_->Remove(m, to_delete); + UpdateMemoryUsageExcludingLast(); + ResetTrimHistoryNeeded(); ++mem_id; } } else { - for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; it++) { + for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) { MemTable* m = *it; // commit failed. setup state so that we can flush again. ROCKS_LOG_BUFFER(log_buffer, "Level-0 commit table #%" PRIu64 @@ -467,6 +541,15 @@ void MemTableList::Add(MemTable* m, autovector* to_delete) { if (num_flush_not_started_ == 1) { imm_flush_needed.store(true, std::memory_order_release); } + UpdateMemoryUsageExcludingLast(); + ResetTrimHistoryNeeded(); +} + +void MemTableList::TrimHistory(autovector* to_delete, size_t usage) { + InstallNewVersion(); + current_->TrimHistory(to_delete, usage); + UpdateMemoryUsageExcludingLast(); + ResetTrimHistoryNeeded(); } // Returns an estimate of the number of bytes of data in use. @@ -480,6 +563,20 @@ size_t MemTableList::ApproximateUnflushedMemTablesMemoryUsage() { size_t MemTableList::ApproximateMemoryUsage() { return current_memory_usage_; } +size_t MemTableList::ApproximateMemoryUsageExcludingLast() { + size_t usage = + current_memory_usage_excluding_last_.load(std::memory_order_relaxed); + return usage; +} + +// Update current_memory_usage_excluding_last_, need to call whenever state +// changes for MemtableListVersion (whenever InstallNewVersion() is called) +void MemTableList::UpdateMemoryUsageExcludingLast() { + size_t total_memtable_size = current_->ApproximateMemoryUsageExcludingLast(); + current_memory_usage_excluding_last_.store(total_memtable_size, + std::memory_order_relaxed); +} + uint64_t MemTableList::ApproximateOldestKeyTime() const { if (!current_->memlist_.empty()) { return current_->memlist_.back()->ApproximateOldestKeyTime(); @@ -592,7 +689,7 @@ Status InstallMemtableAtomicFlushResults( imm->InstallNewVersion(); } - if (s.ok() || s.IsShutdownInProgress()) { + if (s.ok() || s.IsColumnFamilyDropped()) { for (size_t i = 0; i != cfds.size(); ++i) { if (cfds[i]->IsDropped()) { continue; @@ -607,6 +704,8 @@ Status InstallMemtableAtomicFlushResults( cfds[i]->GetName().c_str(), m->GetFileNumber(), mem_id); imm->current_->Remove(m, to_delete); + imm->UpdateMemoryUsageExcludingLast(); + imm->ResetTrimHistoryNeeded(); } } } else { @@ -632,4 +731,31 @@ Status InstallMemtableAtomicFlushResults( return s; } +void MemTableList::RemoveOldMemTables(uint64_t log_number, + autovector* to_delete) { + assert(to_delete != nullptr); + InstallNewVersion(); + auto& memlist = current_->memlist_; + autovector old_memtables; + for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { + MemTable* mem = *it; + if (mem->GetNextLogNumber() > log_number) { + break; + } + old_memtables.push_back(mem); + } + + for (auto it = old_memtables.begin(); it != old_memtables.end(); ++it) { + MemTable* 
mem = *it; + current_->Remove(mem, to_delete); + --num_flush_not_started_; + if (0 == num_flush_not_started_) { + imm_flush_needed.store(false, std::memory_order_release); + } + } + + UpdateMemoryUsageExcludingLast(); + ResetTrimHistoryNeeded(); +} + } // namespace rocksdb diff --git a/db/memtable_list.h b/db/memtable_list.h index b56ad4932c4..d78a8b5ea9e 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -16,14 +16,14 @@ #include "db/logs_with_prep_tracker.h" #include "db/memtable.h" #include "db/range_del_aggregator.h" +#include "file/filename.h" +#include "logging/log_buffer.h" #include "monitoring/instrumented_mutex.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/types.h" #include "util/autovector.h" -#include "util/filename.h" -#include "util/log_buffer.h" namespace rocksdb { @@ -33,6 +33,8 @@ class InstrumentedMutex; class MergeIteratorBuilder; class MemTableList; +struct FlushJobInfo; + // keeps a list of immutable memtables in a vector. the list is immutable // if refcount is bigger than one. It is used as a state for Get() and // Iterator code paths @@ -44,7 +46,8 @@ class MemTableListVersion { explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage, MemTableListVersion* old = nullptr); explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage, - int max_write_buffer_number_to_maintain); + int max_write_buffer_number_to_maintain, + int64_t max_write_buffer_size_to_maintain); void Ref(); void Unref(autovector* to_delete = nullptr); @@ -71,6 +74,16 @@ class MemTableListVersion { read_opts, callback, is_blob_index); } + void MultiGet(const ReadOptions& read_options, MultiGetRange* range, + ReadCallback* callback, bool* is_blob); + + // Returns all the merge operands corresponding to the key by searching all + // memtables starting from the most recent one. + bool GetMergeOperands(const LookupKey& key, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + const ReadOptions& read_opts); + // Similar to Get(), but searches the Memtable history of memtables that // have already been flushed. Should only be used from in-memory only // queries (such as Transaction validation) as the history may contain @@ -132,7 +145,7 @@ class MemTableListVersion { // REQUIRE: m is an immutable memtable void Remove(MemTable* m, autovector* to_delete); - void TrimHistory(autovector* to_delete); + void TrimHistory(autovector* to_delete, size_t usage); bool GetFromList(std::list* list, const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, @@ -145,6 +158,14 @@ class MemTableListVersion { void UnrefMemTable(autovector* to_delete, MemTable* m); + // Calculate the total amount of memory used by memlist_ and memlist_history_ + // excluding the last MemTable in memlist_history_. The reason for excluding + // the last MemTable is to see if dropping the last MemTable will keep total + // memory usage above or equal to max_write_buffer_size_to_maintain_ + size_t ApproximateMemoryUsageExcludingLast(); + + bool MemtableLimitExceeded(size_t usage); + // Immutable MemTables that have not yet been flushed. std::list memlist_; @@ -153,8 +174,10 @@ class MemTableListVersion { std::list memlist_history_; // Maximum number of MemTables to keep in memory (including both flushed - // and not-yet-flushed tables). const int max_write_buffer_number_to_maintain_; + // Maximum size of MemTables to keep in memory (including both flushed + // and not-yet-flushed tables). 
+ const int64_t max_write_buffer_size_to_maintain_; int refs_ = 0; @@ -169,35 +192,41 @@ class MemTableListVersion { // recoverability from a crash. // // -// Other than imm_flush_needed, this class is not thread-safe and requires -// external synchronization (such as holding the db mutex or being on the -// write thread.) +// Other than imm_flush_needed and imm_trim_needed, this class is not +// thread-safe and requires external synchronization (such as holding the db +// mutex or being on the write thread.) class MemTableList { public: // A list of memtables. explicit MemTableList(int min_write_buffer_number_to_merge, - int max_write_buffer_number_to_maintain) + int max_write_buffer_number_to_maintain, + int64_t max_write_buffer_size_to_maintain) : imm_flush_needed(false), + imm_trim_needed(false), min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge), current_(new MemTableListVersion(¤t_memory_usage_, - max_write_buffer_number_to_maintain)), + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain)), num_flush_not_started_(0), commit_in_progress_(false), - flush_requested_(false) { + flush_requested_(false), + current_memory_usage_(0), + current_memory_usage_excluding_last_(0) { current_->Ref(); - current_memory_usage_ = 0; } // Should not delete MemTableList without making sure MemTableList::current() // is Unref()'d. ~MemTableList() {} - MemTableListVersion* current() { return current_; } + MemTableListVersion* current() const { return current_; } // so that background threads can detect non-nullptr pointer to // determine whether there is anything more to start flushing. std::atomic imm_flush_needed; + std::atomic imm_trim_needed; + // Returns the total number of memtables in the list that haven't yet // been flushed and logged. int NumNotFlushed() const; @@ -227,7 +256,8 @@ class MemTableList { const autovector& m, LogsWithPrepTracker* prep_tracker, VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number, autovector* to_delete, Directory* db_directory, - LogBuffer* log_buffer); + LogBuffer* log_buffer, + std::list>* committed_flush_jobs_info); // New memtables are inserted at the front of the list. // Takes ownership of the referenced held on *m by the caller of Add(). @@ -236,6 +266,18 @@ class MemTableList { // Returns an estimate of the number of bytes of data in use. size_t ApproximateMemoryUsage(); + // Returns the cached current_memory_usage_excluding_last_ value + size_t ApproximateMemoryUsageExcludingLast(); + + // Update current_memory_usage_excluding_last_ from MemtableListVersion + void UpdateMemoryUsageExcludingLast(); + + // `usage` is the current size of the mutable Memtable. When + // max_write_buffer_size_to_maintain is used, total size of mutable and + // immutable memtables is checked against it to decide whether to trim + // memtable list. + void TrimHistory(autovector* to_delete, size_t usage); + // Returns an estimate of the number of bytes of data used by // the unflushed mem-tables. 
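A hedged restatement of the trimming rule that the new `usage` parameter and max_write_buffer_size_to_maintain feed into (simplified signature, mirroring the MemtableLimitExceeded() logic shown earlier in this diff, not a drop-in replacement for it):

#include <cstddef>
#include <cstdint>

// excluding_last: total usage of the immutable memtables assuming the oldest
//                 flushed one is dropped (ApproximateMemoryUsageExcludingLast).
// usage:          current size of the mutable memtable.
bool ShouldTrimHistorySketch(int64_t max_write_buffer_size_to_maintain,
                             int max_write_buffer_number_to_maintain,
                             size_t excluding_last, size_t usage,
                             size_t num_memtables_total) {
  if (max_write_buffer_size_to_maintain > 0) {
    // Size-based limit: trim while dropping the oldest flushed memtable
    // would still leave us at or above the byte budget.
    return excluding_last + usage >=
           static_cast<size_t>(max_write_buffer_size_to_maintain);
  } else if (max_write_buffer_number_to_maintain > 0) {
    // Legacy count-based limit.
    return num_memtables_total >
           static_cast<size_t>(max_write_buffer_number_to_maintain);
  }
  return false;
}

For example, with a 64 MB write buffer and max_write_buffer_size_to_maintain set to 256 MB, trimming starts once the mutable and flushed memtables together reach 256 MB.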
size_t ApproximateUnflushedMemTablesMemoryUsage(); @@ -252,6 +294,20 @@ class MemTableList { bool HasFlushRequested() { return flush_requested_; } + // Returns true if a trim history should be scheduled and the caller should + // be the one to schedule it + bool MarkTrimHistoryNeeded() { + auto expected = false; + return imm_trim_needed.compare_exchange_strong( + expected, true, std::memory_order_relaxed, std::memory_order_relaxed); + } + + void ResetTrimHistoryNeeded() { + auto expected = true; + imm_trim_needed.compare_exchange_strong( + expected, false, std::memory_order_relaxed, std::memory_order_relaxed); + } + // Copying allowed // MemTableList(const MemTableList&); // void operator=(const MemTableList&); @@ -294,6 +350,13 @@ class MemTableList { } } + // Used only by DBImplSecondary during log replay. + // Remove memtables whose data were written before the WAL with log_number + // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables are + // not freed, but put into a vector for future deref and reclamation. + void RemoveOldMemTables(uint64_t log_number, + autovector* to_delete); + private: friend Status InstallMemtableAtomicFlushResults( const autovector* imm_lists, @@ -324,6 +387,8 @@ class MemTableList { // The current memory usage. size_t current_memory_usage_; + + std::atomic current_memory_usage_excluding_last_; }; // Installs memtable atomic flush results. diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index a14c13b893b..32a227f4b55 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -13,9 +13,9 @@ #include "rocksdb/db.h" #include "rocksdb/status.h" #include "rocksdb/write_buffer_manager.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -100,7 +100,7 @@ class MemTableListTest : public testing::Test { VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, - &write_controller); + &write_controller, /*block_cache_tracer=*/nullptr); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -117,9 +117,11 @@ class MemTableListTest : public testing::Test { // Create dummy mutex. InstrumentedMutex mutex; InstrumentedMutexLock l(&mutex); - return list->TryInstallMemtableFlushResults( + std::list> flush_jobs_info; + Status s = list->TryInstallMemtableFlushResults( cfd, mutable_cf_options, m, &dummy_prep_tracker, &versions, &mutex, - file_num, to_delete, nullptr, &log_buffer); + file_num, to_delete, nullptr, &log_buffer, &flush_jobs_info); + return s; } // Calls MemTableList::InstallMemtableFlushResults() and sets up all @@ -144,7 +146,7 @@ class MemTableListTest : public testing::Test { VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, - &write_controller); + &write_controller, /*block_cache_tracer=*/nullptr); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -183,7 +185,7 @@ class MemTableListTest : public testing::Test { TEST_F(MemTableListTest, Empty) { // Create an empty MemTableList and validate basic functions. 
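The MarkTrimHistoryNeeded()/ResetTrimHistoryNeeded() pair above follows a common claim-the-work pattern: a compare-and-swap lets exactly one thread win the right to schedule the trim. A minimal, hypothetical sketch of the same pattern (not the RocksDB class itself):

#include <atomic>

class TrimClaimSketch {
 public:
  // Returns true only for the caller that flips the flag from false to true;
  // that caller is the one who should schedule the trim.
  bool MarkNeeded() {
    bool expected = false;
    return needed_.compare_exchange_strong(expected, true,
                                           std::memory_order_relaxed,
                                           std::memory_order_relaxed);
  }
  // Called once the trim has been performed (or is no longer needed).
  void Reset() {
    bool expected = true;
    needed_.compare_exchange_strong(expected, false,
                                    std::memory_order_relaxed,
                                    std::memory_order_relaxed);
  }

 private:
  std::atomic<bool> needed_{false};
};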
- MemTableList list(1, 0); + MemTableList list(1, 0, 0); ASSERT_EQ(0, list.NumNotFlushed()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); @@ -202,8 +204,10 @@ TEST_F(MemTableListTest, GetTest) { // Create MemTableList int min_write_buffer_number_to_merge = 2; int max_write_buffer_number_to_maintain = 0; + int64_t max_write_buffer_size_to_maintain = 0; MemTableList list(min_write_buffer_number_to_merge, - max_write_buffer_number_to_maintain); + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain); SequenceNumber seq = 1; std::string value; @@ -312,8 +316,10 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { // Create MemTableList int min_write_buffer_number_to_merge = 2; int max_write_buffer_number_to_maintain = 2; + int64_t max_write_buffer_size_to_maintain = 2000; MemTableList list(min_write_buffer_number_to_merge, - max_write_buffer_number_to_maintain); + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain); SequenceNumber seq = 1; std::string value; @@ -514,8 +520,11 @@ TEST_F(MemTableListTest, FlushPendingTest) { // Create MemTableList int min_write_buffer_number_to_merge = 3; int max_write_buffer_number_to_maintain = 7; + int64_t max_write_buffer_size_to_maintain = + 7 * static_cast(options.write_buffer_size); MemTableList list(min_write_buffer_number_to_merge, - max_write_buffer_number_to_maintain); + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain); // Create some MemTables uint64_t memtable_id = 0; @@ -670,7 +679,9 @@ TEST_F(MemTableListTest, FlushPendingTest) { // created. So TryInstallMemtableFlushResults will install the first 3 tables // in to_flush and stop when it encounters a table not yet flushed. ASSERT_EQ(2, list.NumNotFlushed()); - int num_in_history = std::min(3, max_write_buffer_number_to_maintain); + int num_in_history = + std::min(3, static_cast(max_write_buffer_size_to_maintain) / + static_cast(options.write_buffer_size)); ASSERT_EQ(num_in_history, list.NumFlushed()); ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size()); @@ -687,7 +698,9 @@ TEST_F(MemTableListTest, FlushPendingTest) { // This will actually install 2 tables. The 1 we told it to flush, and also // tables[4] which has been waiting for tables[3] to commit. 
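As a quick check of the updated arithmetic in the test above (values taken from the test itself):

// max_write_buffer_size_to_maintain = 7 * write_buffer_size
// => max_write_buffer_size_to_maintain / write_buffer_size = 7
// first install:  num_in_history = std::min(3, 7) = 3
// second install: num_in_history = std::min(5, 7) = 5
// i.e. the size-based limit reproduces what max_write_buffer_number_to_maintain
// = 7 allowed before, so the expected counts in the test are unchanged.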
ASSERT_EQ(0, list.NumNotFlushed()); - num_in_history = std::min(5, max_write_buffer_number_to_maintain); + num_in_history = + std::min(5, static_cast(max_write_buffer_size_to_maintain) / + static_cast(options.write_buffer_size)); ASSERT_EQ(num_in_history, list.NumFlushed()); ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size()); @@ -730,7 +743,8 @@ TEST_F(MemTableListTest, FlushPendingTest) { list.current()->Unref(&to_delete); int to_delete_size = - std::min(num_tables, max_write_buffer_number_to_maintain); + std::min(num_tables, static_cast(max_write_buffer_size_to_maintain) / + static_cast(options.write_buffer_size)); ASSERT_EQ(to_delete_size, to_delete.size()); for (const auto& m : to_delete) { @@ -769,10 +783,13 @@ TEST_F(MemTableListTest, AtomicFlusTest) { // Create MemTableLists int min_write_buffer_number_to_merge = 3; int max_write_buffer_number_to_maintain = 7; + int64_t max_write_buffer_size_to_maintain = + 7 * static_cast(options.write_buffer_size); autovector lists; for (int i = 0; i != num_cfs; ++i) { lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge, - max_write_buffer_number_to_maintain)); + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain)); } autovector cf_ids; diff --git a/db/merge_helper.cc b/db/merge_helper.cc index 4a4d2fb714e..b5ae924ffc6 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -201,7 +201,15 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // want. Also if we're in compaction and it's a put, it would be nice to // run compaction filter on it. const Slice val = iter->value(); - const Slice* val_ptr = (kTypeValue == ikey.type) ? &val : nullptr; + const Slice* val_ptr; + if (kTypeValue == ikey.type && + (range_del_agg == nullptr || + !range_del_agg->ShouldDelete( + ikey, RangeDelPositioningMode::kForwardTraversal))) { + val_ptr = &val; + } else { + val_ptr = nullptr; + } std::string merge_result; s = TimedFullMerge(user_merge_operator_, ikey.user_key, val_ptr, merge_context_.GetOperands(), &merge_result, logger_, diff --git a/db/merge_helper_test.cc b/db/merge_helper_test.cc index b61092ee575..3386f9bd067 100644 --- a/db/merge_helper_test.cc +++ b/db/merge_helper_test.cc @@ -9,9 +9,9 @@ #include "db/merge_helper.h" #include "rocksdb/comparator.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/merge_test.cc b/db/merge_test.cc index 3bd4b9a6004..2965045d9df 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -7,6 +7,9 @@ #include #include +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "db/write_batch_internal.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" @@ -14,11 +17,8 @@ #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" -#include "db/dbformat.h" -#include "db/db_impl.h" -#include "db/write_batch_internal.h" +#include "test_util/testharness.h" #include "utilities/merge_operators.h" -#include "util/testharness.h" namespace rocksdb { diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc index 52175a07b74..096a50c1aed 100644 --- a/db/obsolete_files_test.cc +++ b/db/obsolete_files_test.cc @@ -13,17 +13,19 @@ #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" #include "db/version_set.h" #include 
"db/write_batch_internal.h" +#include "file/filename.h" +#include "port/stack_trace.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" -#include "util/filename.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" using std::cerr; using std::cout; @@ -32,60 +34,10 @@ using std::flush; namespace rocksdb { -class ObsoleteFilesTest : public testing::Test { +class ObsoleteFilesTest : public DBTestBase { public: - std::string dbname_; - Options options_; - DB* db_; - Env* env_; - int numlevels_; - - ObsoleteFilesTest() { - db_ = nullptr; - env_ = Env::Default(); - // Trigger compaction when the number of level 0 files reaches 2. - options_.level0_file_num_compaction_trigger = 2; - options_.disable_auto_compactions = false; - options_.delete_obsolete_files_period_micros = 0; // always do full purge - options_.enable_thread_tracking = true; - options_.write_buffer_size = 1024*1024*1000; - options_.target_file_size_base = 1024*1024*1000; - options_.max_bytes_for_level_base = 1024*1024*1000; - options_.WAL_ttl_seconds = 300; // Used to test log files - options_.WAL_size_limit_MB = 1024; // Used to test log files - dbname_ = test::PerThreadDBPath("obsolete_files_test"); - options_.wal_dir = dbname_ + "/wal_files"; - - // clean up all the files that might have been there before - std::vector old_files; - env_->GetChildren(dbname_, &old_files); - for (auto file : old_files) { - env_->DeleteFile(dbname_ + "/" + file); - } - env_->GetChildren(options_.wal_dir, &old_files); - for (auto file : old_files) { - env_->DeleteFile(options_.wal_dir + "/" + file); - } - - DestroyDB(dbname_, options_); - numlevels_ = 7; - EXPECT_OK(ReopenDB(true)); - } - - Status ReopenDB(bool create) { - delete db_; - if (create) { - DestroyDB(dbname_, options_); - } - db_ = nullptr; - options_.create_if_missing = create; - return DB::Open(options_, dbname_, &db_); - } - - void CloseDB() { - delete db_; - db_ = nullptr; - } + ObsoleteFilesTest() + : DBTestBase("/obsolete_files_test"), wal_dir_(dbname_ + "/wal_files") {} void AddKeys(int numkeys, int startkey) { WriteOptions options; @@ -98,50 +50,24 @@ class ObsoleteFilesTest : public testing::Test { } } - int numKeysInLevels( - std::vector &metadata, - std::vector *keysperlevel = nullptr) { - - if (keysperlevel != nullptr) { - keysperlevel->resize(numlevels_); - } - - int numKeys = 0; - for (size_t i = 0; i < metadata.size(); i++) { - int startkey = atoi(metadata[i].smallestkey.c_str()); - int endkey = atoi(metadata[i].largestkey.c_str()); - int numkeysinfile = (endkey - startkey + 1); - numKeys += numkeysinfile; - if (keysperlevel != nullptr) { - (*keysperlevel)[(int)metadata[i].level] += numkeysinfile; - } - fprintf(stderr, "level %d name %s smallest %s largest %s\n", - metadata[i].level, metadata[i].name.c_str(), - metadata[i].smallestkey.c_str(), - metadata[i].largestkey.c_str()); - } - return numKeys; - } - void createLevel0Files(int numFiles, int numKeysPerFile) { int startKey = 0; - DBImpl* dbi = reinterpret_cast(db_); for (int i = 0; i < numFiles; i++) { AddKeys(numKeysPerFile, startKey); startKey += numKeysPerFile; - ASSERT_OK(dbi->TEST_FlushMemTable()); - ASSERT_OK(dbi->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } } - void CheckFileTypeCounts(std::string& dir, - int required_log, 
- int required_sst, - int required_manifest) { + void CheckFileTypeCounts(const std::string& dir, int required_log, + int required_sst, int required_manifest) { std::vector filenames; env_->GetChildren(dir, &filenames); - int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; + int log_cnt = 0; + int sst_cnt = 0; + int manifest_cnt = 0; for (auto file : filenames) { uint64_t number; FileType type; @@ -155,9 +81,31 @@ class ObsoleteFilesTest : public testing::Test { ASSERT_EQ(required_sst, sst_cnt); ASSERT_EQ(required_manifest, manifest_cnt); } + + void ReopenDB() { + Options options = CurrentOptions(); + // Trigger compaction when the number of level 0 files reaches 2. + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.disable_auto_compactions = false; + options.delete_obsolete_files_period_micros = 0; // always do full purge + options.enable_thread_tracking = true; + options.write_buffer_size = 1024 * 1024 * 1000; + options.target_file_size_base = 1024 * 1024 * 1000; + options.max_bytes_for_level_base = 1024 * 1024 * 1000; + options.WAL_ttl_seconds = 300; // Used to test log files + options.WAL_size_limit_MB = 1024; // Used to test log files + options.wal_dir = wal_dir_; + Destroy(options); + Reopen(options); + } + + const std::string wal_dir_; }; TEST_F(ObsoleteFilesTest, RaceForObsoleteFileDeletion) { + ReopenDB(); + SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->LoadDependency({ {"DBImpl::BackgroundCallCompaction:FoundObsoleteFiles", "ObsoleteFilesTest::RaceForObsoleteFileDeletion:1"}, @@ -171,35 +119,33 @@ TEST_F(ObsoleteFilesTest, RaceForObsoleteFileDeletion) { }); SyncPoint::GetInstance()->SetCallBack( "DBImpl::CloseHelper:PendingPurgeFinished", [&](void* arg) { - std::vector* files_grabbed_for_purge_ptr = - reinterpret_cast*>(arg); + std::unordered_set* files_grabbed_for_purge_ptr = + reinterpret_cast*>(arg); ASSERT_TRUE(files_grabbed_for_purge_ptr->empty()); }); SyncPoint::GetInstance()->EnableProcessing(); createLevel0Files(2, 50000); - CheckFileTypeCounts(options_.wal_dir, 1, 0, 0); + CheckFileTypeCounts(wal_dir_, 1, 0, 0); - DBImpl* dbi = reinterpret_cast(db_); - port::Thread user_thread([&]() { + port::Thread user_thread([this]() { JobContext jobCxt(0); TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:1"); - dbi->TEST_LockMutex(); - dbi->FindObsoleteFiles(&jobCxt, - true /* force=true */, false /* no_full_scan=false */); - dbi->TEST_UnlockMutex(); + dbfull()->TEST_LockMutex(); + dbfull()->FindObsoleteFiles(&jobCxt, true /* force=true */, + false /* no_full_scan=false */); + dbfull()->TEST_UnlockMutex(); TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:2"); - dbi->PurgeObsoleteFiles(jobCxt); + dbfull()->PurgeObsoleteFiles(jobCxt); jobCxt.Clean(); }); user_thread.join(); - - CloseDB(); - SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) { + ReopenDB(); + SyncPoint::GetInstance()->DisableProcessing(); std::vector optsfiles_nums; std::vector optsfiles_keep; SyncPoint::GetInstance()->SetCallBack( @@ -213,23 +159,22 @@ TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) { SyncPoint::GetInstance()->EnableProcessing(); createLevel0Files(2, 50000); - CheckFileTypeCounts(options_.wal_dir, 1, 0, 0); + CheckFileTypeCounts(wal_dir_, 1, 0, 0); - DBImpl* dbi = static_cast(db_); - ASSERT_OK(dbi->DisableFileDeletions()); + ASSERT_OK(dbfull()->DisableFileDeletions()); for (int i = 0; i != 4; ++i) { if (i % 2) { - 
ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(), - {{"paranoid_file_checks", "false"}})); + ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(), + {{"paranoid_file_checks", "false"}})); } else { - ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(), - {{"paranoid_file_checks", "true"}})); + ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(), + {{"paranoid_file_checks", "true"}})); } } - ASSERT_OK(dbi->EnableFileDeletions(true /* force */)); + ASSERT_OK(dbfull()->EnableFileDeletions(true /* force */)); ASSERT_EQ(optsfiles_nums.size(), optsfiles_keep.size()); - CloseDB(); + Close(); std::vector files; int opts_file_count = 0; @@ -246,13 +191,22 @@ TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) { } } ASSERT_EQ(2, opts_file_count); - SyncPoint::GetInstance()->DisableProcessing(); } } //namespace rocksdb +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff --git a/db/options_file_test.cc b/db/options_file_test.cc index 0a9a34ff0b5..b86ecefa97a 100644 --- a/db/options_file_test.cc +++ b/db/options_file_test.cc @@ -6,11 +6,11 @@ #ifndef ROCKSDB_LITE #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { class OptionsFileTest : public testing::Test { diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index b7efec182a1..94eabff7ff5 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -17,9 +17,9 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" +#include "test_util/testharness.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/testharness.h" #include "utilities/merge_operators.h" bool FLAGS_random_key = false; diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 2dd0cff0b41..fea1e563cf2 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -12,9 +12,11 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "file/filename.h" +#include "logging/logging.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" @@ -22,19 +24,17 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" -#include "table/bloom_block.h" #include "table/meta_blocks.h" -#include "table/plain_table_factory.h" -#include "table/plain_table_key_coding.h" -#include "table/plain_table_reader.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_key_coding.h" +#include "table/plain/plain_table_reader.h" #include "table/table_builder.h" -#include "util/filename.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/hash.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" #include 
"utilities/merge_operators.h" using std::unique_ptr; @@ -142,6 +142,7 @@ class PlainTableDBTest : public testing::Test, options.prefix_extractor.reset(NewFixedPrefixTransform(8)); options.allow_mmap_reads = mmap_mode_; options.allow_concurrent_memtable_write = false; + options.unordered_write = false; return options; } @@ -301,6 +302,7 @@ class TestPlainTableReader : public PlainTableReader { EXPECT_TRUE(num_blocks_ptr != props->user_collected_properties.end()); } } + table_properties_.reset(props); } ~TestPlainTableReader() override {} @@ -391,11 +393,72 @@ class TestPlainTableFactory : public PlainTableFactory { const std::string column_family_name_; }; +TEST_P(PlainTableDBTest, BadOptions1) { + // Build with a prefix extractor + ASSERT_OK(Put("1000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + // Bad attempt to re-open without a prefix extractor + Options options = CurrentOptions(); + options.prefix_extractor.reset(); + Reopen(&options); + ASSERT_EQ( + "Invalid argument: Prefix extractor is missing when opening a PlainTable " + "built using a prefix extractor", + Get("1000000000000foo")); + + // Bad attempt to re-open with different prefix extractor + options.prefix_extractor.reset(NewFixedPrefixTransform(6)); + Reopen(&options); + ASSERT_EQ( + "Invalid argument: Prefix extractor given doesn't match the one used to " + "build PlainTable", + Get("1000000000000foo")); + + // Correct prefix extractor + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + Reopen(&options); + ASSERT_EQ("v1", Get("1000000000000foo")); +} + +TEST_P(PlainTableDBTest, BadOptions2) { + Options options = CurrentOptions(); + options.prefix_extractor.reset(); + options.create_if_missing = true; + DestroyAndReopen(&options); + // Build without a prefix extractor + // (apparently works even if hash_table_ratio > 0) + ASSERT_OK(Put("1000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + // Bad attempt to re-open with hash_table_ratio > 0 and no prefix extractor + Status s = TryReopen(&options); + ASSERT_EQ( + "Not implemented: PlainTable requires a prefix extractor enable prefix " + "hash mode.", + s.ToString()); + + // OK to open with hash_table_ratio == 0 and no prefix extractor + PlainTableOptions plain_table_options; + plain_table_options.hash_table_ratio = 0; + options.table_factory.reset(NewPlainTableFactory(plain_table_options)); + Reopen(&options); + ASSERT_EQ("v1", Get("1000000000000foo")); + + // OK to open newly with a prefix_extractor and hash table; builds index + // in memory. 
+ options = CurrentOptions(); + Reopen(&options); + ASSERT_EQ("v1", Get("1000000000000foo")); +} + TEST_P(PlainTableDBTest, Flush) { for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; huge_page_tlb_size += 2 * 1024 * 1024) { for (EncodingType encoding_type : {kPlain, kPrefix}) { - for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { + for (int bloom = -1; bloom <= 117; bloom += 117) { + const int bloom_bits = std::max(bloom, 0); + const bool full_scan_mode = bloom < 0; for (int total_order = 0; total_order <= 1; total_order++) { for (int store_index_in_file = 0; store_index_in_file <= 1; ++store_index_in_file) { @@ -413,7 +476,7 @@ TEST_P(PlainTableDBTest, Flush) { plain_table_options.index_sparseness = 2; plain_table_options.huge_page_tlb_size = huge_page_tlb_size; plain_table_options.encoding_type = encoding_type; - plain_table_options.full_scan_mode = false; + plain_table_options.full_scan_mode = full_scan_mode; plain_table_options.store_index_in_file = store_index_in_file; options.table_factory.reset( @@ -426,7 +489,7 @@ TEST_P(PlainTableDBTest, Flush) { plain_table_options.index_sparseness = 16; plain_table_options.huge_page_tlb_size = huge_page_tlb_size; plain_table_options.encoding_type = encoding_type; - plain_table_options.full_scan_mode = false; + plain_table_options.full_scan_mode = full_scan_mode; plain_table_options.store_index_in_file = store_index_in_file; options.table_factory.reset( @@ -453,20 +516,36 @@ TEST_P(PlainTableDBTest, Flush) { auto row = ptc.begin(); auto tp = row->second; - if (!store_index_in_file) { - ASSERT_EQ(total_order ? "4" : "12", - (tp->user_collected_properties) - .at("plain_table_hash_table_size")); - ASSERT_EQ("0", (tp->user_collected_properties) - .at("plain_table_sub_index_size")); + if (full_scan_mode) { + // Does not support Get/Seek + std::unique_ptr iter(dbfull()->NewIterator(ReadOptions())); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("0000000000000bar", iter->key().ToString()); + ASSERT_EQ("v2", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000000foo", iter->key().ToString()); + ASSERT_EQ("v3", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + ASSERT_TRUE(iter->status().ok()); } else { - ASSERT_EQ("0", (tp->user_collected_properties) - .at("plain_table_hash_table_size")); - ASSERT_EQ("0", (tp->user_collected_properties) - .at("plain_table_sub_index_size")); + if (!store_index_in_file) { + ASSERT_EQ(total_order ? 
"4" : "12", + (tp->user_collected_properties) + .at("plain_table_hash_table_size")); + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_sub_index_size")); + } else { + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_hash_table_size")); + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_sub_index_size")); + } + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); } - ASSERT_EQ("v3", Get("1000000000000foo")); - ASSERT_EQ("v2", Get("0000000000000bar")); } } } @@ -729,6 +808,64 @@ TEST_P(PlainTableDBTest, Iterator) { } } +namespace { +std::string NthKey(size_t n, char filler) { + std::string rv(16, filler); + rv[0] = n % 10; + rv[1] = (n / 10) % 10; + rv[2] = (n / 100) % 10; + rv[3] = (n / 1000) % 10; + return rv; +} +} // anonymous namespace + +TEST_P(PlainTableDBTest, BloomSchema) { + Options options = CurrentOptions(); + options.create_if_missing = true; + for (int bloom_locality = 0; bloom_locality <= 1; bloom_locality++) { + options.bloom_locality = bloom_locality; + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 16; + plain_table_options.bloom_bits_per_key = 3; // high FP rate for test + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + plain_table_options.huge_page_tlb_size = 0; + plain_table_options.encoding_type = kPlain; + + bool expect_bloom_not_match = false; + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, plain_table_options, 0 /* column_family_id */, + kDefaultColumnFamilyName)); + DestroyAndReopen(&options); + + for (unsigned i = 0; i < 2345; ++i) { + ASSERT_OK(Put(NthKey(i, 'y'), "added")); + } + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("added", Get(NthKey(42, 'y'))); + + for (unsigned i = 0; i < 32; ++i) { + // Known pattern of Bloom filter false positives can detect schema change + // with high probability. Known FPs stuffed into bits: + uint32_t pattern; + if (!bloom_locality) { + pattern = 1785868347UL; + } else if (CACHE_LINE_SIZE == 64U) { + pattern = 2421694657UL; + } else if (CACHE_LINE_SIZE == 128U) { + pattern = 788710956UL; + } else { + ASSERT_EQ(CACHE_LINE_SIZE, 256U); + pattern = 163905UL; + } + bool expect_fp = pattern & (1UL << i); + // fprintf(stderr, "expect_fp@%u: %d\n", i, (int)expect_fp); + expect_bloom_not_match = !expect_fp; + ASSERT_EQ("NOT_FOUND", Get(NthKey(i, 'n'))); + } + } +} + namespace { std::string MakeLongKey(size_t length, char c) { return std::string(length, c); diff --git a/db/pre_release_callback.h b/db/pre_release_callback.h index f91ef1b27ac..e4167904ff8 100644 --- a/db/pre_release_callback.h +++ b/db/pre_release_callback.h @@ -27,8 +27,12 @@ class PreReleaseCallback { // is_mem_disabled is currently used for debugging purposes to assert that // the callback is done from the right write queue. // If non-zero, log_number indicates the WAL log to which we wrote. + // index >= 0 specifies the order of callback in the same write thread. + // total > index specifies the total number of callbacks in the same write + // thread. Together with index, could be used to reduce the redundant + // operations among the callbacks. 
virtual Status Callback(SequenceNumber seq, bool is_mem_disabled, - uint64_t log_number) = 0; + uint64_t log_number, size_t index, size_t total) = 0; }; } // namespace rocksdb diff --git a/db/prefix_test.cc b/db/prefix_test.cc index ac854cb3dbd..19f02f1099a 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -17,7 +17,7 @@ int main() { #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "monitoring/histogram.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" @@ -26,12 +26,12 @@ int main() { #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "test_util/testharness.h" #include "util/coding.h" #include "util/gflags_compat.h" #include "util/random.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/testharness.h" #include "utilities/merge_operators.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; @@ -751,7 +751,7 @@ TEST_F(PrefixTest, PrefixSeekModePrev) { for (size_t k = 0; k < 9; k++) { if (rnd.OneIn(2) || it == whole_map.begin()) { iter->Next(); - it++; + ++it; if (FLAGS_enable_print) { std::cout << "Next >> "; } diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index 8f86528ecb2..7c188aeaa07 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -5,7 +5,7 @@ #include "db/range_del_aggregator.h" -#include "db/compaction_iteration_stats.h" +#include "db/compaction/compaction_iteration_stats.h" #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" #include "db/range_del_aggregator.h" diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index e593807d548..96cfb581309 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -13,7 +13,7 @@ #include #include -#include "db/compaction_iteration_stats.h" +#include "db/compaction/compaction_iteration_stats.h" #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" #include "db/range_del_aggregator.h" @@ -320,8 +320,10 @@ class RangeDelAggregator { RangeDelPositioningMode mode); void Invalidate() { - InvalidateForwardIter(); - InvalidateReverseIter(); + if (!IsEmpty()) { + InvalidateForwardIter(); + InvalidateReverseIter(); + } } bool IsRangeOverlapped(const Slice& start, const Slice& end); diff --git a/db/range_del_aggregator_bench.cc b/db/range_del_aggregator_bench.cc index 34b2f7e5db1..97ba6ca4f8a 100644 --- a/db/range_del_aggregator_bench.cc +++ b/db/range_del_aggregator_bench.cc @@ -23,10 +23,10 @@ int main() { #include "db/range_tombstone_fragmenter.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/random.h" #include "util/stop_watch.h" -#include "util/testutil.h" #include "util/gflags_compat.h" diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc index 28c8129ecb0..7ce666326a8 100644 --- a/db/range_del_aggregator_test.cc +++ b/db/range_del_aggregator_test.cc @@ -12,7 +12,7 @@ #include "db/db_test_util.h" #include "db/dbformat.h" #include "db/range_tombstone_fragmenter.h" -#include "util/testutil.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/range_tombstone_fragmenter.cc b/db/range_tombstone_fragmenter.cc index e3eb18908a5..fe64623fee5 100644 --- a/db/range_tombstone_fragmenter.cc +++ b/db/range_tombstone_fragmenter.cc @@ -9,8 +9,8 @@ #include #include -#include #include +#include #include "util/autovector.h" #include "util/kv_map.h" diff --git a/db/range_tombstone_fragmenter.h 
b/db/range_tombstone_fragmenter.h index a0b77b67771..23a28396fd4 100644 --- a/db/range_tombstone_fragmenter.h +++ b/db/range_tombstone_fragmenter.h @@ -144,6 +144,8 @@ class FragmentedRangeTombstoneIterator : public InternalIterator { void Invalidate() { pos_ = tombstones_->end(); seq_pos_ = tombstones_->seq_end(); + pinned_pos_ = tombstones_->end(); + pinned_seq_pos_ = tombstones_->seq_end(); } RangeTombstone Tombstone() const { diff --git a/db/range_tombstone_fragmenter_test.cc b/db/range_tombstone_fragmenter_test.cc index ddd3f774176..11f3574967d 100644 --- a/db/range_tombstone_fragmenter_test.cc +++ b/db/range_tombstone_fragmenter_test.cc @@ -7,7 +7,7 @@ #include "db/db_test_util.h" #include "rocksdb/comparator.h" -#include "util/testutil.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/read_callback.h b/db/read_callback.h index 60f91ef872d..d8801e65173 100644 --- a/db/read_callback.h +++ b/db/read_callback.h @@ -42,9 +42,6 @@ class ReadCallback { // Refresh to a more recent visible seq virtual void Refresh(SequenceNumber seq) { max_visible_seq_ = seq; } - // Refer to DBIter::CanReseekToSkip - virtual bool CanReseekToSkip() { return true; } - protected: // The max visible seq, it is usually the snapshot but could be larger if // transaction has its own writes written to db. diff --git a/db/repair.cc b/db/repair.cc index 2715adcf129..b71f725a285 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -34,6 +34,7 @@ // We scan every table to compute // (1) smallest/largest for the table // (2) largest sequence number in the table +// (3) oldest blob file referred to by the table (if applicable) // // If we are unable to scan the file, then we ignore the table. // @@ -60,13 +61,9 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include "db/builder.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -74,6 +71,8 @@ #include "db/table_cache.h" #include "db/version_edit.h" #include "db/write_batch_internal.h" +#include "file/filename.h" +#include "file/writable_file_writer.h" #include "options/cf_options.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" @@ -81,8 +80,6 @@ #include "rocksdb/options.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" -#include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/string_util.h" namespace rocksdb { @@ -113,11 +110,13 @@ class Repairer { // once. 
NewLRUCache(10, db_options_.table_cache_numshardbits)), table_cache_(new TableCache(default_cf_iopts_, env_options_, - raw_table_cache_.get())), + raw_table_cache_.get(), + /*block_cache_tracer=*/nullptr)), wb_(db_options_.db_write_buffer_size), wc_(db_options_.delayed_write_rate), vset_(dbname_, &immutable_db_options_, env_options_, - raw_table_cache_.get(), &wb_, &wc_), + raw_table_cache_.get(), &wb_, &wc_, + /*block_cache_tracer=*/nullptr), next_file_number_(1), db_lock_(nullptr) { for (const auto& cfd : column_families) { @@ -226,8 +225,6 @@ class Repairer { FileMetaData meta; uint32_t column_family_id; std::string column_family_name; - SequenceNumber min_sequence; - SequenceNumber max_sequence; }; std::string const dbname_; @@ -385,7 +382,8 @@ class Repairer { continue; } WriteBatchInternal::SetContents(&batch, record); - status = WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr); + status = + WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr); if (status.ok()) { counter += WriteBatchInternal::Count(&batch); } else { @@ -501,6 +499,7 @@ class Repairer { status = AddColumnFamily(props->column_family_name, t->column_family_id); } + t->meta.oldest_ancester_time = props->creation_time; } ColumnFamilyData* cfd = nullptr; if (status.ok()) { @@ -522,11 +521,12 @@ class Repairer { InternalIterator* iter = table_cache_->NewIterator( ropts, env_options_, cfd->internal_comparator(), t->meta, nullptr /* range_del_agg */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); - bool empty = true; + cfd->GetLatestMutableCFOptions()->prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false, + /*level=*/-1, /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); ParsedInternalKey parsed; - t->min_sequence = 0; - t->max_sequence = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { Slice key = iter->key(); if (!ParseInternalKey(key, &parsed)) { @@ -537,18 +537,9 @@ class Repairer { } counter++; - if (empty) { - empty = false; - t->meta.smallest.DecodeFrom(key); - t->min_sequence = parsed.sequence; - } - t->meta.largest.DecodeFrom(key); - if (parsed.sequence < t->min_sequence) { - t->min_sequence = parsed.sequence; - } - if (parsed.sequence > t->max_sequence) { - t->max_sequence = parsed.sequence; - } + + t->meta.UpdateBoundaries(key, iter->value(), parsed.sequence, + parsed.type); } if (!iter->status().ok()) { status = iter->status(); @@ -567,8 +558,8 @@ class Repairer { SequenceNumber max_sequence = 0; for (size_t i = 0; i < tables_.size(); i++) { cf_id_to_tables[tables_[i].column_family_id].push_back(&tables_[i]); - if (max_sequence < tables_[i].max_sequence) { - max_sequence = tables_[i].max_sequence; + if (max_sequence < tables_[i].meta.fd.largest_seqno) { + max_sequence = tables_[i].meta.fd.largest_seqno; } } vset_.SetLastAllocatedSequence(max_sequence); @@ -586,10 +577,13 @@ class Repairer { // TODO(opt): separate out into multiple levels for (const auto* table : cf_id_and_tables.second) { - edit.AddFile(0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(), - table->meta.fd.GetFileSize(), table->meta.smallest, - table->meta.largest, table->min_sequence, - table->max_sequence, table->meta.marked_for_compaction); + edit.AddFile( + 0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(), + table->meta.fd.GetFileSize(), table->meta.smallest, + table->meta.largest, table->meta.fd.smallest_seqno, + table->meta.fd.largest_seqno, 
table->meta.marked_for_compaction, + table->meta.oldest_blob_file_number, + table->meta.oldest_ancester_time, table->meta.file_creation_time); } assert(next_file_number_ > 0); vset_.MarkFileNumberUsed(next_file_number_ - 1); diff --git a/db/repair_test.cc b/db/repair_test.cc index 3422532da4b..21907e43575 100644 --- a/db/repair_test.cc +++ b/db/repair_test.cc @@ -9,12 +9,12 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" +#include "file/file_util.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/transaction_log.h" -#include "util/file_util.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/table_cache.cc b/db/table_cache.cc index 06255d6a354..89acd3d84e6 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -11,21 +11,23 @@ #include "db/dbformat.h" #include "db/range_tombstone_fragmenter.h" +#include "db/snapshot_impl.h" #include "db/version_edit.h" -#include "util/filename.h" - +#include "file/filename.h" +#include "file/random_access_file_reader.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/statistics.h" +#include "table/block_based/block_based_table_reader.h" #include "table/get_context.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" #include "table/multiget_context.h" #include "table/table_builder.h" #include "table/table_reader.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" #include "util/coding.h" -#include "util/file_reader_writer.h" #include "util/stop_watch.h" -#include "util/sync_point.h" namespace rocksdb { @@ -43,13 +45,6 @@ static void UnrefEntry(void* arg1, void* arg2) { cache->Release(h); } -static void DeleteTableReader(void* arg1, void* arg2) { - TableReader* table_reader = reinterpret_cast(arg1); - Statistics* stats = reinterpret_cast(arg2); - RecordTick(stats, NO_FILE_CLOSES); - delete table_reader; -} - static Slice GetSliceForFileNumber(const uint64_t* file_number) { return Slice(reinterpret_cast(file_number), sizeof(*file_number)); @@ -68,11 +63,13 @@ void AppendVarint64(IterKey* key, uint64_t v) { } // namespace TableCache::TableCache(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, Cache* const cache) + const EnvOptions& env_options, Cache* const cache, + BlockCacheTracer* const block_cache_tracer) : ioptions_(ioptions), env_options_(env_options), cache_(cache), - immortal_tables_(false) { + immortal_tables_(false), + block_cache_tracer_(block_cache_tracer) { if (ioptions_.row_cache) { // If the same cache is shared by multiple instances, we need to // disambiguate its entries. 
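The row cache consulted above is owned by the application rather than by TableCache; a minimal sketch (application-level, not part of this patch; the cache size is arbitrary) of wiring it up through the public options:

#include "rocksdb/cache.h"
#include "rocksdb/options.h"

// The shared_ptr stored here is what later shows up as ioptions_.row_cache;
// because the same cache object may be shared by several DB instances, the
// constructor above prepends a per-instance row_cache_id_ to every key.
rocksdb::Options OptionsWithRowCache() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.row_cache = rocksdb::NewLRUCache(64 << 20);  // 64 MiB capacity
  return options;
}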
@@ -94,10 +91,10 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) { Status TableCache::GetTableReader( const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, - bool sequential_mode, size_t readahead, bool record_read_stats, - HistogramImpl* file_read_hist, std::unique_ptr* table_reader, + bool sequential_mode, bool record_read_stats, HistogramImpl* file_read_hist, + std::unique_ptr* table_reader, const SliceTransform* prefix_extractor, bool skip_filters, int level, - bool prefetch_index_and_filter_in_cache, bool for_compaction) { + bool prefetch_index_and_filter_in_cache) { std::string fname = TableFileName(ioptions_.cf_paths, fd.GetNumber(), fd.GetPathId()); std::unique_ptr file; @@ -105,13 +102,6 @@ Status TableCache::GetTableReader( RecordTick(ioptions_.statistics, NO_FILE_OPENS); if (s.ok()) { - if (readahead > 0 && !env_options.use_mmap_reads) { - // Not compatible with mmap files since ReadaheadRandomAccessFile requires - // its wrapped file's Read() to copy data into the provided scratch - // buffer, which mmap files don't use. - // TODO(ajkr): try madvise for mmap files in place of buffered readahead. - file = NewReadaheadRandomAccessFile(std::move(file), readahead); - } if (!sequential_mode && ioptions_.advise_random_on_open) { file->Hint(RandomAccessFile::RANDOM); } @@ -120,12 +110,11 @@ Status TableCache::GetTableReader( new RandomAccessFileReader( std::move(file), fname, ioptions_.env, record_read_stats ? ioptions_.statistics : nullptr, SST_READ_MICROS, - file_read_hist, ioptions_.rate_limiter, for_compaction, - ioptions_.listeners)); + file_read_hist, ioptions_.rate_limiter, ioptions_.listeners)); s = ioptions_.table_factory->NewTableReader( TableReaderOptions(ioptions_, prefix_extractor, env_options, internal_comparator, skip_filters, immortal_tables_, - level, fd.largest_seqno), + level, fd.largest_seqno, block_cache_tracer_), std::move(file_reader), fd.GetFileSize(), table_reader, prefetch_index_and_filter_in_cache); TEST_SYNC_POINT("TableCache::GetTableReader:0"); @@ -162,10 +151,9 @@ Status TableCache::FindTable(const EnvOptions& env_options, } std::unique_ptr table_reader; s = GetTableReader(env_options, internal_comparator, fd, - false /* sequential mode */, 0 /* readahead */, - record_read_stats, file_read_hist, &table_reader, - prefix_extractor, skip_filters, level, - prefetch_index_and_filter_in_cache); + false /* sequential mode */, record_read_stats, + file_read_hist, &table_reader, prefix_extractor, + skip_filters, level, prefetch_index_and_filter_in_cache); if (!s.ok()) { assert(table_reader == nullptr); RecordTick(ioptions_.statistics, NO_FILE_ERRORS); @@ -188,54 +176,27 @@ InternalIterator* TableCache::NewIterator( const InternalKeyComparator& icomparator, const FileMetaData& file_meta, RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor, TableReader** table_reader_ptr, HistogramImpl* file_read_hist, - bool for_compaction, Arena* arena, bool skip_filters, int level, + TableReaderCaller caller, Arena* arena, bool skip_filters, int level, const InternalKey* smallest_compaction_key, const InternalKey* largest_compaction_key) { PERF_TIMER_GUARD(new_table_iterator_nanos); Status s; - bool create_new_table_reader = false; TableReader* table_reader = nullptr; Cache::Handle* handle = nullptr; if (table_reader_ptr != nullptr) { *table_reader_ptr = nullptr; } - size_t readahead = 0; - if (for_compaction) { -#ifndef NDEBUG - bool use_direct_reads_for_compaction = 
env_options.use_direct_reads; - TEST_SYNC_POINT_CALLBACK("TableCache::NewIterator:for_compaction", - &use_direct_reads_for_compaction); -#endif // !NDEBUG - if (ioptions_.new_table_reader_for_compaction_inputs) { - // get compaction_readahead_size from env_options allows us to set the - // value dynamically - readahead = env_options.compaction_readahead_size; - create_new_table_reader = true; - } - } - + bool for_compaction = caller == TableReaderCaller::kCompaction; auto& fd = file_meta.fd; - if (create_new_table_reader) { - std::unique_ptr table_reader_unique_ptr; - s = GetTableReader( - env_options, icomparator, fd, true /* sequential_mode */, readahead, - !for_compaction /* record stats */, nullptr, &table_reader_unique_ptr, - prefix_extractor, false /* skip_filters */, level, - true /* prefetch_index_and_filter_in_cache */, for_compaction); + table_reader = fd.table_reader; + if (table_reader == nullptr) { + s = FindTable(env_options, icomparator, fd, &handle, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + !for_compaction /* record_read_stats */, file_read_hist, + skip_filters, level); if (s.ok()) { - table_reader = table_reader_unique_ptr.release(); - } - } else { - table_reader = fd.table_reader; - if (table_reader == nullptr) { - s = FindTable(env_options, icomparator, fd, &handle, prefix_extractor, - options.read_tier == kBlockCacheTier /* no_io */, - !for_compaction /* record read_stats */, file_read_hist, - skip_filters, level); - if (s.ok()) { - table_reader = GetTableReaderFromHandle(handle); - } + table_reader = GetTableReaderFromHandle(handle); } } InternalIterator* result = nullptr; @@ -245,13 +206,10 @@ InternalIterator* TableCache::NewIterator( result = NewEmptyInternalIterator(arena); } else { result = table_reader->NewIterator(options, prefix_extractor, arena, - skip_filters, for_compaction); + skip_filters, caller, + env_options.compaction_readahead_size); } - if (create_new_table_reader) { - assert(handle == nullptr); - result->RegisterCleanup(&DeleteTableReader, table_reader, - ioptions_.statistics); - } else if (handle != nullptr) { + if (handle != nullptr) { result->RegisterCleanup(&UnrefEntry, cache_, handle); handle = nullptr; // prevent from releasing below } @@ -296,6 +254,101 @@ InternalIterator* TableCache::NewIterator( return result; } +Status TableCache::GetRangeTombstoneIterator( + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + std::unique_ptr* out_iter) { + const FileDescriptor& fd = file_meta.fd; + Status s; + TableReader* t = fd.table_reader; + Cache::Handle* handle = nullptr; + if (t == nullptr) { + s = FindTable(env_options_, internal_comparator, fd, &handle); + if (s.ok()) { + t = GetTableReaderFromHandle(handle); + } + } + if (s.ok()) { + out_iter->reset(t->NewRangeTombstoneIterator(options)); + assert(out_iter); + } + return s; +} + +#ifndef ROCKSDB_LITE +void TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options, + const FileDescriptor& fd, + const Slice& internal_key, + GetContext* get_context, + IterKey& row_cache_key) { + uint64_t fd_number = fd.GetNumber(); + // We use the user key as cache key instead of the internal key, + // otherwise the whole cache would be invalidated every time the + // sequence key increases. However, to support caching snapshot + // reads, we append the sequence number (incremented by 1 to + // distinguish from 0) only in this case. 
+ // If the snapshot is larger than the largest seqno in the file, + // all data should be exposed to the snapshot, so we treat it + // the same as there is no snapshot. The exception is that if + // a seq-checking callback is registered, some internal keys + // may still be filtered out. + uint64_t seq_no = 0; + // Maybe we can include the whole file if snapshot == fd.largest_seqno. + if (options.snapshot != nullptr && + (get_context->has_callback() || + static_cast_with_check( + options.snapshot) + ->GetSequenceNumber() <= fd.largest_seqno)) { + // We should consider using options.snapshot->GetSequenceNumber() + // instead of GetInternalKeySeqno(k), which will make the code + // easier to understand. + seq_no = 1 + GetInternalKeySeqno(internal_key); + } + + // Compute row cache key. + row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(), + row_cache_id_.size()); + AppendVarint64(&row_cache_key, fd_number); + AppendVarint64(&row_cache_key, seq_no); +} + +bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, + size_t prefix_size, GetContext* get_context) { + bool found = false; + + row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size()); + if (auto row_handle = + ioptions_.row_cache->Lookup(row_cache_key.GetUserKey())) { + // Cleanable routine to release the cache entry + Cleanable value_pinner; + auto release_cache_entry_func = [](void* cache_to_clean, + void* cache_handle) { + ((Cache*)cache_to_clean)->Release((Cache::Handle*)cache_handle); + }; + auto found_row_cache_entry = + static_cast(ioptions_.row_cache->Value(row_handle)); + // If it comes here value is located on the cache. + // found_row_cache_entry points to the value on cache, + // and value_pinner has cleanup procedure for the cached entry. + // After replayGetContextLog() returns, get_context.pinnable_slice_ + // will point to cache entry buffer (or a copy based on that) and + // cleanup routine under value_pinner will be delegated to + // get_context.pinnable_slice_. Cache entry is released when + // get_context.pinnable_slice_ is reset. + value_pinner.RegisterCleanup(release_cache_entry_func, + ioptions_.row_cache.get(), row_handle); + replayGetContextLog(*found_row_cache_entry, user_key, get_context, + &value_pinner); + RecordTick(ioptions_.statistics, ROW_CACHE_HIT); + found = true; + } else { + RecordTick(ioptions_.statistics, ROW_CACHE_MISS); + } + return found; +} +#endif // ROCKSDB_LITE + + Status TableCache::Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const Slice& k, @@ -313,51 +366,11 @@ Status TableCache::Get(const ReadOptions& options, // Check row cache if enabled. Since row cache does not currently store // sequence numbers, we cannot use it if we need to fetch the sequence. if (ioptions_.row_cache && !get_context->NeedToReadSequence()) { - uint64_t fd_number = fd.GetNumber(); auto user_key = ExtractUserKey(k); - // We use the user key as cache key instead of the internal key, - // otherwise the whole cache would be invalidated every time the - // sequence key increases. However, to support caching snapshot - // reads, we append the sequence number (incremented by 1 to - // distinguish from 0) only in this case. - uint64_t seq_no = - options.snapshot == nullptr ? 0 : 1 + GetInternalKeySeqno(k); - - // Compute row cache key. 
- row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(), - row_cache_id_.size()); - AppendVarint64(&row_cache_key, fd_number); - AppendVarint64(&row_cache_key, seq_no); - row_cache_key.TrimAppend(row_cache_key.Size(), user_key.data(), - user_key.size()); - - if (auto row_handle = - ioptions_.row_cache->Lookup(row_cache_key.GetUserKey())) { - // Cleanable routine to release the cache entry - Cleanable value_pinner; - auto release_cache_entry_func = [](void* cache_to_clean, - void* cache_handle) { - ((Cache*)cache_to_clean)->Release((Cache::Handle*)cache_handle); - }; - auto found_row_cache_entry = static_cast( - ioptions_.row_cache->Value(row_handle)); - // If it comes here value is located on the cache. - // found_row_cache_entry points to the value on cache, - // and value_pinner has cleanup procedure for the cached entry. - // After replayGetContextLog() returns, get_context.pinnable_slice_ - // will point to cache entry buffer (or a copy based on that) and - // cleanup routine under value_pinner will be delegated to - // get_context.pinnable_slice_. Cache entry is released when - // get_context.pinnable_slice_ is reset. - value_pinner.RegisterCleanup(release_cache_entry_func, - ioptions_.row_cache.get(), row_handle); - replayGetContextLog(*found_row_cache_entry, user_key, get_context, - &value_pinner); - RecordTick(ioptions_.statistics, ROW_CACHE_HIT); - done = true; - } else { - // Not found, setting up the replay log. - RecordTick(ioptions_.statistics, ROW_CACHE_MISS); + CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key); + done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(), + get_context); + if (!done) { row_cache_entry = &row_cache_entry_buffer; } } @@ -417,8 +430,6 @@ Status TableCache::Get(const ReadOptions& options, } // Batched version of TableCache::MultiGet. -// TODO: Add support for row cache. As of now, this ignores the row cache -// and directly looks up in the table files Status TableCache::MultiGet(const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, @@ -430,7 +441,44 @@ Status TableCache::MultiGet(const ReadOptions& options, Status s; TableReader* t = fd.table_reader; Cache::Handle* handle = nullptr; - if (s.ok()) { + MultiGetRange table_range(*mget_range, mget_range->begin(), + mget_range->end()); +#ifndef ROCKSDB_LITE + autovector row_cache_entries; + IterKey row_cache_key; + size_t row_cache_key_prefix_size = 0; + KeyContext& first_key = *table_range.begin(); + bool lookup_row_cache = + ioptions_.row_cache && !first_key.get_context->NeedToReadSequence(); + + // Check row cache if enabled. Since row cache does not currently store + // sequence numbers, we cannot use it if we need to fetch the sequence. + if (lookup_row_cache) { + GetContext* first_context = first_key.get_context; + CreateRowCacheKeyPrefix(options, fd, first_key.ikey, first_context, + row_cache_key); + row_cache_key_prefix_size = row_cache_key.Size(); + + for (auto miter = table_range.begin(); miter != table_range.end(); + ++miter) { + const Slice& user_key = miter->ukey; + ; + GetContext* get_context = miter->get_context; + + if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size, + get_context)) { + table_range.SkipKey(miter); + } else { + row_cache_entries.emplace_back(); + get_context->SetReplayLog(&(row_cache_entries.back())); + } + } + } +#endif // ROCKSDB_LITE + + // Check that table_range is not empty. 
Its possible all keys may have been + // found in the row cache and thus the range may now be empty + if (s.ok() && !table_range.empty()) { if (t == nullptr) { s = FindTable( env_options_, internal_comparator, fd, &handle, prefix_extractor, @@ -445,21 +493,20 @@ Status TableCache::MultiGet(const ReadOptions& options, std::unique_ptr range_del_iter( t->NewRangeTombstoneIterator(options)); if (range_del_iter != nullptr) { - for (auto iter = mget_range->begin(); iter != mget_range->end(); + for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) { - const Slice& k = iter->ikey; SequenceNumber* max_covering_tombstone_seq = iter->get_context->max_covering_tombstone_seq(); - *max_covering_tombstone_seq = std::max( - *max_covering_tombstone_seq, - range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k))); + *max_covering_tombstone_seq = + std::max(*max_covering_tombstone_seq, + range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey)); } } } if (s.ok()) { - t->MultiGet(options, mget_range, prefix_extractor, skip_filters); + t->MultiGet(options, &table_range, prefix_extractor, skip_filters); } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { - for (auto iter = mget_range->begin(); iter != mget_range->end(); ++iter) { + for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) { Status* status = iter->s; if (status->IsIncomplete()) { // Couldn't find Table in cache but treat as kFound if no_io set @@ -470,6 +517,33 @@ Status TableCache::MultiGet(const ReadOptions& options, } } +#ifndef ROCKSDB_LITE + if (lookup_row_cache) { + size_t row_idx = 0; + + for (auto miter = table_range.begin(); miter != table_range.end(); + ++miter) { + std::string& row_cache_entry = row_cache_entries[row_idx++]; + const Slice& user_key = miter->ukey; + ; + GetContext* get_context = miter->get_context; + + get_context->SetReplayLog(nullptr); + // Compute row cache key. + row_cache_key.TrimAppend(row_cache_key_prefix_size, user_key.data(), + user_key.size()); + // Put the replay log in row cache only if something was found. 
+ if (s.ok() && !row_cache_entry.empty()) { + size_t charge = + row_cache_key.Size() + row_cache_entry.size() + sizeof(std::string); + void* row_ptr = new std::string(std::move(row_cache_entry)); + ioptions_.row_cache->Insert(row_cache_key.GetUserKey(), row_ptr, charge, + &DeleteEntry); + } + } + } +#endif // ROCKSDB_LITE + if (handle != nullptr) { ReleaseHandle(handle); } @@ -531,4 +605,57 @@ void TableCache::Evict(Cache* cache, uint64_t file_number) { cache->Erase(GetSliceForFileNumber(&file_number)); } +uint64_t TableCache::ApproximateOffsetOf( + const Slice& key, const FileDescriptor& fd, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + const SliceTransform* prefix_extractor) { + uint64_t result = 0; + TableReader* table_reader = fd.table_reader; + Cache::Handle* table_handle = nullptr; + if (table_reader == nullptr) { + const bool for_compaction = (caller == TableReaderCaller::kCompaction); + Status s = FindTable(env_options_, internal_comparator, fd, &table_handle, + prefix_extractor, false /* no_io */, + !for_compaction /* record_read_stats */); + if (s.ok()) { + table_reader = GetTableReaderFromHandle(table_handle); + } + } + + if (table_reader != nullptr) { + result = table_reader->ApproximateOffsetOf(key, caller); + } + if (table_handle != nullptr) { + ReleaseHandle(table_handle); + } + + return result; +} + +uint64_t TableCache::ApproximateSize( + const Slice& start, const Slice& end, const FileDescriptor& fd, + TableReaderCaller caller, const InternalKeyComparator& internal_comparator, + const SliceTransform* prefix_extractor) { + uint64_t result = 0; + TableReader* table_reader = fd.table_reader; + Cache::Handle* table_handle = nullptr; + if (table_reader == nullptr) { + const bool for_compaction = (caller == TableReaderCaller::kCompaction); + Status s = FindTable(env_options_, internal_comparator, fd, &table_handle, + prefix_extractor, false /* no_io */, + !for_compaction /* record_read_stats */); + if (s.ok()) { + table_reader = GetTableReaderFromHandle(table_handle); + } + } + + if (table_reader != nullptr) { + result = table_reader->ApproximateSize(start, end, caller); + } + if (table_handle != nullptr) { + ReleaseHandle(table_handle); + } + + return result; +} } // namespace rocksdb diff --git a/db/table_cache.h b/db/table_cache.h index 1e96dfa1bd5..088040672d8 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -23,6 +23,7 @@ #include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/table_reader.h" +#include "trace_replay/block_cache_tracer.h" namespace rocksdb { @@ -32,40 +33,56 @@ struct FileDescriptor; class GetContext; class HistogramImpl; +// Manages caching for TableReader objects for a column family. The actual +// cache is allocated separately and passed to the constructor. TableCache +// wraps around the underlying SST file readers by providing Get(), +// MultiGet() and NewIterator() methods that hide the instantiation, +// caching and access to the TableReader. The main purpose of this is +// performance - by caching the TableReader, it avoids unnecessary file opens +// and object allocation and instantiation. One exception is compaction, where +// a new TableReader may be instantiated - see NewIterator() comments +// +// Another service provided by TableCache is managing the row cache - if the +// DB is configured with a row cache, and the lookup key is present in the row +// cache, lookup is very fast. 
The row cache is obtained from +// ioptions.row_cache class TableCache { public: TableCache(const ImmutableCFOptions& ioptions, - const EnvOptions& storage_options, Cache* cache); + const EnvOptions& storage_options, Cache* cache, + BlockCacheTracer* const block_cache_tracer); ~TableCache(); // Return an iterator for the specified file number (the corresponding - // file length must be exactly "file_size" bytes). If "tableptr" is - // non-nullptr, also sets "*tableptr" to point to the Table object + // file length must be exactly "file_size" bytes). If "table_reader_ptr" + // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object // underlying the returned iterator, or nullptr if no Table object underlies - // the returned iterator. The returned "*tableptr" object is owned by - // the cache and should not be deleted, and is valid for as long as the + // the returned iterator. The returned "*table_reader_ptr" object is owned + // by the cache and should not be deleted, and is valid for as long as the // returned iterator is live. // @param range_del_agg If non-nullptr, adds range deletions to the // aggregator. If an error occurs, returns it in a NewErrorInternalIterator + // @param for_compaction If true, a new TableReader may be allocated (but + // not cached), depending on the CF options // @param skip_filters Disables loading/accessing the filter block // @param level The level this table is at, -1 for "not set / don't know" InternalIterator* NewIterator( const ReadOptions& options, const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, RangeDelAggregator* range_del_agg, - const SliceTransform* prefix_extractor = nullptr, - TableReader** table_reader_ptr = nullptr, - HistogramImpl* file_read_hist = nullptr, bool for_compaction = false, - Arena* arena = nullptr, bool skip_filters = false, int level = -1, - const InternalKey* smallest_compaction_key = nullptr, - const InternalKey* largest_compaction_key = nullptr); + const SliceTransform* prefix_extractor, TableReader** table_reader_ptr, + HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena, + bool skip_filters, int level, const InternalKey* smallest_compaction_key, + const InternalKey* largest_compaction_key); // If a seek to internal key "k" in specified file finds an entry, - // call (*handle_result)(arg, found_key, found_value) repeatedly until - // it returns false. - // @param get_context State for get operation. If its range_del_agg() returns - // non-nullptr, adds range deletions to the aggregator. If an error occurs, - // returns non-ok status. + // call get_context->SaveValue() repeatedly until + // it returns false. As a side effect, it will insert the TableReader + // into the cache and potentially evict another entry + // @param get_context Context for get operation. The result of the lookup + // can be retrieved by calling get_context->State() + // @param file_read_hist If non-nullptr, the file reader statistics are + // recorded // @param skip_filters Disables loading/accessing the filter block // @param level The level this table is at, -1 for "not set / don't know" Status Get(const ReadOptions& options, @@ -76,6 +93,23 @@ class TableCache { HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, int level = -1); + // Return the range delete tombstone iterator of the file specified by + // `file_meta`. 
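The tombstones returned by the iterator declared below originate from range deletions issued by the application through DB::DeleteRange(); a minimal sketch of that producing side (not part of this patch, `db` is assumed to be an open handle):

#include "rocksdb/db.h"

// Deletes every key in ["start", "end"). After the memtable is flushed, the
// resulting range tombstone is what GetRangeTombstoneIterator() reads back
// from the SST file's range-deletion block.
rocksdb::Status DeleteKeyRange(rocksdb::DB* db) {
  return db->DeleteRange(rocksdb::WriteOptions(), db->DefaultColumnFamily(),
                         "start", "end");
}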
+ Status GetRangeTombstoneIterator( + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + std::unique_ptr* out_iter); + + // If a seek to internal key "k" in specified file finds an entry, + // call get_context->SaveValue() repeatedly until + // it returns false. As a side effect, it will insert the TableReader + // into the cache and potentially evict another entry + // @param mget_range Pointer to the structure describing a batch of keys to + // be looked up in this table file. The result is stored + // in the embedded GetContext + // @param skip_filters Disables loading/accessing the filter block + // @param level The level this table is at, -1 for "not set / don't know" Status MultiGet(const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, @@ -127,6 +161,19 @@ class TableCache { const FileDescriptor& fd, const SliceTransform* prefix_extractor = nullptr); + // Returns approximated offset of a key in a file represented by fd. + uint64_t ApproximateOffsetOf( + const Slice& key, const FileDescriptor& fd, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + const SliceTransform* prefix_extractor = nullptr); + + // Returns approximated data size between start and end keys in a file + // represented by fd (the start key must not be greater than the end key). + uint64_t ApproximateSize(const Slice& start, const Slice& end, + const FileDescriptor& fd, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + const SliceTransform* prefix_extractor = nullptr); + // Release the handle from a cache void ReleaseHandle(Cache::Handle* handle); @@ -149,19 +196,31 @@ class TableCache { Status GetTableReader(const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, bool sequential_mode, - size_t readahead, bool record_read_stats, - HistogramImpl* file_read_hist, + bool record_read_stats, HistogramImpl* file_read_hist, std::unique_ptr* table_reader, const SliceTransform* prefix_extractor = nullptr, bool skip_filters = false, int level = -1, - bool prefetch_index_and_filter_in_cache = true, - bool for_compaction = false); + bool prefetch_index_and_filter_in_cache = true); + + // Create a key prefix for looking up the row cache. The prefix is of the + // format row_cache_id + fd_number + seq_no. Later, the user key can be + // appended to form the full key + void CreateRowCacheKeyPrefix(const ReadOptions& options, + const FileDescriptor& fd, + const Slice& internal_key, + GetContext* get_context, IterKey& row_cache_key); + + // Helper function to lookup the row cache for a key. 
It appends the + // user key to row_cache_key at offset prefix_size + bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, + size_t prefix_size, GetContext* get_context); const ImmutableCFOptions& ioptions_; const EnvOptions& env_options_; Cache* const cache_; std::string row_cache_id_; bool immortal_tables_; + BlockCacheTracer* const block_cache_tracer_; }; } // namespace rocksdb diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index ea561e982ff..e479fa008bf 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -9,19 +9,20 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/table_properties_collector.h" +#include "file/sequence_file_reader.h" +#include "file/writable_file_writer.h" #include "options/cf_options.h" #include "rocksdb/table.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/meta_blocks.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/table_builder.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" -#include "util/file_reader_writer.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index f92d563eb8e..42c724c0365 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -4,14 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif #include "db/transaction_log_impl.h" -#include +#include #include "db/write_batch_internal.h" -#include "util/file_reader_writer.h" +#include "file/sequence_file_reader.h" namespace rocksdb { @@ -81,19 +78,16 @@ Status TransactionLogIteratorImpl::status() { return current_status_; } bool TransactionLogIteratorImpl::Valid() { return started_ && is_valid_; } -bool TransactionLogIteratorImpl::RestrictedRead( - Slice* record, - std::string* scratch) { +bool TransactionLogIteratorImpl::RestrictedRead(Slice* record) { // Don't read if no more complete entries to read from logs if (current_last_seq_ >= versions_->LastSequence()) { return false; } - return current_log_reader_->ReadRecord(record, scratch); + return current_log_reader_->ReadRecord(record, &scratch_); } void TransactionLogIteratorImpl::SeekToStartSequence(uint64_t start_file_index, bool strict) { - std::string scratch; Slice record; started_ = false; is_valid_ = false; @@ -107,7 +101,7 @@ void TransactionLogIteratorImpl::SeekToStartSequence(uint64_t start_file_index, reporter_.Info(current_status_.ToString().c_str()); return; } - while (RestrictedRead(&record, &scratch)) { + while (RestrictedRead(&record)) { if (record.size() < WriteBatchInternal::kHeader) { reporter_.Corruption( record.size(), Status::Corruption("very small log record")); @@ -158,7 +152,6 @@ void TransactionLogIteratorImpl::Next() { } void TransactionLogIteratorImpl::NextImpl(bool internal) { - std::string scratch; Slice record; is_valid_ = false; if (!internal && !started_) { @@ -170,7 +163,7 @@ void TransactionLogIteratorImpl::NextImpl(bool internal) { if (current_log_reader_->IsEOF()) { current_log_reader_->UnmarkEOF(); } - while (RestrictedRead(&record, &scratch)) { + while (RestrictedRead(&record)) { if (record.size() < WriteBatchInternal::kHeader) { reporter_.Corruption( 
record.size(), Status::Corruption("very small log record")); @@ -202,7 +195,8 @@ void TransactionLogIteratorImpl::NextImpl(bool internal) { if (current_last_seq_ == versions_->LastSequence()) { current_status_ = Status::OK(); } else { - current_status_ = Status::Corruption("NO MORE DATA LEFT"); + const char* msg = "Create a new iterator to fetch the new tail."; + current_status_ = Status::TryAgain(msg); } return; } diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h index 6382b61a5b7..7d6993d1d0a 100644 --- a/db/transaction_log_impl.h +++ b/db/transaction_log_impl.h @@ -9,13 +9,13 @@ #include "db/log_reader.h" #include "db/version_set.h" +#include "file/filename.h" #include "options/db_options.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/transaction_log.h" #include "rocksdb/types.h" -#include "util/filename.h" namespace rocksdb { @@ -86,6 +86,7 @@ class TransactionLogIteratorImpl : public TransactionLogIterator { size_t current_file_index_; std::unique_ptr current_batch_; std::unique_ptr current_log_reader_; + std::string scratch_; Status OpenLogFile(const LogFile* log_file, std::unique_ptr* file); @@ -107,7 +108,7 @@ class TransactionLogIteratorImpl : public TransactionLogIterator { VersionSet const* const versions_; const bool seq_per_batch_; // Reads from transaction log only if the writebatch record has been written - bool RestrictedRead(Slice* record, std::string* scratch); + bool RestrictedRead(Slice* record); // Seeks to startingSequenceNumber reading from startFileIndex in files_. // If strict is set,then must get a batch starting with startingSequenceNumber void SeekToStartSequence(uint64_t start_file_index = 0, bool strict = false); diff --git a/db/trim_history_scheduler.cc b/db/trim_history_scheduler.cc new file mode 100644 index 00000000000..a213ac65f2c --- /dev/null +++ b/db/trim_history_scheduler.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/trim_history_scheduler.h" + +#include + +#include "db/column_family.h" + +namespace rocksdb { + +void TrimHistoryScheduler::ScheduleWork(ColumnFamilyData* cfd) { + std::lock_guard lock(checking_mutex_); + cfd->Ref(); + cfds_.push_back(cfd); + is_empty_.store(false, std::memory_order_relaxed); +} + +ColumnFamilyData* TrimHistoryScheduler::TakeNextColumnFamily() { + std::lock_guard lock(checking_mutex_); + while (true) { + if (cfds_.empty()) { + return nullptr; + } + ColumnFamilyData* cfd = cfds_.back(); + cfds_.pop_back(); + if (cfds_.empty()) { + is_empty_.store(true, std::memory_order_relaxed); + } + + if (!cfd->IsDropped()) { + // success + return cfd; + } + if (cfd->Unref()) { + // no longer relevant, retry + delete cfd; + } + } +} + +bool TrimHistoryScheduler::Empty() { + bool is_empty = is_empty_.load(std::memory_order_relaxed); + return is_empty; +} + +void TrimHistoryScheduler::Clear() { + ColumnFamilyData* cfd; + while ((cfd = TakeNextColumnFamily()) != nullptr) { + if (cfd->Unref()) { + delete cfd; + } + } + assert(Empty()); +} + +} // namespace rocksdb diff --git a/db/trim_history_scheduler.h b/db/trim_history_scheduler.h new file mode 100644 index 00000000000..e9013b96470 --- /dev/null +++ b/db/trim_history_scheduler.h @@ -0,0 +1,44 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include "util/autovector.h" + +namespace rocksdb { + +class ColumnFamilyData; + +// Similar to FlushScheduler, TrimHistoryScheduler is a FIFO queue that keeps +// track of column families whose flushed immutable memtables may need to be +// removed (aka trimmed). The actual trimming may be slightly delayed. Due to +// the use of the mutex and atomic variable, ScheduleWork, +// TakeNextColumnFamily, and, Empty can be called concurrently. +class TrimHistoryScheduler { + public: + TrimHistoryScheduler() : is_empty_(true) {} + + // When a column family needs history trimming, add cfd to the FIFO queue + void ScheduleWork(ColumnFamilyData* cfd); + + // Remove the column family from the queue, the caller is responsible for + // calling `MemtableList::TrimHistory` + ColumnFamilyData* TakeNextColumnFamily(); + + bool Empty(); + + void Clear(); + + // Not on critical path, use mutex to ensure thread safety + private: + std::atomic is_empty_; + autovector cfds_; + std::mutex checking_mutex_; +}; + +} // namespace rocksdb diff --git a/db/version_builder.cc b/db/version_builder.cc index 84e4dc6579a..53e25a446a8 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -9,13 +9,9 @@ #include "db/version_builder.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include #include +#include #include #include #include @@ -31,6 +27,7 @@ #include "db/version_set.h" #include "port/port.h" #include "table/table_reader.h" +#include "util/string_util.h" namespace rocksdb { @@ -142,12 +139,12 @@ class VersionBuilder::Rep { } } - void CheckConsistency(VersionStorageInfo* vstorage) { + Status CheckConsistency(VersionStorageInfo* vstorage) { #ifdef NDEBUG if (!vstorage->force_consistency_checks()) { // Dont run consistency checks in release mode except if // explicitly asked to - return; + return Status::OK(); } #endif // make sure the files are sorted correctly @@ -156,10 +153,14 @@ class VersionBuilder::Rep { for (size_t i = 1; i < level_files.size(); i++) { auto f1 = level_files[i - 1]; auto f2 = level_files[i]; +#ifndef NDEBUG + auto pair = std::make_pair(&f1, &f2); + TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistency", &pair); +#endif if (level == 0) { if (!level_zero_cmp_(f1, f2)) { fprintf(stderr, "L0 files are not sorted properly"); - abort(); + return Status::Corruption("L0 files are not sorted properly"); } if (f2->fd.smallest_seqno == f2->fd.largest_seqno) { @@ -172,7 +173,13 @@ class VersionBuilder::Rep { " vs. file with global_seqno %" PRIu64 "\n", f1->fd.smallest_seqno, f1->fd.largest_seqno, external_file_seqno); - abort(); + return Status::Corruption( + "L0 file with seqno " + + NumberToString(f1->fd.smallest_seqno) + " " + + NumberToString(f1->fd.largest_seqno) + + " vs. 
file with global_seqno" + + NumberToString(external_file_seqno) + " with fileNumber " + + NumberToString(f1->fd.GetNumber())); } } else if (f1->fd.smallest_seqno <= f2->fd.smallest_seqno) { fprintf(stderr, @@ -180,12 +187,19 @@ class VersionBuilder::Rep { " %" PRIu64 "\n", f1->fd.smallest_seqno, f1->fd.largest_seqno, f2->fd.smallest_seqno, f2->fd.largest_seqno); - abort(); + return Status::Corruption( + "L0 files seqno " + NumberToString(f1->fd.smallest_seqno) + + " " + NumberToString(f1->fd.largest_seqno) + " " + + NumberToString(f1->fd.GetNumber()) + " vs. " + + NumberToString(f2->fd.smallest_seqno) + " " + + NumberToString(f2->fd.largest_seqno) + " " + + NumberToString(f2->fd.GetNumber())); } } else { if (!level_nonzero_cmp_(f1, f2)) { fprintf(stderr, "L%d files are not sorted properly", level); - abort(); + return Status::Corruption("L" + NumberToString(level) + + " files are not sorted properly"); } // Make sure there is no overlap in levels > 0 @@ -194,20 +208,24 @@ class VersionBuilder::Rep { fprintf(stderr, "L%d have overlapping ranges %s vs. %s\n", level, (f1->largest).DebugString(true).c_str(), (f2->smallest).DebugString(true).c_str()); - abort(); + return Status::Corruption( + "L" + NumberToString(level) + " have overlapping ranges " + + (f1->largest).DebugString(true) + " vs. " + + (f2->smallest).DebugString(true)); } } } } + return Status::OK(); } - void CheckConsistencyForDeletes(VersionEdit* /*edit*/, uint64_t number, - int level) { + Status CheckConsistencyForDeletes(VersionEdit* /*edit*/, uint64_t number, + int level) { #ifdef NDEBUG if (!base_vstorage_->force_consistency_checks()) { // Dont run consistency checks in release mode except if // explicitly asked to - return; + return Status::OK(); } #endif // a file to be deleted better exist in the previous version @@ -245,8 +263,9 @@ class VersionBuilder::Rep { } if (!found) { fprintf(stderr, "not found %" PRIu64 "\n", number); - abort(); + return Status::Corruption("not found " + NumberToString(number)); } + return Status::OK(); } bool CheckConsistencyForNumLevels() { @@ -263,8 +282,11 @@ class VersionBuilder::Rep { } // Apply all of the edits in *edit to the current state. - void Apply(VersionEdit* edit) { - CheckConsistency(base_vstorage_); + Status Apply(VersionEdit* edit) { + Status s = CheckConsistency(base_vstorage_); + if (!s.ok()) { + return s; + } // Delete files const VersionEdit::DeletedFileSet& del = edit->GetDeletedFiles(); @@ -281,10 +303,7 @@ class VersionBuilder::Rep { levels_[level].added_files.erase(exising); } } else { - auto exising = invalid_levels_[level].find(number); - if (exising != invalid_levels_[level].end()) { - invalid_levels_[level].erase(exising); - } else { + if (invalid_levels_[level].erase(number) == 0) { // Deleting an non-existing file on invalid level. has_invalid_levels_ = true; } @@ -304,20 +323,29 @@ class VersionBuilder::Rep { levels_[level].added_files[f->fd.GetNumber()] = f; } else { uint64_t number = new_file.second.fd.GetNumber(); - if (invalid_levels_[level].count(number) == 0) { - invalid_levels_[level].insert(number); + auto& lvls = invalid_levels_[level]; + if (lvls.count(number) == 0) { + lvls.insert(number); } else { // Creating an already existing file on invalid level. has_invalid_levels_ = true; } } } + return s; } // Save the current state in *v. 
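With Apply() and SaveTo() now returning Status (see the SaveTo() hunk below), callers can propagate consistency failures instead of aborting; a minimal sketch of such a caller (an assumed helper, not part of this patch):

#include "db/version_builder.h"
#include "db/version_edit.h"
#include "db/version_set.h"

namespace rocksdb {

// Hypothetical helper: applies one edit and saves the result, surfacing
// errors such as Status::Corruption("L0 files are not sorted properly").
Status ApplyEditAndSave(VersionBuilder* builder, VersionEdit* edit,
                        VersionStorageInfo* vstorage) {
  Status s = builder->Apply(edit);
  if (s.ok()) {
    s = builder->SaveTo(vstorage);
  }
  return s;
}

}  // namespace rocksdb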
- void SaveTo(VersionStorageInfo* vstorage) { - CheckConsistency(base_vstorage_); - CheckConsistency(vstorage); + Status SaveTo(VersionStorageInfo* vstorage) { + Status s = CheckConsistency(base_vstorage_); + if (!s.ok()) { + return s; + } + + s = CheckConsistency(vstorage); + if (!s.ok()) { + return s; + } for (int level = 0; level < num_levels_; level++) { const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_; @@ -361,7 +389,8 @@ class VersionBuilder::Rep { } } - CheckConsistency(vstorage); + s = CheckConsistency(vstorage); + return s; } Status LoadTableHandlers(InternalStats* internal_stats, int max_threads, @@ -479,23 +508,23 @@ VersionBuilder::VersionBuilder(const EnvOptions& env_options, VersionBuilder::~VersionBuilder() { delete rep_; } -void VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) { - rep_->CheckConsistency(vstorage); +Status VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) { + return rep_->CheckConsistency(vstorage); } -void VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit, - uint64_t number, int level) { - rep_->CheckConsistencyForDeletes(edit, number, level); +Status VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit, + uint64_t number, int level) { + return rep_->CheckConsistencyForDeletes(edit, number, level); } bool VersionBuilder::CheckConsistencyForNumLevels() { return rep_->CheckConsistencyForNumLevels(); } -void VersionBuilder::Apply(VersionEdit* edit) { rep_->Apply(edit); } +Status VersionBuilder::Apply(VersionEdit* edit) { return rep_->Apply(edit); } -void VersionBuilder::SaveTo(VersionStorageInfo* vstorage) { - rep_->SaveTo(vstorage); +Status VersionBuilder::SaveTo(VersionStorageInfo* vstorage) { + return rep_->SaveTo(vstorage); } Status VersionBuilder::LoadTableHandlers( diff --git a/db/version_builder.h b/db/version_builder.h index 168301fdd61..f5fd121897b 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -27,12 +27,12 @@ class VersionBuilder { VersionBuilder(const EnvOptions& env_options, TableCache* table_cache, VersionStorageInfo* base_vstorage, Logger* info_log = nullptr); ~VersionBuilder(); - void CheckConsistency(VersionStorageInfo* vstorage); - void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, - int level); + Status CheckConsistency(VersionStorageInfo* vstorage); + Status CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, + int level); bool CheckConsistencyForNumLevels(); - void Apply(VersionEdit* edit); - void SaveTo(VersionStorageInfo* vstorage); + Status Apply(VersionEdit* edit); + Status SaveTo(VersionStorageInfo* vstorage); Status LoadTableHandlers(InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 514952bb5b1..64d2d2481eb 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -6,10 +6,10 @@ #include #include "db/version_edit.h" #include "db/version_set.h" -#include "util/logging.h" +#include "logging/logging.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -59,14 +59,12 @@ class VersionBuilderTest : public testing::Test { bool sampled = false, SequenceNumber smallest_seqno = 0, SequenceNumber largest_seqno = 0) { assert(level < vstorage_.num_levels()); - FileMetaData* f = new FileMetaData; - f->fd = FileDescriptor(file_number, path_id, file_size); - 
f->smallest = GetInternalKey(smallest, smallest_seq); - f->largest = GetInternalKey(largest, largest_seq); - f->fd.smallest_seqno = smallest_seqno; - f->fd.largest_seqno = largest_seqno; + FileMetaData* f = new FileMetaData( + file_number, path_id, file_size, GetInternalKey(smallest, smallest_seq), + GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno, + /* marked_for_compact */ false, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); f->compensated_file_size = file_size; - f->refs = 0; f->num_entries = num_entries; f->num_deletions = num_deletions; vstorage_.AddFile(level, f); @@ -115,7 +113,9 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) { VersionEdit version_edit; version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), - GetInternalKey("350"), 200, 200, false); + GetInternalKey("350"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.DeleteFile(3, 27U); EnvOptions env_options; @@ -149,7 +149,9 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) { VersionEdit version_edit; version_edit.AddFile(3, 666, 0, 100U, GetInternalKey("301"), - GetInternalKey("350"), 200, 200, false); + GetInternalKey("350"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); @@ -186,7 +188,9 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) { VersionEdit version_edit; version_edit.AddFile(4, 666, 0, 100U, GetInternalKey("301"), - GetInternalKey("350"), 200, 200, false); + GetInternalKey("350"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); version_edit.DeleteFile(4, 6U); @@ -214,15 +218,25 @@ TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) { VersionEdit version_edit; version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), - GetInternalKey("350"), 200, 200, false); + GetInternalKey("350"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), - GetInternalKey("450"), 200, 200, false); + GetInternalKey("450"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), - GetInternalKey("650"), 200, 200, false); + GetInternalKey("650"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), - GetInternalKey("550"), 200, 200, false); + GetInternalKey("550"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), - GetInternalKey("750"), 200, 200, false); + GetInternalKey("750"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); EnvOptions env_options; @@ -248,24 +262,38 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) { VersionEdit version_edit; version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), - GetInternalKey("350"), 200, 200, false); + GetInternalKey("350"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), - GetInternalKey("450"), 200, 200, false); + GetInternalKey("450"), 
200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), - GetInternalKey("650"), 200, 200, false); + GetInternalKey("650"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), - GetInternalKey("550"), 200, 200, false); + GetInternalKey("550"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), - GetInternalKey("750"), 200, 200, false); + GetInternalKey("750"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_builder.Apply(&version_edit); VersionEdit version_edit2; version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"), - GetInternalKey("950"), 200, 200, false); + GetInternalKey("950"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit2.DeleteFile(2, 616); version_edit2.DeleteFile(2, 636); version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"), - GetInternalKey("850"), 200, 200, false); + GetInternalKey("850"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_builder.Apply(&version_edit2); version_builder.SaveTo(&new_vstorage); diff --git a/db/version_edit.cc b/db/version_edit.cc index 01ec44515a7..dc1d821d975 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -9,17 +9,23 @@ #include "db/version_edit.h" +#include "db/blob_index.h" #include "db/version_set.h" +#include "logging/event_logger.h" #include "rocksdb/slice.h" +#include "test_util/sync_point.h" #include "util/coding.h" -#include "util/event_logger.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { +// Mask for an identified tag from the future which can be safely ignored. +const uint32_t kTagSafeIgnoreMask = 1 << 13; + // Tag numbers for serialized VersionEdit. These numbers are written to -// disk and should not be changed. +// disk and should not be changed. The number should be forward compatible so +// users can down-grade RocksDB safely. A future Tag is ignored by doing '&' +// between Tag and kTagSafeIgnoreMask field. enum Tag : uint32_t { kComparator = 1, kLogNumber = 2, @@ -31,6 +37,8 @@ enum Tag : uint32_t { // 8 was used for large value refs kPrevLogNumber = 9, kMinLogNumberToKeep = 10, + // Ignore-able field + kDbId = kTagSafeIgnoreMask + 1, // these are new formats divergent from open source leveldb kNewFile2 = 100, @@ -44,9 +52,6 @@ enum Tag : uint32_t { kInAtomicGroup = 300, }; -// Mask for an identified tag from the future which can be safely ignored. -uint32_t kTagSafeIgnoreMask = 1 << 13; - enum CustomTag : uint32_t { kTerminate = 1, // The end of customized fields kNeedCompaction = 2, @@ -55,6 +60,9 @@ enum CustomTag : uint32_t { // kMinLogNumberToKeep as part of a CustomTag as a hack. This should be // removed when manifest becomes forward-comptabile. 
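The comments above describe the manifest's forward-compatibility scheme: each field is written as a tag followed by a length-prefixed payload, and a reader that does not recognize a tag may skip it only when the tag carries the safe-to-ignore bit (kTagSafeIgnoreMask for top-level tags such as the new kDbId; the per-file custom tags use the inverse kCustomTagNonSafeIgnoreMask convention). Below is a self-contained toy version of that idea, not the RocksDB encoder: it uses single-byte tags and lengths where the real code uses PutVarint32 and PutLengthPrefixedSlice, and the tag values are made up.

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>

// Toy safe-to-ignore bit; the manifest uses 1 << 13 for top-level tags.
constexpr uint32_t kSafeIgnoreMask = 1u << 6;

// Toy encoders; the real code uses PutVarint32 / PutLengthPrefixedSlice.
void PutTag(std::string* dst, uint32_t tag) {
  dst->push_back(static_cast<char>(tag));
}
void PutField(std::string* dst, uint32_t tag, const std::string& payload) {
  PutTag(dst, tag);
  dst->push_back(static_cast<char>(payload.size()));  // length prefix
  dst->append(payload);                                // payload bytes
}

// Decode until the terminating tag (0 here). Unknown tags are skipped only
// when they carry the safe-to-ignore bit; otherwise the record is rejected,
// so an old binary never silently drops a field it was required to honor.
bool Decode(const std::string& in) {
  size_t pos = 0;
  while (pos < in.size()) {
    const uint32_t tag = static_cast<unsigned char>(in[pos++]);
    if (tag == 0) {
      return true;  // terminator reached, record understood
    }
    if (pos >= in.size()) {
      return false;
    }
    const size_t len = static_cast<unsigned char>(in[pos++]);
    if (pos + len > in.size()) {
      return false;
    }
    if (tag == 2) {
      // A known field: interpret in[pos, pos + len) here.
    } else if ((tag & kSafeIgnoreMask) == 0) {
      return false;  // unknown *and* not ignorable
    }
    pos += len;  // known or ignorable: consume the payload either way
  }
  return false;  // ran out of input before the terminator
}

int main() {
  std::string rec;
  PutField(&rec, 2, "known");
  PutField(&rec, 3 | kSafeIgnoreMask, "from-the-future");  // skippable field
  PutTag(&rec, 0);
  std::cout << (Decode(rec) ? "decoded" : "rejected") << "\n";  // decoded

  std::string bad;
  PutField(&bad, 3, "mandatory-but-unknown");
  PutTag(&bad, 0);
  std::cout << (Decode(bad) ? "decoded" : "rejected") << "\n";  // rejected
  return 0;
}
```

This is what lets kDbId sit above kTagSafeIgnoreMask: an older binary replaying a newer MANIFEST simply skips the DB id field instead of failing recovery.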
kMinLogNumberToKeepHack = 3, + kOldestBlobFileNumber = 4, + kOldestAncesterTime = 5, + kFileCreationTime = 6, kPathId = 65, }; // If this bit for the custom tag is set, opening DB should fail if @@ -66,7 +74,51 @@ uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) { return number | (path_id * (kFileNumberMask + 1)); } +void FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value, + SequenceNumber seqno, + ValueType value_type) { + if (smallest.size() == 0) { + smallest.DecodeFrom(key); + } + largest.DecodeFrom(key); + fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); + fd.largest_seqno = std::max(fd.largest_seqno, seqno); + +#ifndef ROCKSDB_LITE + if (value_type == kTypeBlobIndex) { + BlobIndex blob_index; + const Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + return; + } + + if (blob_index.IsInlined()) { + return; + } + + if (blob_index.HasTTL()) { + return; + } + + // Paranoid check: this should not happen because BlobDB numbers the blob + // files starting from 1. + if (blob_index.file_number() == kInvalidBlobFileNumber) { + return; + } + + if (oldest_blob_file_number == kInvalidBlobFileNumber || + oldest_blob_file_number > blob_index.file_number()) { + oldest_blob_file_number = blob_index.file_number(); + } + } +#else + (void)value; + (void)value_type; +#endif +} + void VersionEdit::Clear() { + db_id_.clear(); comparator_.clear(); max_level_ = 0; log_number_ = 0; @@ -75,6 +127,7 @@ void VersionEdit::Clear() { next_file_number_ = 0; max_column_family_ = 0; min_log_number_to_keep_ = 0; + has_db_id_ = false; has_comparator_ = false; has_log_number_ = false; has_prev_log_number_ = false; @@ -93,6 +146,10 @@ void VersionEdit::Clear() { } bool VersionEdit::EncodeTo(std::string* dst) const { + if (has_db_id_) { + PutVarint32(dst, kDbId); + PutLengthPrefixedSlice(dst, db_id_); + } if (has_comparator_) { PutVarint32(dst, kComparator); PutLengthPrefixedSlice(dst, comparator_); @@ -123,75 +180,79 @@ bool VersionEdit::EncodeTo(std::string* dst) const { if (!f.smallest.Valid() || !f.largest.Valid()) { return false; } - bool has_customized_fields = false; - if (f.marked_for_compaction || has_min_log_number_to_keep_) { - PutVarint32(dst, kNewFile4); - has_customized_fields = true; - } else if (f.fd.GetPathId() == 0) { - // Use older format to make sure user can roll back the build if they - // don't config multiple DB paths. - PutVarint32(dst, kNewFile2); - } else { - PutVarint32(dst, kNewFile3); - } + PutVarint32(dst, kNewFile4); PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber()); - if (f.fd.GetPathId() != 0 && !has_customized_fields) { - // kNewFile3 - PutVarint32(dst, f.fd.GetPathId()); - } PutVarint64(dst, f.fd.GetFileSize()); PutLengthPrefixedSlice(dst, f.smallest.Encode()); PutLengthPrefixedSlice(dst, f.largest.Encode()); PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno); - if (has_customized_fields) { - // Customized fields' format: - // +-----------------------------+ - // | 1st field's tag (varint32) | - // +-----------------------------+ - // | 1st field's size (varint32) | - // +-----------------------------+ - // | bytes for 1st field | - // | (based on size decoded) | - // +-----------------------------+ - // | | - // | ...... 
| - // | | - // +-----------------------------+ - // | last field's size (varint32)| - // +-----------------------------+ - // | bytes for last field | - // | (based on size decoded) | - // +-----------------------------+ - // | terminating tag (varint32) | - // +-----------------------------+ - // - // Customized encoding for fields: - // tag kPathId: 1 byte as path_id - // tag kNeedCompaction: - // now only can take one char value 1 indicating need-compaction - // - if (f.fd.GetPathId() != 0) { - PutVarint32(dst, CustomTag::kPathId); - char p = static_cast(f.fd.GetPathId()); - PutLengthPrefixedSlice(dst, Slice(&p, 1)); - } - if (f.marked_for_compaction) { - PutVarint32(dst, CustomTag::kNeedCompaction); - char p = static_cast(1); - PutLengthPrefixedSlice(dst, Slice(&p, 1)); - } - if (has_min_log_number_to_keep_ && !min_log_num_written) { - PutVarint32(dst, CustomTag::kMinLogNumberToKeepHack); - std::string varint_log_number; - PutFixed64(&varint_log_number, min_log_number_to_keep_); - PutLengthPrefixedSlice(dst, Slice(varint_log_number)); - min_log_num_written = true; - } - TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", - dst); - - PutVarint32(dst, CustomTag::kTerminate); + // Customized fields' format: + // +-----------------------------+ + // | 1st field's tag (varint32) | + // +-----------------------------+ + // | 1st field's size (varint32) | + // +-----------------------------+ + // | bytes for 1st field | + // | (based on size decoded) | + // +-----------------------------+ + // | | + // | ...... | + // | | + // +-----------------------------+ + // | last field's size (varint32)| + // +-----------------------------+ + // | bytes for last field | + // | (based on size decoded) | + // +-----------------------------+ + // | terminating tag (varint32) | + // +-----------------------------+ + // + // Customized encoding for fields: + // tag kPathId: 1 byte as path_id + // tag kNeedCompaction: + // now only can take one char value 1 indicating need-compaction + // + PutVarint32(dst, CustomTag::kOldestAncesterTime); + std::string varint_oldest_ancester_time; + PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time); + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime", + &varint_oldest_ancester_time); + PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time)); + + PutVarint32(dst, CustomTag::kFileCreationTime); + std::string varint_file_creation_time; + PutVarint64(&varint_file_creation_time, f.file_creation_time); + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime", + &varint_file_creation_time); + PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time)); + + if (f.fd.GetPathId() != 0) { + PutVarint32(dst, CustomTag::kPathId); + char p = static_cast(f.fd.GetPathId()); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } + if (f.marked_for_compaction) { + PutVarint32(dst, CustomTag::kNeedCompaction); + char p = static_cast(1); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); } + if (has_min_log_number_to_keep_ && !min_log_num_written) { + PutVarint32(dst, CustomTag::kMinLogNumberToKeepHack); + std::string varint_log_number; + PutFixed64(&varint_log_number, min_log_number_to_keep_); + PutLengthPrefixedSlice(dst, Slice(varint_log_number)); + min_log_num_written = true; + } + if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { + PutVarint32(dst, CustomTag::kOldestBlobFileNumber); + std::string oldest_blob_file_number; + PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number); + 
PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number)); + } + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", + dst); + + PutVarint32(dst, CustomTag::kTerminate); } // 0 is default and does not need to be explicitly written @@ -278,6 +339,16 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { return "path_id wrong vaue"; } break; + case kOldestAncesterTime: + if (!GetVarint64(&field, &f.oldest_ancester_time)) { + return "invalid oldest ancester time"; + } + break; + case kFileCreationTime: + if (!GetVarint64(&field, &f.file_creation_time)) { + return "invalid file creation time"; + } + break; case kNeedCompaction: if (field.size() != 1) { return "need_compaction field wrong size"; @@ -292,6 +363,11 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { } has_min_log_number_to_keep_ = true; break; + case kOldestBlobFileNumber: + if (!GetVarint64(&field, &f.oldest_blob_file_number)) { + return "invalid oldest blob file number"; + } + break; default: if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) { // Should not proceed if cannot understand it @@ -320,9 +396,16 @@ Status VersionEdit::DecodeFrom(const Slice& src) { FileMetaData f; Slice str; InternalKey key; - while (msg == nullptr && GetVarint32(&input, &tag)) { switch (tag) { + case kDbId: + if (GetLengthPrefixedSlice(&input, &str)) { + db_id_ = str.ToString(); + has_db_id_ = true; + } else { + msg = "db id"; + } + break; case kComparator: if (GetLengthPrefixedSlice(&input, &str)) { comparator_ = str.ToString(); @@ -537,6 +620,10 @@ Status VersionEdit::DecodeFrom(const Slice& src) { std::string VersionEdit::DebugString(bool hex_key) const { std::string r; r.append("VersionEdit {"); + if (has_db_id_) { + r.append("\n DB ID: "); + r.append(db_id_); + } if (has_comparator_) { r.append("\n Comparator: "); r.append(comparator_); @@ -581,6 +668,14 @@ std::string VersionEdit::DebugString(bool hex_key) const { r.append(f.smallest.DebugString(hex_key)); r.append(" .. 
"); r.append(f.largest.DebugString(hex_key)); + if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { + r.append(" blob_file:"); + AppendNumberTo(&r, f.oldest_blob_file_number); + } + r.append(" oldest_ancester_time:"); + AppendNumberTo(&r, f.oldest_ancester_time); + r.append(" file_creation_time:"); + AppendNumberTo(&r, f.file_creation_time); } r.append("\n ColumnFamily: "); AppendNumberTo(&r, column_family_); @@ -608,6 +703,9 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { JSONWriter jw; jw << "EditNumber" << edit_num; + if (has_db_id_) { + jw << "DB ID" << db_id_; + } if (has_comparator_) { jw << "Comparator" << comparator_; } @@ -652,6 +750,9 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { jw << "FileSize" << f.fd.GetFileSize(); jw << "SmallestIKey" << f.smallest.DebugString(hex_key); jw << "LargestIKey" << f.largest.DebugString(hex_key); + if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { + jw << "OldestBlobFile" << f.oldest_blob_file_number; + } jw.EndArrayedObject(); } diff --git a/db/version_edit.h b/db/version_edit.h index ee6499cdc3b..5815d18dca6 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -10,19 +10,23 @@ #pragma once #include #include +#include #include #include -#include -#include "rocksdb/cache.h" #include "db/dbformat.h" -#include "util/arena.h" +#include "memory/arena.h" +#include "rocksdb/cache.h" +#include "table/table_reader.h" #include "util/autovector.h" namespace rocksdb { class VersionSet; -const uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF; +constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF; +constexpr uint64_t kInvalidBlobFileNumber = 0; +constexpr uint64_t kUnknownOldestAncesterTime = 0; +constexpr uint64_t kUnknownFileCreationTime = 0; extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id); @@ -52,6 +56,8 @@ struct FileDescriptor { smallest_seqno(_smallest_seqno), largest_seqno(_largest_seqno) {} + FileDescriptor(const FileDescriptor& fd) { *this = fd; } + FileDescriptor& operator=(const FileDescriptor& fd) { table_reader = fd.table_reader; packed_number_and_path_id = fd.packed_number_and_path_id; @@ -89,7 +95,7 @@ struct FileMetaData { InternalKey largest; // Largest internal key served by table // Needs to be disposed when refs becomes 0. - Cache::Handle* table_reader_handle; + Cache::Handle* table_reader_handle = nullptr; FileSampledStats stats; @@ -98,45 +104,58 @@ struct FileMetaData { // File size compensated by deletion entry. // This is updated in Version::UpdateAccumulatedStats() first time when the // file is created or loaded. After it is updated (!= 0), it is immutable. - uint64_t compensated_file_size; + uint64_t compensated_file_size = 0; // These values can mutate, but they can only be read or written from // single-threaded LogAndApply thread - uint64_t num_entries; // the number of entries. - uint64_t num_deletions; // the number of deletion entries. - uint64_t raw_key_size; // total uncompressed key size. - uint64_t raw_value_size; // total uncompressed value size. - - int refs; // Reference count - - bool being_compacted; // Is this file undergoing compaction? - bool init_stats_from_file; // true if the data-entry stats of this file - // has initialized from file. - - bool marked_for_compaction; // True if client asked us nicely to compact this - // file. 
- - FileMetaData() - : table_reader_handle(nullptr), - compensated_file_size(0), - num_entries(0), - num_deletions(0), - raw_key_size(0), - raw_value_size(0), - refs(0), - being_compacted(false), - init_stats_from_file(false), - marked_for_compaction(false) {} + uint64_t num_entries = 0; // the number of entries. + uint64_t num_deletions = 0; // the number of deletion entries. + uint64_t raw_key_size = 0; // total uncompressed key size. + uint64_t raw_value_size = 0; // total uncompressed value size. + + int refs = 0; // Reference count + + bool being_compacted = false; // Is this file undergoing compaction? + bool init_stats_from_file = false; // true if the data-entry stats of this + // file has initialized from file. + + bool marked_for_compaction = false; // True if client asked us nicely to + // compact this file. + + // Used only in BlobDB. The file number of the oldest blob file this SST file + // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1. + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; + + // The file could be the compaction output from other SST files, which could + // in turn be outputs for compact older SST files. We track the memtable + // flush timestamp for the oldest SST file that eventaully contribute data + // to this file. 0 means the information is not available. + uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; + + // Unix time when the SST file is created. + uint64_t file_creation_time = kUnknownFileCreationTime; + + FileMetaData() = default; + + FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size, + const InternalKey& smallest_key, const InternalKey& largest_key, + const SequenceNumber& smallest_seq, + const SequenceNumber& largest_seq, bool marked_for_compact, + uint64_t oldest_blob_file, uint64_t _oldest_ancester_time, + uint64_t _file_creation_time) + : fd(file, file_path_id, file_size, smallest_seq, largest_seq), + smallest(smallest_key), + largest(largest_key), + marked_for_compaction(marked_for_compact), + oldest_blob_file_number(oldest_blob_file), + oldest_ancester_time(_oldest_ancester_time), + file_creation_time(_file_creation_time) { + TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this); + } // REQUIRED: Keys must be given to the function in sorted order (it expects // the last key to be the largest). - void UpdateBoundaries(const Slice& key, SequenceNumber seqno) { - if (smallest.size() == 0) { - smallest.DecodeFrom(key); - } - largest.DecodeFrom(key); - fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); - fd.largest_seqno = std::max(fd.largest_seqno, seqno); - } + void UpdateBoundaries(const Slice& key, const Slice& value, + SequenceNumber seqno, ValueType value_type); // Unlike UpdateBoundaries, ranges do not need to be presented in any // particular order. @@ -152,6 +171,29 @@ struct FileMetaData { fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); fd.largest_seqno = std::max(fd.largest_seqno, seqno); } + + // Try to get oldest ancester time from the class itself or table properties + // if table reader is already pinned. + // 0 means the information is not available. 
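The comment just above summarizes the lookup order the next two helpers use: prefer the value recorded in the manifest, fall back to the pinned table reader's properties, and otherwise report 0 for "unknown". A simplified stand-alone sketch of that fallback chain follows; TableProps and FileInfo are invented stand-ins, not the RocksDB types.

```cpp
#include <cstdint>
#include <iostream>

constexpr uint64_t kUnknownOldestAncesterTime = 0;

// Invented stand-in for pinned table properties.
struct TableProps {
  uint64_t creation_time = 0;
};

struct FileInfo {
  uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
  const TableProps* pinned_props = nullptr;  // set only when a reader is pinned

  // Same fallback order as FileMetaData::TryGetOldestAncesterTime():
  // 1. the value recorded in the manifest, 2. the pinned table property,
  // 3. the "unknown" sentinel.
  uint64_t TryGetOldestAncesterTime() const {
    if (oldest_ancester_time != kUnknownOldestAncesterTime) {
      return oldest_ancester_time;
    }
    if (pinned_props != nullptr) {
      return pinned_props->creation_time;
    }
    return kUnknownOldestAncesterTime;
  }
};

int main() {
  TableProps props{1600000000};
  FileInfo from_manifest{1500000000, nullptr};
  FileInfo from_props{kUnknownOldestAncesterTime, &props};
  FileInfo unknown{};
  std::cout << from_manifest.TryGetOldestAncesterTime() << "\n";  // 1500000000
  std::cout << from_props.TryGetOldestAncesterTime() << "\n";     // 1600000000
  std::cout << unknown.TryGetOldestAncesterTime() << "\n";        // 0
  return 0;
}
```

TryGetFileCreationTime follows the same pattern with the file_creation_time property, which is what lets TTL and periodic compaction still work when the value was never recorded in the manifest.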
+ uint64_t TryGetOldestAncesterTime() { + if (oldest_ancester_time != kUnknownOldestAncesterTime) { + return oldest_ancester_time; + } else if (fd.table_reader != nullptr && + fd.table_reader->GetTableProperties() != nullptr) { + return fd.table_reader->GetTableProperties()->creation_time; + } + return kUnknownOldestAncesterTime; + } + + uint64_t TryGetFileCreationTime() { + if (file_creation_time != kUnknownFileCreationTime) { + return file_creation_time; + } else if (fd.table_reader != nullptr && + fd.table_reader->GetTableProperties() != nullptr) { + return fd.table_reader->GetTableProperties()->file_creation_time; + } + return kUnknownFileCreationTime; + } }; // A compressed copy of file meta data that just contain minimum data needed @@ -189,6 +231,10 @@ struct LevelFilesBrief { } }; +// The state of a DB at any given time is referred to as a Version. +// Any modification to the Version is considered a Version Edit. A Version is +// constructed by joining a sequence of Version Edits. Version Edits are written +// to the MANIFEST file. class VersionEdit { public: VersionEdit() { Clear(); } @@ -196,6 +242,11 @@ class VersionEdit { void Clear(); + void SetDBId(const std::string& db_id) { + has_db_id_ = true; + db_id_ = db_id; + } + void SetComparatorName(const Slice& name) { has_comparator_ = true; comparator_ = name.ToString(); @@ -225,6 +276,8 @@ class VersionEdit { min_log_number_to_keep_ = num; } + bool has_db_id() { return has_db_id_; } + bool has_log_number() { return has_log_number_; } uint64_t log_number() { return log_number_; } @@ -236,21 +289,20 @@ class VersionEdit { // Add the specified file at the specified number. // REQUIRES: This version has not been saved (see VersionSet::SaveTo) // REQUIRES: "smallest" and "largest" are smallest and largest keys in file + // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file + // referred to by this file if any, kInvalidBlobFileNumber otherwise. 
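The block comment added above VersionEdit states the model this file implements: a Version is the database's file layout at a point in time, and it is rebuilt by joining a sequence of VersionEdits read back from the MANIFEST. The toy replay below illustrates only that joining idea; the types are deliberately simplified and are not the real API.

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <utility>
#include <vector>

// Toy model: a Version is just the set of live file numbers per level.
using Version = std::map<int, std::set<uint64_t>>;

// Toy edit: which files a flush/compaction added and which it removed.
struct Edit {
  std::vector<std::pair<int, uint64_t>> added;    // (level, file number)
  std::vector<std::pair<int, uint64_t>> deleted;  // (level, file number)
};

// Joining a sequence of edits onto a base version yields the current version;
// manifest recovery does the same with the VersionEdits it reads back.
Version Replay(Version base, const std::vector<Edit>& edits) {
  for (const Edit& e : edits) {
    for (const auto& [level, file] : e.deleted) {
      base[level].erase(file);
    }
    for (const auto& [level, file] : e.added) {
      base[level].insert(file);
    }
  }
  return base;
}

int main() {
  Version base{{0, {1, 2}}, {1, {3}}};
  std::vector<Edit> edits{
      {{{1, 4}}, {{0, 1}}},   // add file 4 to L1, drop file 1 from L0
      {{{2, 5}}, {{1, 3}}}};  // add file 5 to L2, drop file 3 from L1
  Version v = Replay(base, edits);
  for (const auto& [level, files] : v) {
    std::cout << "L" << level << ":";
    for (uint64_t f : files) {
      std::cout << " " << f;
    }
    std::cout << "\n";
  }
  return 0;
}
```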
void AddFile(int level, uint64_t file, uint32_t file_path_id, uint64_t file_size, const InternalKey& smallest, const InternalKey& largest, const SequenceNumber& smallest_seqno, - const SequenceNumber& largest_seqno, - bool marked_for_compaction) { + const SequenceNumber& largest_seqno, bool marked_for_compaction, + uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time, + uint64_t file_creation_time) { assert(smallest_seqno <= largest_seqno); - FileMetaData f; - f.fd = FileDescriptor(file, file_path_id, file_size, smallest_seqno, - largest_seqno); - f.smallest = smallest; - f.largest = largest; - f.fd.smallest_seqno = smallest_seqno; - f.fd.largest_seqno = largest_seqno; - f.marked_for_compaction = marked_for_compaction; - new_files_.emplace_back(level, std::move(f)); + new_files_.emplace_back( + level, FileMetaData(file, file_path_id, file_size, smallest, largest, + smallest_seqno, largest_seqno, + marked_for_compaction, oldest_blob_file_number, + oldest_ancester_time, file_creation_time)); } void AddFile(int level, const FileMetaData& f) { @@ -312,14 +364,18 @@ class VersionEdit { std::string DebugString(bool hex_key = false) const; std::string DebugJSON(int edit_num, bool hex_key = false) const; + const std::string GetDbId() { return db_id_; } + private: friend class ReactiveVersionSet; friend class VersionSet; friend class Version; + friend class AtomicGroupReadBuffer; bool GetLevel(Slice* input, int* level, const char** msg); int max_level_; + std::string db_id_; std::string comparator_; uint64_t log_number_; uint64_t prev_log_number_; @@ -328,6 +384,7 @@ class VersionEdit { // The most recent WAL log number that is deleted uint64_t min_log_number_to_keep_; SequenceNumber last_sequence_; + bool has_db_id_; bool has_comparator_; bool has_log_number_; bool has_prev_log_number_; diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 64d1fd77bc1..8a4c1380c1e 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -8,9 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/version_edit.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" #include "util/coding.h" -#include "util/sync_point.h" -#include "util/testharness.h" namespace rocksdb { @@ -36,7 +36,8 @@ TEST_F(VersionEditTest, EncodeDecode) { edit.AddFile(3, kBig + 300 + i, kBig32Bit + 400 + i, 0, InternalKey("foo", kBig + 500 + i, kTypeValue), InternalKey("zoo", kBig + 600 + i, kTypeDeletion), - kBig + 500 + i, kBig + 600 + i, false); + kBig + 500 + i, kBig + 600 + i, false, kInvalidBlobFileNumber, + 888, 678); edit.DeleteFile(4, kBig + 700 + i); } @@ -53,13 +54,20 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) { VersionEdit edit; edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, - kBig + 600, true); + kBig + 600, true, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, - kBig + 601, false); + kBig + 601, false, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue), InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502, - kBig + 602, true); + kBig + 602, true, kInvalidBlobFileNumber, 666, 888); + edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex), + InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503, + kBig + 603, true, 1001, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); + ; edit.DeleteFile(4, 700); @@ -78,9 +86,18 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) { ASSERT_TRUE(new_files[0].second.marked_for_compaction); ASSERT_TRUE(!new_files[1].second.marked_for_compaction); ASSERT_TRUE(new_files[2].second.marked_for_compaction); - ASSERT_EQ(3, new_files[0].second.fd.GetPathId()); - ASSERT_EQ(3, new_files[1].second.fd.GetPathId()); - ASSERT_EQ(0, new_files[2].second.fd.GetPathId()); + ASSERT_TRUE(new_files[3].second.marked_for_compaction); + ASSERT_EQ(3u, new_files[0].second.fd.GetPathId()); + ASSERT_EQ(3u, new_files[1].second.fd.GetPathId()); + ASSERT_EQ(0u, new_files[2].second.fd.GetPathId()); + ASSERT_EQ(0u, new_files[3].second.fd.GetPathId()); + ASSERT_EQ(kInvalidBlobFileNumber, + new_files[0].second.oldest_blob_file_number); + ASSERT_EQ(kInvalidBlobFileNumber, + new_files[1].second.oldest_blob_file_number); + ASSERT_EQ(kInvalidBlobFileNumber, + new_files[2].second.oldest_blob_file_number); + ASSERT_EQ(1001, new_files[3].second.oldest_blob_file_number); } TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { @@ -88,10 +105,11 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { VersionEdit edit; edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, - kBig + 600, true); + kBig + 600, true, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, - kBig + 601, false); + kBig + 601, false, kInvalidBlobFileNumber, 686, 868); edit.DeleteFile(4, 700); edit.SetComparatorName("foo"); @@ -127,8 +145,8 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { auto& new_files = parsed.GetNewFiles(); ASSERT_TRUE(new_files[0].second.marked_for_compaction); ASSERT_TRUE(!new_files[1].second.marked_for_compaction); - ASSERT_EQ(3, 
new_files[0].second.fd.GetPathId()); - ASSERT_EQ(3, new_files[1].second.fd.GetPathId()); + ASSERT_EQ(3u, new_files[0].second.fd.GetPathId()); + ASSERT_EQ(3u, new_files[1].second.fd.GetPathId()); ASSERT_EQ(1u, parsed.GetDeletedFiles().size()); } @@ -137,7 +155,8 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) { VersionEdit edit; edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, - kBig + 600, true); + kBig + 600, true, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); edit.SetComparatorName("foo"); edit.SetLogNumber(kBig + 100); @@ -164,7 +183,9 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) { TEST_F(VersionEditTest, EncodeEmptyFile) { VersionEdit edit; - edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false); + edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); std::string buffer; ASSERT_TRUE(!edit.EncodeTo(&buffer)); } @@ -239,6 +260,16 @@ TEST_F(VersionEditTest, IgnorableField) { ASSERT_EQ(88, ve.next_file_number()); } +TEST_F(VersionEditTest, DbId) { + VersionEdit edit; + edit.SetDBId("ab34-cd12-435f-er00"); + TestEncodeDecode(edit); + + edit.Clear(); + edit.SetDBId("34ba-cd12-435f-er01"); + TestEncodeDecode(edit); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/version_set.cc b/db/version_set.cc index fdc07fee0e5..444996e409d 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -9,21 +9,17 @@ #include "db/version_set.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include #include #include +#include #include #include #include #include #include #include -#include "db/compaction.h" +#include "compaction/compaction.h" #include "db/internal_stats.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -33,8 +29,13 @@ #include "db/pinned_iterators_manager.h" #include "db/table_cache.h" #include "db/version_builder.h" +#include "file/filename.h" +#include "file/random_access_file_reader.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" #include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" +#include "monitoring/persistent_stats_history.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/write_buffer_manager.h" @@ -44,15 +45,13 @@ #include "table/merging_iterator.h" #include "table/meta_blocks.h" #include "table/multiget_context.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/table_reader.h" #include "table/two_level_iterator.h" +#include "test_util/sync_point.h" #include "util/coding.h" -#include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" #include "util/user_comparator_wrapper.h" namespace rocksdb { @@ -93,7 +92,8 @@ Status OverlapWithIterator(const Comparator* ucmp, return Status::Corruption("DB have corrupted keys"); } - if (ucmp->Compare(seek_result.user_key, largest_user_key) <= 0) { + if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <= + 0) { *overlap = true; } } @@ -171,17 +171,16 @@ class FilePicker { // Check if key is within a file's range. If search left bound and // right bound point to the same find, we are sure key falls in // range. 
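The comment above spells out the invariant FilePicker relies on: once the binary-search bounds collapse to a single file, the key is guaranteed to fall in that file's range, and the two comparisons against the file's smallest and largest keys now go through CompareWithoutTimestamp so a user-defined timestamp suffix cannot affect file selection. A rough illustration of that classification step, with a fixed-width suffix standing in for the timestamp:

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>

// Stand-in for Comparator::CompareWithoutTimestamp(): user keys carry a
// fixed-width timestamp suffix that must not influence file selection.
int CompareWithoutTimestamp(const std::string& a, const std::string& b,
                            size_t ts_width) {
  const std::string a_key = a.substr(0, a.size() - std::min(ts_width, a.size()));
  const std::string b_key = b.substr(0, b.size() - std::min(ts_width, b.size()));
  return a_key.compare(b_key);
}

// Classify a lookup key against one file's [smallest, largest] user-key range:
// -1 before the file, 0 inside it, +1 after it. FilePicker derives its
// cmp_smallest / cmp_largest decisions (and the next level's search bounds)
// from the same two comparisons.
int ClassifyKey(const std::string& user_key, const std::string& smallest,
                const std::string& largest, size_t ts_width) {
  if (CompareWithoutTimestamp(user_key, smallest, ts_width) < 0) return -1;
  if (CompareWithoutTimestamp(user_key, largest, ts_width) > 0) return +1;
  return 0;
}

int main() {
  // Two-character suffixes ("t1", "t9", ...) play the role of timestamps.
  std::cout << ClassifyKey("bananat9", "applet1", "cherryt1", 2) << "\n";  // 0: in range
  std::cout << ClassifyKey("zebrat1", "applet1", "cherryt1", 2) << "\n";   // 1: after the file
  std::cout << ClassifyKey("aaat1", "applet1", "cherryt1", 2) << "\n";     // -1: before the file
  return 0;
}
```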
- assert( - curr_level_ == 0 || - curr_index_in_curr_level_ == start_index_in_curr_level_ || - user_comparator_->Compare(user_key_, - ExtractUserKey(f->smallest_key)) <= 0); - - int cmp_smallest = user_comparator_->Compare(user_key_, - ExtractUserKey(f->smallest_key)); + assert(curr_level_ == 0 || + curr_index_in_curr_level_ == start_index_in_curr_level_ || + user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->smallest_key)) <= 0); + + int cmp_smallest = user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->smallest_key)); if (cmp_smallest >= 0) { - cmp_largest = user_comparator_->Compare(user_key_, - ExtractUserKey(f->largest_key)); + cmp_largest = user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->largest_key)); } // Setup file search bound for the next level based on the @@ -353,7 +352,7 @@ class FilePickerMultiGet { struct FilePickerContext; public: - FilePickerMultiGet(std::vector* files, MultiGetRange* range, + FilePickerMultiGet(MultiGetRange* range, autovector* file_levels, unsigned int num_levels, FileIndexer* file_indexer, const Comparator* user_comparator, @@ -368,18 +367,12 @@ class FilePickerMultiGet { maybe_repeat_key_(false), current_level_range_(*range, range->begin(), range->end()), current_file_range_(*range, range->begin(), range->end()), -#ifndef NDEBUG - files_(files), -#endif level_files_brief_(file_levels), is_hit_file_last_in_level_(false), curr_file_level_(nullptr), file_indexer_(file_indexer), user_comparator_(user_comparator), internal_comparator_(internal_comparator) { -#ifdef NDEBUG - (void)files; -#endif for (auto iter = range_->begin(); iter != range_->end(); ++iter) { fp_ctx_array_[iter.index()] = FilePickerContext(0, FileIndexer::kLevelMaxIndex); @@ -416,6 +409,18 @@ class FilePickerMultiGet { bool file_hit = false; int cmp_largest = -1; if (curr_file_index >= curr_file_level_->num_files) { + // In the unlikely case the next key is a duplicate of the current key, + // and the current key is the last in the level and the internal key + // was not found, we need to skip lookup for the remaining keys and + // reset the search bounds + if (batch_iter_ != current_level_range_.end()) { + ++batch_iter_; + for (; batch_iter_ != current_level_range_.end(); ++batch_iter_) { + struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()]; + fp_ctx.search_left_bound = 0; + fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex; + } + } return false; } // Loops over keys in the MultiGet batch until it finds a file with @@ -473,25 +478,6 @@ class FilePickerMultiGet { } else { file_hit = true; } -#ifndef NDEBUG - // Sanity check to make sure that the files are correctly sorted - if (f != prev_file_) { - if (prev_file_) { - if (curr_level_ != 0) { - int comp_sign = internal_comparator_->Compare( - prev_file_->largest_key, f->smallest_key); - assert(comp_sign < 0); - } else if (fp_ctx.curr_index_in_curr_level > 0) { - // level == 0, the current file cannot be newer than the previous - // one. Use compressed data structure, has no attribute seqNo - assert(!NewestFirstBySeqNo( - files_[0][fp_ctx.curr_index_in_curr_level], - files_[0][fp_ctx.curr_index_in_curr_level - 1])); - } - } - prev_file_ = f; - } -#endif if (cmp_largest == 0) { // cmp_largest is 0, which means the next key will not be in this // file, so stop looking further. Also don't increment megt_iter_ @@ -533,7 +519,10 @@ class FilePickerMultiGet { // any further for that key, so advance batch_iter_. 
Else, keep // batch_iter_ positioned on that key so we look it up again in // the next file - if (current_level_range_.CheckKeyDone(batch_iter_)) { + // For L0, always advance the key because we will look in the next + // file regardless for all keys not found yet + if (current_level_range_.CheckKeyDone(batch_iter_) || + curr_level_ == 0) { ++batch_iter_; } } @@ -601,7 +590,8 @@ class FilePickerMultiGet { unsigned int start_index_in_curr_level; FilePickerContext(int32_t left, int32_t right) - : search_left_bound(left), search_right_bound(right) {} + : search_left_bound(left), search_right_bound(right), + curr_index_in_curr_level(0), start_index_in_curr_level(0) {} FilePickerContext() = default; }; @@ -619,9 +609,6 @@ class FilePickerMultiGet { bool maybe_repeat_key_; MultiGetRange current_level_range_; MultiGetRange current_file_range_; -#ifndef NDEBUG - std::vector* files_; -#endif autovector* level_files_brief_; bool search_ended_; bool is_hit_file_last_in_level_; @@ -629,9 +616,6 @@ class FilePickerMultiGet { FileIndexer* file_indexer_; const Comparator* user_comparator_; const InternalKeyComparator* internal_comparator_; -#ifndef NDEBUG - FdWithKeyRange* prev_file_; -#endif // Setup local variables to search next level. // Returns false if there are no more levels to search. @@ -640,9 +624,6 @@ class FilePickerMultiGet { MultiGetRange::Iterator mget_iter = current_level_range_.begin(); if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level < curr_file_level_->num_files) { -#ifndef NDEBUG - prev_file_ = nullptr; -#endif batch_iter_prev_ = current_level_range_.begin(); batch_iter_ = current_level_range_.begin(); return true; @@ -738,9 +719,6 @@ class FilePickerMultiGet { fp_ctx.curr_index_in_curr_level = start_index; } if (level_contains_keys) { -#ifndef NDEBUG - prev_file_ = nullptr; -#endif batch_iter_prev_ = current_level_range_.begin(); batch_iter_ = current_level_range_.begin(); return true; @@ -820,14 +798,16 @@ static bool AfterFile(const Comparator* ucmp, const Slice* user_key, const FdWithKeyRange* f) { // nullptr user_key occurs before all keys and is therefore never after *f return (user_key != nullptr && - ucmp->Compare(*user_key, ExtractUserKey(f->largest_key)) > 0); + ucmp->CompareWithoutTimestamp(*user_key, + ExtractUserKey(f->largest_key)) > 0); } static bool BeforeFile(const Comparator* ucmp, const Slice* user_key, const FdWithKeyRange* f) { // nullptr user_key occurs after all keys and is therefore never before *f return (user_key != nullptr && - ucmp->Compare(*user_key, ExtractUserKey(f->smallest_key)) < 0); + ucmp->CompareWithoutTimestamp(*user_key, + ExtractUserKey(f->smallest_key)) < 0); } bool SomeFileOverlapsRange( @@ -872,24 +852,26 @@ namespace { class LevelIterator final : public InternalIterator { public: - LevelIterator( - TableCache* table_cache, const ReadOptions& read_options, - const EnvOptions& env_options, const InternalKeyComparator& icomparator, - const LevelFilesBrief* flevel, const SliceTransform* prefix_extractor, - bool should_sample, HistogramImpl* file_read_hist, bool for_compaction, - bool skip_filters, int level, RangeDelAggregator* range_del_agg, - const std::vector* compaction_boundaries = - nullptr) + LevelIterator(TableCache* table_cache, const ReadOptions& read_options, + const EnvOptions& env_options, + const InternalKeyComparator& icomparator, + const LevelFilesBrief* flevel, + const SliceTransform* prefix_extractor, bool should_sample, + HistogramImpl* file_read_hist, TableReaderCaller caller, + bool skip_filters, int level, 
RangeDelAggregator* range_del_agg, + const std::vector* + compaction_boundaries = nullptr) : table_cache_(table_cache), read_options_(read_options), env_options_(env_options), icomparator_(icomparator), user_comparator_(icomparator.user_comparator()), flevel_(flevel), - prefix_extractor_(prefix_extractor), + prefix_extractor_(read_options.total_order_seek ? nullptr + : prefix_extractor), file_read_hist_(file_read_hist), should_sample_(should_sample), - for_compaction_(for_compaction), + caller_(caller), skip_filters_(skip_filters), file_index_(flevel_->num_files), level_(level), @@ -907,7 +889,7 @@ class LevelIterator final : public InternalIterator { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(Slice* ret_key) override; + bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { return file_iter_.Valid(); } @@ -915,30 +897,46 @@ class LevelIterator final : public InternalIterator { assert(Valid()); return file_iter_.key(); } + Slice value() const override { assert(Valid()); return file_iter_.value(); } + Status status() const override { return file_iter_.iter() ? file_iter_.status() : Status::OK(); } + + inline bool MayBeOutOfLowerBound() override { + assert(Valid()); + return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound(); + } + + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return file_iter_.MayBeOutOfUpperBound(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; if (file_iter_.iter()) { file_iter_.SetPinnedItersMgr(pinned_iters_mgr); } } + bool IsKeyPinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsKeyPinned(); } + bool IsValuePinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsValuePinned(); } private: - void SkipEmptyFileForward(); + // Return true if at least one invalid file is seen and skipped. + bool SkipEmptyFileForward(); void SkipEmptyFileBackward(); void SetFileIterator(InternalIterator* iter); void InitFileIterator(size_t new_file_index); @@ -957,8 +955,9 @@ class LevelIterator final : public InternalIterator { bool KeyReachedUpperBound(const Slice& internal_key) { return read_options_.iterate_upper_bound != nullptr && - user_comparator_.Compare(ExtractUserKey(internal_key), - *read_options_.iterate_upper_bound) >= 0; + user_comparator_.CompareWithoutTimestamp( + ExtractUserKey(internal_key), + *read_options_.iterate_upper_bound) >= 0; } InternalIterator* NewFileIterator() { @@ -974,12 +973,27 @@ class LevelIterator final : public InternalIterator { smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest; largest_compaction_key = (*compaction_boundaries_)[file_index_].largest; } + CheckMayBeOutOfLowerBound(); return table_cache_->NewIterator( read_options_, env_options_, icomparator_, *file_meta.file_metadata, range_del_agg_, prefix_extractor_, - nullptr /* don't need reference to table */, - file_read_hist_, for_compaction_, nullptr /* arena */, skip_filters_, - level_, smallest_compaction_key, largest_compaction_key); + nullptr /* don't need reference to table */, file_read_hist_, caller_, + /*arena=*/nullptr, skip_filters_, level_, smallest_compaction_key, + largest_compaction_key); + } + + // Check if current file being fully within iterate_lower_bound. 
+ // + // Note MyRocks may update iterate bounds between seek. To workaround it, + // we need to check and update may_be_out_of_lower_bound_ accordingly. + void CheckMayBeOutOfLowerBound() { + if (read_options_.iterate_lower_bound != nullptr && + file_index_ < flevel_->num_files) { + may_be_out_of_lower_bound_ = + user_comparator_.Compare( + ExtractUserKey(file_smallest_key(file_index_)), + *read_options_.iterate_lower_bound) < 0; + } } TableCache* table_cache_; @@ -993,8 +1007,9 @@ class LevelIterator final : public InternalIterator { HistogramImpl* file_read_hist_; bool should_sample_; - bool for_compaction_; + TableReaderCaller caller_; bool skip_filters_; + bool may_be_out_of_lower_bound_ = true; size_t file_index_; int level_; RangeDelAggregator* range_del_agg_; @@ -1007,13 +1022,55 @@ class LevelIterator final : public InternalIterator { }; void LevelIterator::Seek(const Slice& target) { - size_t new_file_index = FindFile(icomparator_, *flevel_, target); + // Check whether the seek key fall under the same file + bool need_to_reseek = true; + if (file_iter_.iter() != nullptr && file_index_ < flevel_->num_files) { + const FdWithKeyRange& cur_file = flevel_->files[file_index_]; + if (icomparator_.InternalKeyComparator::Compare( + target, cur_file.largest_key) <= 0 && + icomparator_.InternalKeyComparator::Compare( + target, cur_file.smallest_key) >= 0) { + need_to_reseek = false; + assert(static_cast(FindFile(icomparator_, *flevel_, target)) == + file_index_); + } + } + if (need_to_reseek) { + TEST_SYNC_POINT("LevelIterator::Seek:BeforeFindFile"); + size_t new_file_index = FindFile(icomparator_, *flevel_, target); + InitFileIterator(new_file_index); + } - InitFileIterator(new_file_index); if (file_iter_.iter() != nullptr) { file_iter_.Seek(target); } - SkipEmptyFileForward(); + if (SkipEmptyFileForward() && prefix_extractor_ != nullptr && + file_iter_.iter() != nullptr && file_iter_.Valid()) { + // We've skipped the file we initially positioned to. In the prefix + // seek case, it is likely that the file is skipped because of + // prefix bloom or hash, where more keys are skipped. We then check + // the current key and invalidate the iterator if the prefix is + // already passed. + // When doing prefix iterator seek, when keys for one prefix have + // been exhausted, it can jump to any key that is larger. Here we are + // enforcing a stricter contract than that, in order to make it easier for + // higher layers (merging and DB iterator) to reason the correctness: + // 1. Within the prefix, the result should be accurate. + // 2. If keys for the prefix is exhausted, it is either positioned to the + // next key after the prefix, or make the iterator invalid. + // A side benefit will be that it invalidates the iterator earlier so that + // the upper level merging iterator can merge fewer child iterators. 
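The comment above sets out the contract after a prefix seek lands past skipped files: results within the prefix must be exact, and once the prefix is exhausted the iterator either stands on the next key after the prefix or goes invalid, so the merging iterator above has less work to do. The toy check below captures just the "different prefix, so invalidate" decision, with a fixed-length prefix standing in for the configured SliceTransform.

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>

// Fixed-length prefix as a stand-in for the configured SliceTransform.
std::string Prefix(const std::string& key, size_t len = 3) {
  return key.substr(0, std::min(len, key.size()));
}

// After skipping filtered/empty files, decide whether the key the iterator
// landed on still shares the target's prefix; if not, the level iterator
// invalidates itself instead of surfacing a key from an unrelated prefix.
bool ShouldInvalidate(const std::string& target, const std::string& landed) {
  return Prefix(target) != Prefix(landed);
}

int main() {
  std::cout << ShouldInvalidate("abc123", "abc999") << "\n";  // 0: same prefix, keep it
  std::cout << ShouldInvalidate("abc123", "abd000") << "\n";  // 1: prefix exhausted, invalidate
  return 0;
}
```

Invalidating early is safe because the contract only promises exact results within the prefix, and it lets the upper-level merging iterator drop this child sooner.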
+ Slice target_user_key = ExtractUserKey(target); + Slice file_user_key = ExtractUserKey(file_iter_.key()); + if (prefix_extractor_->InDomain(target_user_key) && + (!prefix_extractor_->InDomain(file_user_key) || + user_comparator_.Compare( + prefix_extractor_->Transform(target_user_key), + prefix_extractor_->Transform(file_user_key)) != 0)) { + SetFileIterator(nullptr); + } + } + CheckMayBeOutOfLowerBound(); } void LevelIterator::SeekForPrev(const Slice& target) { @@ -1027,6 +1084,7 @@ void LevelIterator::SeekForPrev(const Slice& target) { file_iter_.SeekForPrev(target); SkipEmptyFileBackward(); } + CheckMayBeOutOfLowerBound(); } void LevelIterator::SeekToFirst() { @@ -1035,6 +1093,7 @@ void LevelIterator::SeekToFirst() { file_iter_.SeekToFirst(); } SkipEmptyFileForward(); + CheckMayBeOutOfLowerBound(); } void LevelIterator::SeekToLast() { @@ -1043,15 +1102,17 @@ void LevelIterator::SeekToLast() { file_iter_.SeekToLast(); } SkipEmptyFileBackward(); + CheckMayBeOutOfLowerBound(); } void LevelIterator::Next() { NextImpl(); } -bool LevelIterator::NextAndGetResult(Slice* ret_key) { +bool LevelIterator::NextAndGetResult(IterateResult* result) { NextImpl(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); } return is_valid; } @@ -1062,25 +1123,28 @@ void LevelIterator::Prev() { SkipEmptyFileBackward(); } -void LevelIterator::SkipEmptyFileForward() { +bool LevelIterator::SkipEmptyFileForward() { + bool seen_empty_file = false; while (file_iter_.iter() == nullptr || (!file_iter_.Valid() && file_iter_.status().ok() && !file_iter_.iter()->IsOutOfBound())) { + seen_empty_file = true; // Move to next file if (file_index_ >= flevel_->num_files - 1) { // Already at the last file SetFileIterator(nullptr); - return; + break; } if (KeyReachedUpperBound(file_smallest_key(file_index_ + 1))) { SetFileIterator(nullptr); - return; + break; } InitFileIterator(file_index_ + 1); if (file_iter_.iter() != nullptr) { file_iter_.SeekToFirst(); } } + return seen_empty_file; } void LevelIterator::SkipEmptyFileBackward() { @@ -1198,8 +1262,7 @@ Status Version::GetTableProperties(std::shared_ptr* tp, new RandomAccessFileReader( std::move(file), file_name, nullptr /* env */, nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */, - nullptr /* rate_limiter */, false /* for_compaction*/, - ioptions->listeners)); + nullptr /* rate_limiter */, ioptions->listeners)); s = ReadTableProperties( file_reader.get(), file_meta->fd.GetFileSize(), Footer::kInvalidTableMagicNumber /* table's magic number */, *ioptions, @@ -1225,6 +1288,60 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { return Status::OK(); } +Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, + std::string* out_str) { + if (max_entries_to_print <= 0) { + return Status::OK(); + } + int num_entries_left = max_entries_to_print; + + std::stringstream ss; + + for (int level = 0; level < storage_info_.num_levels_; level++) { + for (const auto& file_meta : storage_info_.files_[level]) { + auto fname = + TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(), + file_meta->fd.GetPathId()); + + ss << "=== file : " << fname << " ===\n"; + + TableCache* table_cache = cfd_->table_cache(); + std::unique_ptr tombstone_iter; + + Status s = table_cache->GetRangeTombstoneIterator( + ReadOptions(), cfd_->internal_comparator(), *file_meta, + &tombstone_iter); + if (!s.ok()) { + return s; + } + if (tombstone_iter) { + 
tombstone_iter->SeekToFirst(); + + while (tombstone_iter->Valid() && num_entries_left > 0) { + ss << "start: " << tombstone_iter->start_key().ToString(true) + << " end: " << tombstone_iter->end_key().ToString(true) + << " seq: " << tombstone_iter->seq() << '\n'; + tombstone_iter->Next(); + num_entries_left--; + } + if (num_entries_left <= 0) { + break; + } + } + } + if (num_entries_left <= 0) { + break; + } + } + assert(num_entries_left >= 0); + if (num_entries_left <= 0) { + ss << "(results may not be complete)\n"; + } + + *out_str = ss.str(); + return Status::OK(); +} + Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props, int level) { for (const auto& file_meta : storage_info_.files_[level]) { @@ -1335,16 +1452,15 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { assert(!ioptions->cf_paths.empty()); file_path = ioptions->cf_paths.back().path; } + const uint64_t file_number = file->fd.GetNumber(); files.emplace_back(SstFileMetaData{ - MakeTableFileName("", file->fd.GetNumber()), - file_path, - static_cast(file->fd.GetFileSize()), - file->fd.smallest_seqno, - file->fd.largest_seqno, - file->smallest.user_key().ToString(), + MakeTableFileName("", file_number), file_number, file_path, + static_cast(file->fd.GetFileSize()), file->fd.smallest_seqno, + file->fd.largest_seqno, file->smallest.user_key().ToString(), file->largest.user_key().ToString(), file->stats.num_reads_sampled.load(std::memory_order_relaxed), - file->being_compacted}); + file->being_compacted, file->oldest_blob_file_number, + file->TryGetOldestAncesterTime(), file->TryGetFileCreationTime()}); files.back().num_entries = file->num_entries; files.back().num_deletions = file->num_deletions; level_size += file->fd.GetFileSize(); @@ -1365,6 +1481,24 @@ uint64_t Version::GetSstFilesSize() { return sst_files_size; } +void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) { + uint64_t oldest_time = port::kMaxUint64; + for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) { + for (FileMetaData* meta : storage_info_.LevelFiles(level)) { + assert(meta->fd.table_reader != nullptr); + uint64_t file_creation_time = meta->TryGetFileCreationTime(); + if (file_creation_time == kUnknownFileCreationTime) { + *creation_time = 0; + return; + } + if (file_creation_time < oldest_time) { + oldest_time = file_creation_time; + } + } + } + *creation_time = oldest_time; +} + uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const { // Estimation will be inaccurate when: // (1) there exist merge keys @@ -1446,10 +1580,14 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) { const auto& file = storage_info_.LevelFilesBrief(0).files[i]; merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator( - read_options, soptions, cfd_->internal_comparator(), *file.file_metadata, - range_del_agg, mutable_cf_options_.prefix_extractor.get(), nullptr, - cfd_->internal_stats()->GetFileReadHist(0), false, arena, - false /* skip_filters */, 0 /* level */)); + read_options, soptions, cfd_->internal_comparator(), + *file.file_metadata, range_del_agg, + mutable_cf_options_.prefix_extractor.get(), nullptr, + cfd_->internal_stats()->GetFileReadHist(0), + TableReaderCaller::kUserIterator, arena, + /*skip_filters=*/false, /*level=*/0, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); } if (should_sample) { // Count ones for every L0 files. 
This is done per iterator creation @@ -1470,8 +1608,8 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), - false /* for_compaction */, IsFilterSkipped(level), level, - range_del_agg)); + TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, + range_del_agg, /*largest_compaction_key=*/nullptr)); } } @@ -1500,10 +1638,14 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, continue; } ScopedArenaIterator iter(cfd_->table_cache()->NewIterator( - read_options, env_options, cfd_->internal_comparator(), *file->file_metadata, - &range_del_agg, mutable_cf_options_.prefix_extractor.get(), nullptr, - cfd_->internal_stats()->GetFileReadHist(0), false, &arena, - false /* skip_filters */, 0 /* level */)); + read_options, env_options, cfd_->internal_comparator(), + *file->file_metadata, &range_del_agg, + mutable_cf_options_.prefix_extractor.get(), nullptr, + cfd_->internal_stats()->GetFileReadHist(0), + TableReaderCaller::kUserIterator, &arena, + /*skip_filters=*/false, /*level=*/0, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); status = OverlapWithIterator( ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); if (!status.ok() || *overlap) { @@ -1517,7 +1659,7 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), - false /* for_compaction */, IsFilterSkipped(level), level, + TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, &range_del_agg)); status = OverlapWithIterator( ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); @@ -1611,7 +1753,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, bool* value_found, bool* key_exists, SequenceNumber* seq, ReadCallback* callback, - bool* is_blob) { + bool* is_blob, bool do_merge) { Slice ikey = k.internal_key(); Slice user_key = k.user_key(); @@ -1623,11 +1765,18 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } PinnedIteratorsManager pinned_iters_mgr; + uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId; + if (vset_ && vset_->block_cache_tracer_ && + vset_->block_cache_tracer_->is_tracing_enabled()) { + tracing_get_id = vset_->block_cache_tracer_->NextGetId(); + } GetContext get_context( user_comparator(), merge_operator_, info_log_, db_statistics_, status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, - value, value_found, merge_context, max_covering_tombstone_seq, this->env_, - seq, merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob); + do_merge ? value : nullptr, value_found, merge_context, do_merge, + max_covering_tombstone_seq, this->env_, seq, + merge_operator_ ? 
&pinned_iters_mgr : nullptr, callback, is_blob, + tracing_get_id); // Pin blocks that we read to hold merge operands if (merge_operator_) { @@ -1691,7 +1840,8 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } else if (fp.GetHitFileLevel() >= 2) { RecordTick(db_statistics_, GET_HIT_L2_AND_UP); } - PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, fp.GetHitFileLevel()); + PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, + fp.GetHitFileLevel()); return; case GetContext::kDeleted: // Use empty error message for speed @@ -1709,11 +1859,14 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } f = fp.GetNextFile(); } - if (db_statistics_ != nullptr) { get_context.ReportCounters(); } if (GetContext::kMerge == get_context.State()) { + if (!do_merge) { + *status = Status::OK(); + return; + } if (!merge_operator_) { *status = Status::InvalidArgument( "merge_operator is not properly initialized."); @@ -1745,7 +1898,12 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, if (merge_operator_) { pinned_iters_mgr.StartPinning(); } + uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; + if (vset_ && vset_->block_cache_tracer_ && + vset_->block_cache_tracer_->is_tracing_enabled()) { + tracing_mget_id = vset_->block_cache_tracer_->NextGetId(); + } // Even though we know the batch size won't be > MAX_BATCH_SIZE, // use autovector in order to avoid unnecessary construction of GetContext // objects, which is expensive @@ -1755,15 +1913,20 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, get_ctx.emplace_back( user_comparator(), merge_operator_, info_log_, db_statistics_, iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge, iter->ukey, - iter->value, nullptr, &(iter->merge_context), - &iter->max_covering_tombstone_seq, this->env_, &iter->seq, - merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob); - iter->get_context = &get_ctx.back(); + iter->value, nullptr, &(iter->merge_context), true, + &iter->max_covering_tombstone_seq, this->env_, nullptr, + merge_operator_ ? 
&pinned_iters_mgr : nullptr, callback, is_blob, + tracing_mget_id); + } + int get_ctx_index = 0; + for (auto iter = range->begin(); iter != range->end(); + ++iter, get_ctx_index++) { + iter->get_context = &(get_ctx[get_ctx_index]); } MultiGetRange file_picker_range(*range, range->begin(), range->end()); FilePickerMultiGet fp( - storage_info_.files_, &file_picker_range, + &file_picker_range, &storage_info_.level_files_brief_, storage_info_.num_non_empty_levels_, &storage_info_.file_indexer_, user_comparator(), internal_comparator()); FdWithKeyRange* f = fp.GetNextFile(); @@ -2167,13 +2330,11 @@ uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions, auto status = ioptions.env->GetCurrentTime(&_current_time); if (status.ok()) { const uint64_t current_time = static_cast(_current_time); - for (auto f : files) { - if (!f->being_compacted && f->fd.table_reader != nullptr && - f->fd.table_reader->GetTableProperties() != nullptr) { - auto creation_time = - f->fd.table_reader->GetTableProperties()->creation_time; - if (creation_time > 0 && - creation_time < (current_time - mutable_cf_options.ttl)) { + for (FileMetaData* f : files) { + if (!f->being_compacted) { + uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime(); + if (oldest_ancester_time != 0 && + oldest_ancester_time < (current_time - mutable_cf_options.ttl)) { ttl_expired_files_count++; } } @@ -2325,12 +2486,11 @@ void VersionStorageInfo::ComputeExpiredTtlFiles( const uint64_t current_time = static_cast(_current_time); for (int level = 0; level < num_levels() - 1; level++) { - for (auto f : files_[level]) { - if (!f->being_compacted && f->fd.table_reader != nullptr && - f->fd.table_reader->GetTableProperties() != nullptr) { - auto creation_time = - f->fd.table_reader->GetTableProperties()->creation_time; - if (creation_time > 0 && creation_time < (current_time - ttl)) { + for (FileMetaData* f : files_[level]) { + if (!f->being_compacted) { + uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime(); + if (oldest_ancester_time > 0 && + oldest_ancester_time < (current_time - ttl)) { expired_ttl_files_.emplace_back(level, f); } } @@ -2351,26 +2511,30 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( return; } const uint64_t current_time = static_cast(temp_current_time); + + // If periodic_compaction_seconds is larger than current time, periodic + // compaction can't possibly be triggered. + if (periodic_compaction_seconds > current_time) { + return; + } + const uint64_t allowed_time_limit = current_time - periodic_compaction_seconds; for (int level = 0; level < num_levels(); level++) { for (auto f : files_[level]) { - if (!f->being_compacted && f->fd.table_reader != nullptr && - f->fd.table_reader->GetTableProperties() != nullptr) { + if (!f->being_compacted) { // Compute a file's modification time in the following order: // 1. Use file_creation_time table property if it is > 0. // 2. Use creation_time table property if it is > 0. // 3. Use file's mtime metadata if the above two table properties are 0. // Don't consider the file at all if the modification time cannot be // correctly determined based on the above conditions. 
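The comment above lists the order in which a file's "modification time" is resolved for periodic compaction; in the new code that becomes TryGetFileCreationTime, then TryGetOldestAncesterTime, then the file's mtime on disk, and the whole pass returns early when periodic_compaction_seconds exceeds the current time because no file could possibly qualify. A compressed sketch of that decision follows; mtime_lookup is a placeholder for Env::GetFileModificationTime, and the struct is an invented stand-in.

```cpp
#include <cstdint>
#include <functional>
#include <iostream>

constexpr uint64_t kUnknown = 0;

// Invented stand-in for the two timestamps recorded on a file.
struct FileTimes {
  uint64_t file_creation_time = kUnknown;
  uint64_t oldest_ancester_time = kUnknown;
};

// mtime_lookup stands in for Env::GetFileModificationTime().
bool MarkedForPeriodicCompaction(const FileTimes& f, uint64_t now,
                                 uint64_t periodic_compaction_seconds,
                                 const std::function<uint64_t()>& mtime_lookup) {
  if (periodic_compaction_seconds > now) {
    return false;  // the allowed window would underflow; nothing can qualify
  }
  const uint64_t allowed_time_limit = now - periodic_compaction_seconds;
  uint64_t mod_time = f.file_creation_time;  // 1. recorded creation time
  if (mod_time == kUnknown) {
    mod_time = f.oldest_ancester_time;       // 2. oldest ancestor time
  }
  if (mod_time == kUnknown) {
    mod_time = mtime_lookup();               // 3. on-disk mtime as a last resort
  }
  return mod_time > 0 && mod_time < allowed_time_limit;
}

int main() {
  FileTimes f{kUnknown, 100};
  auto mtime = [] { return uint64_t{900}; };
  std::cout << MarkedForPeriodicCompaction(f, 1000, 500, mtime) << "\n";  // 1: ancestor time 100 < limit 500
  std::cout << MarkedForPeriodicCompaction(f, 100, 500, mtime) << "\n";   // 0: early out, window underflows
  return 0;
}
```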
- uint64_t file_modification_time = - f->fd.table_reader->GetTableProperties()->file_creation_time; - if (file_modification_time == 0) { - file_modification_time = - f->fd.table_reader->GetTableProperties()->creation_time; + uint64_t file_modification_time = f->TryGetFileCreationTime(); + if (file_modification_time == kUnknownFileCreationTime) { + file_modification_time = f->TryGetOldestAncesterTime(); } - if (file_modification_time == 0) { + if (file_modification_time == kUnknownOldestAncesterTime) { auto file_path = TableFileName(ioptions.cf_paths, f->fd.GetNumber(), f->fd.GetPathId()); status = ioptions.env->GetFileModificationTime( @@ -2753,11 +2917,12 @@ void VersionStorageInfo::GetOverlappingInputs( FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]); const Slice file_start = ExtractUserKey(f->smallest_key); const Slice file_limit = ExtractUserKey(f->largest_key); - if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) { + if (begin != nullptr && + user_cmp->CompareWithoutTimestamp(file_limit, user_begin) < 0) { // "f" is completely before specified range; skip it iter++; } else if (end != nullptr && - user_cmp->Compare(file_start, user_end) > 0) { + user_cmp->CompareWithoutTimestamp(file_start, user_end) > 0) { // "f" is completely after specified range; skip it iter++; } else { @@ -2772,10 +2937,11 @@ void VersionStorageInfo::GetOverlappingInputs( iter = index.erase(iter); if (expand_range) { if (begin != nullptr && - user_cmp->Compare(file_start, user_begin) < 0) { + user_cmp->CompareWithoutTimestamp(file_start, user_begin) < 0) { user_begin = file_start; } - if (end != nullptr && user_cmp->Compare(file_limit, user_end) > 0) { + if (end != nullptr && + user_cmp->CompareWithoutTimestamp(file_limit, user_end) > 0) { user_end = file_limit; } } @@ -3098,7 +3264,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions, // base_bytes_min. We set it be base_bytes_min. base_level_size = base_bytes_min + 1U; base_level_ = first_non_empty_level; - ROCKS_LOG_WARN(ioptions.info_log, + ROCKS_LOG_INFO(ioptions.info_log, "More existing levels in DB than needed. " "max_bytes_for_level_multiplier may not be guaranteed."); } else { @@ -3261,6 +3427,10 @@ std::string Version::DebugString(bool hex, bool print_stats) const { r.append(" .. 
"); r.append(files[i]->largest.DebugString(hex)); r.append("]"); + if (files[i]->oldest_blob_file_number != kInvalidBlobFileNumber) { + r.append(" blob_file:"); + AppendNumberTo(&r, files[i]->oldest_blob_file_number); + } if (print_stats) { r.append("("); r.append(ToString( @@ -3292,14 +3462,60 @@ struct VersionSet::ManifestWriter { edit_list(e) {} }; +Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) { + assert(edit); + if (edit->is_in_atomic_group_) { + TEST_SYNC_POINT("AtomicGroupReadBuffer::AddEdit:AtomicGroup"); + if (replay_buffer_.empty()) { + replay_buffer_.resize(edit->remaining_entries_ + 1); + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", edit); + } + read_edits_in_atomic_group_++; + if (read_edits_in_atomic_group_ + edit->remaining_entries_ != + static_cast(replay_buffer_.size())) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", edit); + return Status::Corruption("corrupted atomic group"); + } + replay_buffer_[read_edits_in_atomic_group_ - 1] = std::move(*edit); + if (read_edits_in_atomic_group_ == replay_buffer_.size()) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", edit); + return Status::OK(); + } + return Status::OK(); + } + + // A normal edit. + if (!replay_buffer().empty()) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", edit); + return Status::Corruption("corrupted atomic group"); + } + return Status::OK(); +} + +bool AtomicGroupReadBuffer::IsFull() const { + return read_edits_in_atomic_group_ == replay_buffer_.size(); +} + +bool AtomicGroupReadBuffer::IsEmpty() const { return replay_buffer_.empty(); } + +void AtomicGroupReadBuffer::Clear() { + read_edits_in_atomic_group_ = 0; + replay_buffer_.clear(); +} + VersionSet::VersionSet(const std::string& dbname, const ImmutableDBOptions* _db_options, const EnvOptions& storage_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller) - : column_family_set_( - new ColumnFamilySet(dbname, _db_options, storage_options, table_cache, - write_buffer_manager, write_controller)), + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer) + : column_family_set_(new ColumnFamilySet( + dbname, _db_options, storage_options, table_cache, + write_buffer_manager, write_controller, block_cache_tracer)), env_(_db_options->env), dbname_(dbname), db_options_(_db_options), @@ -3313,18 +3529,13 @@ VersionSet::VersionSet(const std::string& dbname, prev_log_number_(0), current_version_number_(0), manifest_file_size_(0), - env_options_(storage_options) {} - -void CloseTables(void* ptr, size_t) { - TableReader* table_reader = reinterpret_cast(ptr); - table_reader->Close(); -} + env_options_(storage_options), + block_cache_tracer_(block_cache_tracer) {} VersionSet::~VersionSet() { // we need to delete column_family_set_ because its destructor depends on // VersionSet Cache* table_cache = column_family_set_->get_table_cache(); - table_cache->ApplyToAllCacheEntries(&CloseTables, false /* thread_safe */); column_family_set_.reset(); for (auto& file : obsolete_files_) { if (file.metadata->table_reader_handle) { @@ -3464,7 +3675,14 @@ Status VersionSet::ProcessManifestWrites( } else if (group_start != std::numeric_limits::max()) { group_start = std::numeric_limits::max(); } - LogAndApplyHelper(last_writer->cfd, builder, e, mu); + Status s = LogAndApplyHelper(last_writer->cfd, builder, e, mu); + if (!s.ok()) { + // 
free up the allocated memory + for (auto v : versions) { + delete v; + } + return s; + } batch_edits.push_back(e); } } @@ -3472,7 +3690,14 @@ Status VersionSet::ProcessManifestWrites( assert(!builder_guards.empty() && builder_guards.size() == versions.size()); auto* builder = builder_guards[i]->version_builder(); - builder->SaveTo(versions[i]->storage_info()); + Status s = builder->SaveTo(versions[i]->storage_info()); + if (!s.ok()) { + // free up the allocated memory + for (auto v : versions) { + delete v; + } + return s; + } } } @@ -3575,7 +3800,7 @@ Status VersionSet::ProcessManifestWrites( nullptr, db_options_->listeners)); descriptor_log_.reset( new log::Writer(std::move(file_writer), 0, false)); - s = WriteSnapshot(descriptor_log_.get()); + s = WriteCurrentStateToManifest(descriptor_log_.get()); } } @@ -3601,8 +3826,9 @@ Status VersionSet::ProcessManifestWrites( rocksdb_kill_odds * REDUCE_ODDS2); #ifndef NDEBUG if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) { - TEST_SYNC_POINT( - "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0"); + TEST_SYNC_POINT_CALLBACK( + "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0", + nullptr); TEST_SYNC_POINT( "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1"); } @@ -3821,8 +4047,6 @@ Status VersionSet::LogAndApply( } } if (0 == num_undropped_cfds) { - // TODO (yanqin) maybe use a different status code to denote column family - // drop other than OK and ShutdownInProgress for (int i = 0; i != num_cfds; ++i) { manifest_writers_.pop_front(); } @@ -3830,7 +4054,7 @@ Status VersionSet::LogAndApply( if (!manifest_writers_.empty()) { manifest_writers_.front()->cv.Signal(); } - return Status::ShutdownInProgress(); + return Status::ColumnFamilyDropped(); } return ProcessManifestWrites(writers, mu, db_directory, new_descriptor_log, @@ -3854,9 +4078,9 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) { } } -void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, - VersionBuilder* builder, VersionEdit* edit, - InstrumentedMutex* mu) { +Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, + VersionBuilder* builder, VersionEdit* edit, + InstrumentedMutex* mu) { #ifdef NDEBUG (void)cfd; #endif @@ -3880,7 +4104,9 @@ void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_ : last_sequence_); - builder->Apply(edit); + Status s = builder->Apply(edit); + + return s; } Status VersionSet::ApplyOneVersionEditToBuilder( @@ -3889,10 +4115,7 @@ Status VersionSet::ApplyOneVersionEditToBuilder( std::unordered_map& column_families_not_found, std::unordered_map>& builders, - bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number, - uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, - bool* have_last_sequence, SequenceNumber* last_sequence, - uint64_t* min_log_number_to_keep, uint32_t* max_column_family) { + VersionEditParams* version_edit_params) { // Not found means that user didn't supply that column // family option AND we encountered column family add // record. 
Once we encounter column family drop record, @@ -3916,11 +4139,23 @@ Status VersionSet::ApplyOneVersionEditToBuilder( edit.column_family_name_); } auto cf_options = name_to_options.find(edit.column_family_name_); - if (cf_options == name_to_options.end()) { + // implicitly add persistent_stats column family without requiring user + // to specify + bool is_persistent_stats_column_family = + edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0; + if (cf_options == name_to_options.end() && + !is_persistent_stats_column_family) { column_families_not_found.insert( {edit.column_family_, edit.column_family_name_}); } else { - cfd = CreateColumnFamily(cf_options->second, &edit); + // recover persistent_stats CF from a DB that already contains it + if (is_persistent_stats_column_family) { + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + cfd = CreateColumnFamily(cfo, &edit); + } else { + cfd = CreateColumnFamily(cf_options->second, &edit); + } cfd->set_initialized(); builders.insert(std::make_pair( edit.column_family_, std::unique_ptr( @@ -3961,71 +4196,73 @@ Status VersionSet::ApplyOneVersionEditToBuilder( // to builder auto builder = builders.find(edit.column_family_); assert(builder != builders.end()); - builder->second->version_builder()->Apply(&edit); + Status s = builder->second->version_builder()->Apply(&edit); + if (!s.ok()) { + return s; + } } - return ExtractInfoFromVersionEdit( - cfd, edit, have_log_number, log_number, have_prev_log_number, - previous_log_number, have_next_file, next_file, have_last_sequence, - last_sequence, min_log_number_to_keep, max_column_family); + return ExtractInfoFromVersionEdit(cfd, edit, version_edit_params); } Status VersionSet::ExtractInfoFromVersionEdit( - ColumnFamilyData* cfd, const VersionEdit& edit, bool* have_log_number, - uint64_t* log_number, bool* have_prev_log_number, - uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, - bool* have_last_sequence, SequenceNumber* last_sequence, - uint64_t* min_log_number_to_keep, uint32_t* max_column_family) { + ColumnFamilyData* cfd, const VersionEdit& from_edit, + VersionEditParams* version_edit_params) { if (cfd != nullptr) { - if (edit.has_log_number_) { - if (cfd->GetLogNumber() > edit.log_number_) { + if (from_edit.has_db_id_) { + version_edit_params->SetDBId(from_edit.db_id_); + } + if (from_edit.has_log_number_) { + if (cfd->GetLogNumber() > from_edit.log_number_) { ROCKS_LOG_WARN( db_options_->info_log, "MANIFEST corruption detected, but ignored - Log numbers in " "records NOT monotonically increasing"); } else { - cfd->SetLogNumber(edit.log_number_); - *have_log_number = true; - *log_number = edit.log_number_; + cfd->SetLogNumber(from_edit.log_number_); + version_edit_params->SetLogNumber(from_edit.log_number_); } } - if (edit.has_comparator_ && - edit.comparator_ != cfd->user_comparator()->Name()) { + if (from_edit.has_comparator_ && + from_edit.comparator_ != cfd->user_comparator()->Name()) { return Status::InvalidArgument( cfd->user_comparator()->Name(), - "does not match existing comparator " + edit.comparator_); + "does not match existing comparator " + from_edit.comparator_); } } - if (edit.has_prev_log_number_) { - *previous_log_number = edit.prev_log_number_; - *have_prev_log_number = true; + if (from_edit.has_prev_log_number_) { + version_edit_params->SetPrevLogNumber(from_edit.prev_log_number_); } - if (edit.has_next_file_number_) { - *next_file = edit.next_file_number_; - *have_next_file = true; + if (from_edit.has_next_file_number_) { + 
version_edit_params->SetNextFile(from_edit.next_file_number_); } - if (edit.has_max_column_family_) { - *max_column_family = edit.max_column_family_; + if (from_edit.has_max_column_family_) { + version_edit_params->SetMaxColumnFamily(from_edit.max_column_family_); } - if (edit.has_min_log_number_to_keep_) { - *min_log_number_to_keep = - std::max(*min_log_number_to_keep, edit.min_log_number_to_keep_); + if (from_edit.has_min_log_number_to_keep_) { + version_edit_params->min_log_number_to_keep_ = + std::max(version_edit_params->min_log_number_to_keep_, + from_edit.min_log_number_to_keep_); } - if (edit.has_last_sequence_) { - *last_sequence = edit.last_sequence_; - *have_last_sequence = true; + if (from_edit.has_last_sequence_) { + version_edit_params->SetLastSequence(from_edit.last_sequence_); } return Status::OK(); } -Status VersionSet::GetCurrentManifestPath(std::string* manifest_path) { +Status VersionSet::GetCurrentManifestPath(const std::string& dbname, Env* env, + std::string* manifest_path, + uint64_t* manifest_file_number) { + assert(env != nullptr); assert(manifest_path != nullptr); + assert(manifest_file_number != nullptr); + std::string fname; - Status s = ReadFileToString(env_, CurrentFileName(dbname_), &fname); + Status s = ReadFileToString(env, CurrentFileName(dbname), &fname); if (!s.ok()) { return s; } @@ -4035,21 +4272,87 @@ Status VersionSet::GetCurrentManifestPath(std::string* manifest_path) { // remove the trailing '\n' fname.resize(fname.size() - 1); FileType type; - bool parse_ok = ParseFileName(fname, &manifest_file_number_, &type); + bool parse_ok = ParseFileName(fname, manifest_file_number, &type); if (!parse_ok || type != kDescriptorFile) { return Status::Corruption("CURRENT file corrupted"); } - *manifest_path = dbname_; - if (dbname_.back() != '/') { + *manifest_path = dbname; + if (dbname.back() != '/') { manifest_path->push_back('/'); } *manifest_path += fname; return Status::OK(); } +Status VersionSet::ReadAndRecover( + log::Reader* reader, AtomicGroupReadBuffer* read_buffer, + const std::unordered_map& name_to_options, + std::unordered_map& column_families_not_found, + std::unordered_map>& + builders, + VersionEditParams* version_edit_params, std::string* db_id) { + assert(reader != nullptr); + assert(read_buffer != nullptr); + Status s; + Slice record; + std::string scratch; + size_t recovered_edits = 0; + while (reader->ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + if (edit.has_db_id_) { + db_id_ = edit.GetDbId(); + if (db_id != nullptr) { + db_id->assign(edit.GetDbId()); + } + } + s = read_buffer->AddEdit(&edit); + if (!s.ok()) { + break; + } + if (edit.is_in_atomic_group_) { + if (read_buffer->IsFull()) { + // Apply edits in an atomic group when we have read all edits in the + // group. + for (auto& e : read_buffer->replay_buffer()) { + s = ApplyOneVersionEditToBuilder(e, name_to_options, + column_families_not_found, builders, + version_edit_params); + if (!s.ok()) { + break; + } + recovered_edits++; + } + if (!s.ok()) { + break; + } + read_buffer->Clear(); + } + } else { + // Apply a normal edit immediately. + s = ApplyOneVersionEditToBuilder(edit, name_to_options, + column_families_not_found, builders, + version_edit_params); + if (s.ok()) { + recovered_edits++; + } + } + } + if (!s.ok()) { + // Clear the buffer if we fail to decode/apply an edit. 
+ read_buffer->Clear(); + } + TEST_SYNC_POINT_CALLBACK("VersionSet::ReadAndRecover:RecoveredEdits", + &recovered_edits); + return s; +} + Status VersionSet::Recover( - const std::vector& column_families, - bool read_only) { + const std::vector& column_families, bool read_only, + std::string* db_id) { std::unordered_map cf_name_to_options; for (auto cf : column_families) { cf_name_to_options.insert({cf.name, cf.options}); @@ -4061,7 +4364,8 @@ Status VersionSet::Recover( // Read "CURRENT" file, which contains a pointer to the current manifest file std::string manifest_path; - Status s = GetCurrentManifestPath(&manifest_path); + Status s = GetCurrentManifestPath(dbname_, env_, &manifest_path, + &manifest_file_number_); if (!s.ok()) { return s; } @@ -4078,7 +4382,8 @@ Status VersionSet::Recover( return s; } manifest_file_reader.reset( - new SequentialFileReader(std::move(manifest_file), manifest_path)); + new SequentialFileReader(std::move(manifest_file), manifest_path, + db_options_->log_readahead_size)); } uint64_t current_manifest_file_size; s = env_->GetFileSize(manifest_path, ¤t_manifest_file_size); @@ -4086,16 +4391,6 @@ Status VersionSet::Recover( return s; } - bool have_log_number = false; - bool have_prev_log_number = false; - bool have_next_file = false; - bool have_last_sequence = false; - uint64_t next_file = 0; - uint64_t last_sequence = 0; - uint64_t log_number = 0; - uint64_t previous_log_number = 0; - uint32_t max_column_family = 0; - uint64_t min_log_number_to_keep = 0; std::unordered_map> builders; @@ -4115,7 +4410,7 @@ Status VersionSet::Recover( builders.insert( std::make_pair(0, std::unique_ptr( new BaseReferencedVersionBuilder(default_cfd)))); - + VersionEditParams version_edit_params; { VersionSet::LogReporter reporter; reporter.status = &s; @@ -4123,88 +4418,33 @@ Status VersionSet::Recover( true /* checksum */, 0 /* log_number */); Slice record; std::string scratch; - std::vector replay_buffer; - size_t num_entries_decoded = 0; - while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - - if (edit.is_in_atomic_group_) { - if (replay_buffer.empty()) { - replay_buffer.resize(edit.remaining_entries_ + 1); - TEST_SYNC_POINT_CALLBACK("VersionSet::Recover:FirstInAtomicGroup", - &edit); - } - ++num_entries_decoded; - if (num_entries_decoded + edit.remaining_entries_ != - static_cast(replay_buffer.size())) { - TEST_SYNC_POINT_CALLBACK( - "VersionSet::Recover:IncorrectAtomicGroupSize", &edit); - s = Status::Corruption("corrupted atomic group"); - break; - } - replay_buffer[num_entries_decoded - 1] = std::move(edit); - if (num_entries_decoded == replay_buffer.size()) { - TEST_SYNC_POINT_CALLBACK("VersionSet::Recover:LastInAtomicGroup", - &edit); - for (auto& e : replay_buffer) { - s = ApplyOneVersionEditToBuilder( - e, cf_name_to_options, column_families_not_found, builders, - &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); - if (!s.ok()) { - break; - } - } - replay_buffer.clear(); - num_entries_decoded = 0; - } - TEST_SYNC_POINT("VersionSet::Recover:AtomicGroup"); - } else { - if (!replay_buffer.empty()) { - TEST_SYNC_POINT_CALLBACK( - "VersionSet::Recover:AtomicGroupMixedWithNormalEdits", &edit); - s = Status::Corruption("corrupted atomic group"); - break; - } - s = ApplyOneVersionEditToBuilder( - edit, cf_name_to_options, column_families_not_found, 
builders, - &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); - } - if (!s.ok()) { - break; - } - } + AtomicGroupReadBuffer read_buffer; + s = ReadAndRecover(&reader, &read_buffer, cf_name_to_options, + column_families_not_found, builders, + &version_edit_params, db_id); } if (s.ok()) { - if (!have_next_file) { + if (!version_edit_params.has_next_file_number_) { s = Status::Corruption("no meta-nextfile entry in descriptor"); - } else if (!have_log_number) { + } else if (!version_edit_params.has_log_number_) { s = Status::Corruption("no meta-lognumber entry in descriptor"); - } else if (!have_last_sequence) { + } else if (!version_edit_params.has_last_sequence_) { s = Status::Corruption("no last-sequence-number entry in descriptor"); } - if (!have_prev_log_number) { - previous_log_number = 0; + if (!version_edit_params.has_prev_log_number_) { + version_edit_params.SetPrevLogNumber(0); } - column_family_set_->UpdateMaxColumnFamily(max_column_family); + column_family_set_->UpdateMaxColumnFamily( + version_edit_params.max_column_family_); // When reading DB generated using old release, min_log_number_to_keep=0. // All log files will be scanned for potential prepare entries. - MarkMinLogNumberToKeep2PC(min_log_number_to_keep); - MarkFileNumberUsed(previous_log_number); - MarkFileNumberUsed(log_number); + MarkMinLogNumberToKeep2PC(version_edit_params.min_log_number_to_keep_); + MarkFileNumberUsed(version_edit_params.prev_log_number_); + MarkFileNumberUsed(version_edit_params.log_number_); } // there were some column families in the MANIFEST that weren't specified @@ -4265,11 +4505,11 @@ Status VersionSet::Recover( } manifest_file_size_ = current_manifest_file_size; - next_file_number_.store(next_file + 1); - last_allocated_sequence_ = last_sequence; - last_published_sequence_ = last_sequence; - last_sequence_ = last_sequence; - prev_log_number_ = previous_log_number; + next_file_number_.store(version_edit_params.next_file_number_ + 1); + last_allocated_sequence_ = version_edit_params.last_sequence_; + last_published_sequence_ = version_edit_params.last_sequence_; + last_sequence_ = version_edit_params.last_sequence_; + prev_log_number_ = version_edit_params.prev_log_number_; ROCKS_LOG_INFO( db_options_->info_log, @@ -4278,8 +4518,8 @@ Status VersionSet::Recover( ", last_sequence is %" PRIu64 ", log_number is %" PRIu64 ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 ",min_log_number_to_keep is %" PRIu64 "\n", - manifest_path.c_str(), manifest_file_number_, - next_file_number_.load(), last_sequence_.load(), log_number, + manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), + last_sequence_.load(), version_edit_params.log_number_, prev_log_number_, column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc()); @@ -4303,26 +4543,22 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, // so we're fine using the defaults EnvOptions soptions; // Read "CURRENT" file, which contains a pointer to the current manifest file - std::string current; - Status s = ReadFileToString(env, CurrentFileName(dbname), ¤t); + std::string manifest_path; + uint64_t manifest_file_number; + Status s = GetCurrentManifestPath(dbname, env, &manifest_path, + &manifest_file_number); if (!s.ok()) { return s; } - if (current.empty() || current[current.size()-1] != '\n') { - return Status::Corruption("CURRENT file does not end 
with newline"); - } - current.resize(current.size() - 1); - - std::string dscname = dbname + "/" + current; std::unique_ptr file_reader; { std::unique_ptr file; - s = env->NewSequentialFile(dscname, &file, soptions); + s = env->NewSequentialFile(manifest_path, &file, soptions); if (!s.ok()) { return s; } - file_reader.reset(new SequentialFileReader(std::move(file), dscname)); + file_reader.reset(new SequentialFileReader(std::move(file), manifest_path)); } std::map column_family_names; @@ -4385,7 +4621,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, options->table_cache_numshardbits)); WriteController wc(options->delayed_write_rate); WriteBufferManager wb(options->db_write_buffer_size); - VersionSet versions(dbname, &db_options, env_options, tc.get(), &wb, &wc); + VersionSet versions(dbname, &db_options, env_options, tc.get(), &wb, &wc, + /*block_cache_tracer=*/nullptr); Status status; std::vector dummy; @@ -4429,7 +4666,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, } // we need to allocate an array with the old number of levels size to - // avoid SIGSEGV in WriteSnapshot() + // avoid SIGSEGV in WriteCurrentStatetoManifest() // however, all levels bigger or equal to new_levels will be empty std::vector* new_files_list = new std::vector[current_levels]; @@ -4466,7 +4703,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, if (!s.ok()) { return s; } - file_reader.reset(new SequentialFileReader(std::move(file), dscname)); + file_reader.reset(new SequentialFileReader( + std::move(file), dscname, db_options_->log_readahead_size)); } bool have_prev_log_number = false; @@ -4562,7 +4800,10 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, // to builder auto builder = builders.find(edit.column_family_); assert(builder != builders.end()); - builder->second->version_builder()->Apply(&edit); + s = builder->second->version_builder()->Apply(&edit); + if (!s.ok()) { + break; + } } if (cfd != nullptr && edit.has_log_number_) { @@ -4665,7 +4906,6 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) { next_file_number_.store(number + 1, std::memory_order_relaxed); } } - // Called only either from ::LogAndApply which is protected by mutex or during // recovery which is single-threaded. void VersionSet::MarkMinLogNumberToKeep2PC(uint64_t number) { @@ -4674,7 +4914,7 @@ void VersionSet::MarkMinLogNumberToKeep2PC(uint64_t number) { } } -Status VersionSet::WriteSnapshot(log::Writer* log) { +Status VersionSet::WriteCurrentStateToManifest(log::Writer* log) { // TODO: Break up into multiple records to reduce memory usage on recovery? // WARNING: This method doesn't hold a mutex!! @@ -4682,6 +4922,22 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // This is done without DB mutex lock held, but only within single-threaded // LogAndApply. Column family manipulations can only happen within LogAndApply // (the same single thread), so we're safe to iterate. 
+ + if (db_options_->write_dbid_to_manifest) { + VersionEdit edit_for_db_id; + assert(!db_id_.empty()); + edit_for_db_id.SetDBId(db_id_); + std::string db_id_record; + if (!edit_for_db_id.EncodeTo(&db_id_record)) { + return Status::Corruption("Unable to Encode VersionEdit:" + + edit_for_db_id.DebugString(true)); + } + Status add_record = log->AddRecord(db_id_record); + if (!add_record.ok()) { + return add_record; + } + } + for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { continue; @@ -4720,7 +4976,8 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, - f->marked_for_compaction); + f->marked_for_compaction, f->oldest_blob_file_number, + f->oldest_ancester_time, f->file_creation_time); } } edit.SetLogNumber(cfd->GetLogNumber()); @@ -4735,7 +4992,6 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { } } } - return Status::OK(); } @@ -4745,111 +5001,198 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where // we avoid doing binary search for the keys b and c twice and instead somehow // maintain state of where they first appear in the files. -uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, +uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, + Version* v, const Slice& start, const Slice& end, int start_level, - int end_level) { + int end_level, TableReaderCaller caller) { + const auto& icmp = v->cfd_->internal_comparator(); + // pre-condition - assert(v->cfd_->internal_comparator().Compare(start, end) <= 0); + assert(icmp.Compare(start, end) <= 0); - uint64_t size = 0; + uint64_t total_full_size = 0; const auto* vstorage = v->storage_info(); - end_level = end_level == -1 - ? vstorage->num_non_empty_levels() - : std::min(end_level, vstorage->num_non_empty_levels()); + const int num_non_empty_levels = vstorage->num_non_empty_levels(); + end_level = (end_level == -1) ? num_non_empty_levels + : std::min(end_level, num_non_empty_levels); assert(start_level <= end_level); - for (int level = start_level; level < end_level; level++) { + // Outline of the optimization that uses options.files_size_error_margin. + // When approximating the files total size that is used to store a keys range, + // we first sum up the sizes of the files that fully fall into the range. + // Then we sum up the sizes of all the files that may intersect with the range + // (this includes all files in L0 as well). Then, if total_intersecting_size + // is smaller than total_full_size * options.files_size_error_margin - we can + // infer that the intersecting files have a sufficiently negligible + // contribution to the total size, and we can approximate the storage required + // for the keys in range as just half of the intersecting_files_size. + // E.g., if the value of files_size_error_margin is 0.1, then the error of the + // approximation is limited to only ~10% of the total size of files that fully + // fall into the keys range. In such case, this helps to avoid a costly + // process of binary searching the intersecting files that is required only + // for a more precise calculation of the total size. 
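Aside, not part of the patch: the files_size_error_margin heuristic outlined in the comment above can be sketched as a standalone function. FileSpan and EstimateBytesInRange are hypothetical; in the patch the boundary files are the per-level first/last files plus all of L0, and the precise path calls ApproximateSize for the first files and ApproximateOffsetOf for the last ones, which the sketch collapses into a single estimator.

#include <cstdint>
#include <vector>

// Illustrative sketch only -- not RocksDB code.
struct FileSpan {
  uint64_t file_size;
  uint64_t bytes_in_range;  // what a precise per-file estimate would return
};

uint64_t EstimateBytesInRange(const FileSpan& f) { return f.bytes_in_range; }

// full_files: files that fall entirely inside [start, end); their whole size
// counts. boundary_files: first/last files per level plus all of L0; they may
// only partially overlap the range.
uint64_t ApproximateRangeSizeSketch(const std::vector<FileSpan>& full_files,
                                    const std::vector<FileSpan>& boundary_files,
                                    double files_size_error_margin) {
  uint64_t total_full_size = 0;
  for (const FileSpan& f : full_files) {
    total_full_size += f.file_size;
  }
  uint64_t total_intersecting_size = 0;
  for (const FileSpan& f : boundary_files) {
    total_intersecting_size += f.file_size;
  }
  if (files_size_error_margin > 0 &&
      total_intersecting_size <
          static_cast<uint64_t>(total_full_size * files_size_error_margin)) {
    // Boundary files contribute little; assume half of each overlaps the
    // range and skip the costly per-file binary searches.
    return total_full_size + total_intersecting_size / 2;
  }
  // Otherwise fall back to a precise per-file estimate for the boundary files.
  for (const FileSpan& f : boundary_files) {
    total_full_size += EstimateBytesInRange(f);
  }
  return total_full_size;
}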
+ + autovector first_files; + autovector last_files; + + // scan all the levels + for (int level = start_level; level < end_level; ++level) { const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level); - if (!files_brief.num_files) { + if (files_brief.num_files == 0) { // empty level, skip exploration continue; } - if (!level) { - // level 0 data is sorted order, handle the use case explicitly - size += ApproximateSizeLevel0(v, files_brief, start, end); + if (level == 0) { + // level 0 files are not in sorted order, we need to iterate through + // the list to compute the total bytes that require scanning, + // so handle the case explicitly (similarly to first_files case) + for (size_t i = 0; i < files_brief.num_files; i++) { + first_files.push_back(&files_brief.files[i]); + } continue; } assert(level > 0); assert(files_brief.num_files > 0); - // identify the file position for starting key - const uint64_t idx_start = FindFileInRange( - v->cfd_->internal_comparator(), files_brief, start, - /*start=*/0, static_cast(files_brief.num_files - 1)); - assert(idx_start < files_brief.num_files); - - // scan all files from the starting position until the ending position - // inferred from the sorted order - for (uint64_t i = idx_start; i < files_brief.num_files; i++) { - uint64_t val; - val = ApproximateSize(v, files_brief.files[i], end); - if (!val) { - // the files after this will not have the range - break; - } + // identify the file position for start key + const int idx_start = + FindFileInRange(icmp, files_brief, start, 0, + static_cast(files_brief.num_files - 1)); + assert(static_cast(idx_start) < files_brief.num_files); - size += val; + // identify the file position for end key + int idx_end = idx_start; + if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) { + idx_end = + FindFileInRange(icmp, files_brief, end, idx_start, + static_cast(files_brief.num_files - 1)); + } + assert(idx_end >= idx_start && + static_cast(idx_end) < files_brief.num_files); - if (i == idx_start) { - // subtract the bytes needed to be scanned to get to the starting - // key - val = ApproximateSize(v, files_brief.files[i], start); - assert(size >= val); - size -= val; - } + // scan all files from the starting index to the ending index + // (inferred from the sorted order) + + // first scan all the intermediate full files (excluding first and last) + for (int i = idx_start + 1; i < idx_end; ++i) { + uint64_t file_size = files_brief.files[i].fd.GetFileSize(); + // The entire file falls into the range, so we can just take its size. + assert(file_size == + ApproximateSize(v, files_brief.files[i], start, end, caller)); + total_full_size += file_size; + } + + // save the first and the last files (which may be the same file), so we + // can scan them later. + first_files.push_back(&files_brief.files[idx_start]); + if (idx_start != idx_end) { + // we need to estimate size for both files, only if they are different + last_files.push_back(&files_brief.files[idx_end]); } } - return size; -} + // The sum of all file sizes that intersect the [start, end] keys range. 
+ uint64_t total_intersecting_size = 0; + for (const auto* file_ptr : first_files) { + total_intersecting_size += file_ptr->fd.GetFileSize(); + } + for (const auto* file_ptr : last_files) { + total_intersecting_size += file_ptr->fd.GetFileSize(); + } -uint64_t VersionSet::ApproximateSizeLevel0(Version* v, - const LevelFilesBrief& files_brief, - const Slice& key_start, - const Slice& key_end) { - // level 0 files are not in sorted order, we need to iterate through - // the list to compute the total bytes that require scanning - uint64_t size = 0; - for (size_t i = 0; i < files_brief.num_files; i++) { - const uint64_t start = ApproximateSize(v, files_brief.files[i], key_start); - const uint64_t end = ApproximateSize(v, files_brief.files[i], key_end); - assert(end >= start); - size += end - start; + // Now scan all the first & last files at each level, and estimate their size. + // If the total_intersecting_size is less than X% of the total_full_size - we + // want to approximate the result in order to avoid the costly binary search + // inside ApproximateSize. We use half of file size as an approximation below. + + const double margin = options.files_size_error_margin; + if (margin > 0 && total_intersecting_size < + static_cast(total_full_size * margin)) { + total_full_size += total_intersecting_size / 2; + } else { + // Estimate for all the first files, at each level + for (const auto file_ptr : first_files) { + total_full_size += ApproximateSize(v, *file_ptr, start, end, caller); + } + + // Estimate for all the last files, at each level + for (const auto file_ptr : last_files) { + // We could use ApproximateSize here, but calling ApproximateOffsetOf + // directly is just more efficient. + total_full_size += ApproximateOffsetOf(v, *file_ptr, end, caller); + } } - return size; + + return total_full_size; } -uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, - const Slice& key) { +uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, + const Slice& key, + TableReaderCaller caller) { // pre-condition assert(v); + const auto& icmp = v->cfd_->internal_comparator(); uint64_t result = 0; - if (v->cfd_->internal_comparator().Compare(f.largest_key, key) <= 0) { + if (icmp.Compare(f.largest_key, key) <= 0) { // Entire file is before "key", so just add the file size result = f.fd.GetFileSize(); - } else if (v->cfd_->internal_comparator().Compare(f.smallest_key, key) > 0) { + } else if (icmp.Compare(f.smallest_key, key) > 0) { // Entire file is after "key", so ignore result = 0; } else { // "key" falls in the range for this table. Add the // approximate offset of "key" within the table. 
- TableReader* table_reader_ptr; - InternalIterator* iter = v->cfd_->table_cache()->NewIterator( - ReadOptions(), v->env_options_, v->cfd_->internal_comparator(), - *f.file_metadata, nullptr /* range_del_agg */, - v->GetMutableCFOptions().prefix_extractor.get(), &table_reader_ptr); - if (table_reader_ptr != nullptr) { - result = table_reader_ptr->ApproximateOffsetOf(key); + TableCache* table_cache = v->cfd_->table_cache(); + if (table_cache != nullptr) { + result = table_cache->ApproximateOffsetOf( + key, f.file_metadata->fd, caller, icmp, + v->GetMutableCFOptions().prefix_extractor.get()); } - delete iter; } return result; } +uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, + const Slice& start, const Slice& end, + TableReaderCaller caller) { + // pre-condition + assert(v); + const auto& icmp = v->cfd_->internal_comparator(); + assert(icmp.Compare(start, end) <= 0); + + if (icmp.Compare(f.largest_key, start) <= 0 || + icmp.Compare(f.smallest_key, end) > 0) { + // Entire file is before or after the start/end keys range + return 0; + } + + if (icmp.Compare(f.smallest_key, start) >= 0) { + // Start of the range is before the file start - approximate by end offset + return ApproximateOffsetOf(v, f, end, caller); + } + + if (icmp.Compare(f.largest_key, end) < 0) { + // End of the range is after the file end - approximate by subtracting + // start offset from the file size + uint64_t start_offset = ApproximateOffsetOf(v, f, start, caller); + assert(f.fd.GetFileSize() >= start_offset); + return f.fd.GetFileSize() - start_offset; + } + + // The interval falls entirely in the range for this file. + TableCache* table_cache = v->cfd_->table_cache(); + if (table_cache == nullptr) { + return 0; + } + return table_cache->ApproximateSize( + start, end, f.file_metadata->fd, caller, icmp, + v->GetMutableCFOptions().prefix_extractor.get()); +} + void VersionSet::AddLiveFiles(std::vector* live_list) { // pre-calculate space requirement int64_t total_files = 0; @@ -4922,10 +5265,12 @@ InternalIterator* VersionSet::MakeInputIterator( read_options, env_options_compactions, cfd->internal_comparator(), *flevel->files[i].file_metadata, range_del_agg, c->mutable_cf_options()->prefix_extractor.get(), - nullptr /* table_reader_ptr */, - nullptr /* no per level latency histogram */, - true /* for_compaction */, nullptr /* arena */, - false /* skip_filters */, static_cast(which) /* level */); + /*table_reader_ptr=*/nullptr, + /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction, + /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/static_cast(which), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); } } else { // Create concatenating iterator for the files from this level @@ -4933,10 +5278,10 @@ InternalIterator* VersionSet::MakeInputIterator( cfd->table_cache(), read_options, env_options_compactions, cfd->internal_comparator(), c->input_levels(which), c->mutable_cf_options()->prefix_extractor.get(), - false /* should_sample */, - nullptr /* no per level latency histogram */, - true /* for_compaction */, false /* skip_filters */, - static_cast(which) /* level */, range_del_agg, + /*should_sample=*/false, + /*no per level latency histogram=*/nullptr, + TableReaderCaller::kCompaction, /*skip_filters=*/false, + /*level=*/static_cast(which), range_del_agg, c->boundaries(which)); } } @@ -5040,7 +5385,9 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { assert(!cfd->ioptions()->cf_paths.empty()); filemetadata.db_path = 
cfd->ioptions()->cf_paths.back().path; } - filemetadata.name = MakeTableFileName("", file->fd.GetNumber()); + const uint64_t file_number = file->fd.GetNumber(); + filemetadata.name = MakeTableFileName("", file_number); + filemetadata.file_number = file_number; filemetadata.level = level; filemetadata.size = static_cast(file->fd.GetFileSize()); filemetadata.smallestkey = file->smallest.user_key().ToString(); @@ -5052,6 +5399,7 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { filemetadata.being_compacted = file->being_compacted; filemetadata.num_entries = file->num_entries; filemetadata.num_deletions = file->num_deletions; + filemetadata.oldest_blob_file_number = file->oldest_blob_file_number; metadata->push_back(filemetadata); } } @@ -5137,7 +5485,9 @@ ReactiveVersionSet::ReactiveVersionSet(const std::string& dbname, WriteBufferManager* write_buffer_manager, WriteController* write_controller) : VersionSet(dbname, _db_options, _env_options, table_cache, - write_buffer_manager, write_controller) {} + write_buffer_manager, write_controller, + /*block_cache_tracer=*/nullptr), + number_of_edits_to_skip_(0) {} ReactiveVersionSet::~ReactiveVersionSet() {} @@ -5168,17 +5518,6 @@ Status ReactiveVersionSet::Recover( // In recovery, nobody else can access it, so it's fine to set it to be // initialized earlier. default_cfd->set_initialized(); - - bool have_log_number = false; - bool have_prev_log_number = false; - bool have_next_file = false; - bool have_last_sequence = false; - uint64_t next_file = 0; - uint64_t last_sequence = 0; - uint64_t log_number = 0; - uint64_t previous_log_number = 0; - uint32_t max_column_family = 0; - uint64_t min_log_number_to_keep = 0; std::unordered_map> builders; std::unordered_map column_families_not_found; @@ -5194,25 +5533,17 @@ Status ReactiveVersionSet::Recover( log::Reader* reader = manifest_reader->get(); int retry = 0; + VersionEdit version_edit; while (s.ok() && retry < 1) { assert(reader != nullptr); Slice record; std::string scratch; - while (s.ok() && reader->ReadRecord(&record, &scratch)) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - s = ApplyOneVersionEditToBuilder( - edit, cf_name_to_options, column_families_not_found, builders, - &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); - } + s = ReadAndRecover(reader, &read_buffer_, cf_name_to_options, + column_families_not_found, builders, &version_edit); if (s.ok()) { - bool enough = have_next_file && have_log_number && have_last_sequence; + bool enough = version_edit.has_next_file_number_ && + version_edit.has_log_number_ && + version_edit.has_last_sequence_; if (enough) { for (const auto& cf : column_families) { auto cfd = column_family_set_->GetColumnFamily(cf.name); @@ -5254,14 +5585,14 @@ Status ReactiveVersionSet::Recover( } if (s.ok()) { - if (!have_prev_log_number) { - previous_log_number = 0; + if (!version_edit.has_prev_log_number_) { + version_edit.prev_log_number_ = 0; } - column_family_set_->UpdateMaxColumnFamily(max_column_family); + column_family_set_->UpdateMaxColumnFamily(version_edit.max_column_family_); - MarkMinLogNumberToKeep2PC(min_log_number_to_keep); - MarkFileNumberUsed(previous_log_number); - MarkFileNumberUsed(log_number); + MarkMinLogNumberToKeep2PC(version_edit.min_log_number_to_keep_); + MarkFileNumberUsed(version_edit.prev_log_number_); + 
MarkFileNumberUsed(version_edit.log_number_); for (auto cfd : *column_family_set_) { assert(builders.count(cfd->GetID()) > 0); @@ -5294,11 +5625,11 @@ Status ReactiveVersionSet::Recover( !(db_options_->skip_stats_update_on_db_open)); AppendVersion(cfd, v); } - next_file_number_.store(next_file + 1); - last_allocated_sequence_ = last_sequence; - last_published_sequence_ = last_sequence; - last_sequence_ = last_sequence; - prev_log_number_ = previous_log_number; + next_file_number_.store(version_edit.next_file_number_ + 1); + last_allocated_sequence_ = version_edit.last_sequence_; + last_published_sequence_ = version_edit.last_sequence_; + last_sequence_ = version_edit.last_sequence_; + prev_log_number_ = version_edit.prev_log_number_; for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { continue; @@ -5320,17 +5651,7 @@ Status ReactiveVersionSet::ReadAndApply( mu->AssertHeld(); Status s; - bool have_log_number = false; - bool have_prev_log_number = false; - bool have_next_file = false; - bool have_last_sequence = false; - uint64_t next_file = 0; - uint64_t last_sequence = 0; - uint64_t log_number = 0; - uint64_t previous_log_number = 0; - uint32_t max_column_family = 0; - uint64_t min_log_number_to_keep = 0; - + uint64_t applied_edits = 0; while (s.ok()) { Slice record; std::string scratch; @@ -5342,73 +5663,50 @@ Status ReactiveVersionSet::ReadAndApply( if (!s.ok()) { break; } - ColumnFamilyData* cfd = - column_family_set_->GetColumnFamily(edit.column_family_); - // If we cannot find this column family in our column family set, then it - // may be a new column family created by the primary after the secondary - // starts. Ignore it for now. - if (nullptr == cfd) { + + // Skip the first VersionEdits of each MANIFEST generated by + // VersionSet::WriteCurrentStatetoManifest. 
+ if (number_of_edits_to_skip_ > 0) { + ColumnFamilyData* cfd = + column_family_set_->GetColumnFamily(edit.column_family_); + if (cfd != nullptr && !cfd->IsDropped()) { + --number_of_edits_to_skip_; + } continue; } - if (active_version_builders_.find(edit.column_family_) == - active_version_builders_.end()) { - std::unique_ptr builder_guard( - new BaseReferencedVersionBuilder(cfd)); - active_version_builders_.insert( - std::make_pair(edit.column_family_, std::move(builder_guard))); - } - s = ApplyOneVersionEditToBuilder( - edit, &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); + + s = read_buffer_.AddEdit(&edit); if (!s.ok()) { break; } - auto builder_iter = active_version_builders_.find(edit.column_family_); - assert(builder_iter != active_version_builders_.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - false /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); - TEST_SYNC_POINT_CALLBACK( - "ReactiveVersionSet::ReadAndApply:AfterLoadTableHandlers", &s); - if (!s.ok() && !s.IsPathNotFound()) { - break; - } else if (s.IsPathNotFound()) { - s = Status::OK(); - } else { // s.ok() == true - auto version = new Version(cfd, this, env_options_, - *cfd->GetLatestMutableCFOptions(), - current_version_number_++); - builder->SaveTo(version->storage_info()); - version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); - AppendVersion(cfd, version); - active_version_builders_.erase(builder_iter); - if (cfds_changed->count(cfd) == 0) { - cfds_changed->insert(cfd); + VersionEdit temp_edit; + if (edit.is_in_atomic_group_) { + if (read_buffer_.IsFull()) { + // Apply edits in an atomic group when we have read all edits in the + // group. + for (auto& e : read_buffer_.replay_buffer()) { + s = ApplyOneVersionEditToBuilder(e, cfds_changed, &temp_edit); + if (!s.ok()) { + break; + } + applied_edits++; + } + if (!s.ok()) { + break; + } + read_buffer_.Clear(); + } + } else { + // Apply a normal edit immediately. + s = ApplyOneVersionEditToBuilder(edit, cfds_changed, &temp_edit); + if (s.ok()) { + applied_edits++; } } - if (have_next_file) { - next_file_number_.store(next_file + 1); - } - if (have_last_sequence) { - last_allocated_sequence_ = last_sequence; - last_published_sequence_ = last_sequence; - last_sequence_ = last_sequence; - } - if (have_prev_log_number) { - prev_log_number_ = previous_log_number; - MarkFileNumberUsed(previous_log_number); - } - if (have_log_number) { - MarkFileNumberUsed(log_number); - } - column_family_set_->UpdateMaxColumnFamily(max_column_family); - MarkMinLogNumberToKeep2PC(min_log_number_to_keep); + } + if (!s.ok()) { + // Clear the buffer if we fail to decode/apply an edit. + read_buffer_.Clear(); } // It's possible that: // 1) s.IsCorruption(), indicating the current MANIFEST is corrupted. @@ -5418,8 +5716,37 @@ Status ReactiveVersionSet::ReadAndApply( // find the next MANIFEST, we should exit the loop. 
s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); reader = manifest_reader->get(); - if (s.ok() && reader->file()->file_name() == old_manifest_path) { - break; + if (s.ok()) { + if (reader->file()->file_name() == old_manifest_path) { + // Still processing the same MANIFEST, thus no need to continue this + // loop since no record is available if we have reached here. + break; + } else { + // We have switched to a new MANIFEST whose first records have been + // generated by VersionSet::WriteCurrentStatetoManifest. Since the + // secondary instance has already finished recovering upon start, there + // is no need for the secondary to process these records. Actually, if + // the secondary were to replay these records, the secondary may end up + // adding the same SST files AGAIN to each column family, causing + // consistency checks done by VersionBuilder to fail. Therefore, we + // record the number of records to skip at the beginning of the new + // MANIFEST and ignore them. + number_of_edits_to_skip_ = 0; + for (auto* cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + // Increase number_of_edits_to_skip by 2 because + // WriteCurrentStatetoManifest() writes 2 version edits for each + // column family at the beginning of the newly-generated MANIFEST. + // TODO(yanqin) remove hard-coded value. + if (db_options_->write_dbid_to_manifest) { + number_of_edits_to_skip_ += 3; + } else { + number_of_edits_to_skip_ += 2; + } + } + } } } @@ -5437,52 +5764,112 @@ Status ReactiveVersionSet::ReadAndApply( } } } + TEST_SYNC_POINT_CALLBACK("ReactiveVersionSet::ReadAndApply:AppliedEdits", + &applied_edits); return s; } Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( - VersionEdit& edit, bool* have_log_number, uint64_t* log_number, - bool* have_prev_log_number, uint64_t* previous_log_number, - bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, - SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, - uint32_t* max_column_family) { - ColumnFamilyData* cfd = nullptr; - Status status; + VersionEdit& edit, std::unordered_set* cfds_changed, + VersionEdit* version_edit) { + ColumnFamilyData* cfd = + column_family_set_->GetColumnFamily(edit.column_family_); + + // If we cannot find this column family in our column family set, then it + // may be a new column family created by the primary after the secondary + // starts. It is also possible that the secondary instance opens only a subset + // of column families. Ignore it for now. + if (nullptr == cfd) { + return Status::OK(); + } + if (active_version_builders_.find(edit.column_family_) == + active_version_builders_.end() && + !cfd->IsDropped()) { + std::unique_ptr builder_guard( + new BaseReferencedVersionBuilder(cfd)); + active_version_builders_.insert( + std::make_pair(edit.column_family_, std::move(builder_guard))); + } + + auto builder_iter = active_version_builders_.find(edit.column_family_); + assert(builder_iter != active_version_builders_.end()); + auto builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + if (edit.is_column_family_add_) { // TODO (yanqin) for now the secondary ignores column families created // after Open. This also simplifies handling of switching to a new MANIFEST // and processing the snapshot of the system at the beginning of the // MANIFEST. - return Status::OK(); } else if (edit.is_column_family_drop_) { - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // Drop a CF created by primary after secondary starts? 
Then ignore - if (cfd == nullptr) { - return Status::OK(); - } // Drop the column family by setting it to be 'dropped' without destroying // the column family handle. + // TODO (haoyu) figure out how to handle column faimly drop for + // secondary instance. (Is it possible that the ref count for cfd is 0 but + // the ref count for its versions is higher than 0?) cfd->SetDropped(); if (cfd->Unref()) { delete cfd; cfd = nullptr; } + active_version_builders_.erase(builder_iter); } else { - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // Operation on a CF created after Open? Then ignore - if (cfd == nullptr) { - return Status::OK(); + Status s = builder->Apply(&edit); + if (!s.ok()) { + return s; } - auto builder_iter = active_version_builders_.find(edit.column_family_); - assert(builder_iter != active_version_builders_.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - builder->Apply(&edit); } - return ExtractInfoFromVersionEdit( - cfd, edit, have_log_number, log_number, have_prev_log_number, - previous_log_number, have_next_file, next_file, have_last_sequence, - last_sequence, min_log_number_to_keep, max_column_family); + Status s = ExtractInfoFromVersionEdit(cfd, edit, version_edit); + if (!s.ok()) { + return s; + } + + if (cfd != nullptr && !cfd->IsDropped()) { + s = builder->LoadTableHandlers( + cfd->internal_stats(), db_options_->max_file_opening_threads, + false /* prefetch_index_and_filter_in_cache */, + false /* is_initial_load */, + cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); + TEST_SYNC_POINT_CALLBACK( + "ReactiveVersionSet::ApplyOneVersionEditToBuilder:" + "AfterLoadTableHandlers", + &s); + + if (s.ok()) { + auto version = new Version(cfd, this, env_options_, + *cfd->GetLatestMutableCFOptions(), + current_version_number_++); + builder->SaveTo(version->storage_info()); + version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); + AppendVersion(cfd, version); + active_version_builders_.erase(builder_iter); + if (cfds_changed->count(cfd) == 0) { + cfds_changed->insert(cfd); + } + } else if (s.IsPathNotFound()) { + s = Status::OK(); + } + // Some other error has occurred during LoadTableHandlers. 
+ } + + if (version_edit->has_next_file_number()) { + next_file_number_.store(version_edit->next_file_number_ + 1); + } + if (version_edit->has_last_sequence_) { + last_allocated_sequence_ = version_edit->last_sequence_; + last_published_sequence_ = version_edit->last_sequence_; + last_sequence_ = version_edit->last_sequence_; + } + if (version_edit->has_prev_log_number_) { + prev_log_number_ = version_edit->prev_log_number_; + MarkFileNumberUsed(version_edit->prev_log_number_); + } + if (version_edit->has_log_number_) { + MarkFileNumberUsed(version_edit->log_number_); + } + column_family_set_->UpdateMaxColumnFamily(version_edit->max_column_family_); + MarkMinLogNumberToKeep2PC(version_edit->min_log_number_to_keep_); + return s; } Status ReactiveVersionSet::MaybeSwitchManifest( @@ -5492,7 +5879,8 @@ Status ReactiveVersionSet::MaybeSwitchManifest( Status s; do { std::string manifest_path; - s = GetCurrentManifestPath(&manifest_path); + s = GetCurrentManifestPath(dbname_, env_, &manifest_path, + &manifest_file_number_); std::unique_ptr manifest_file; if (s.ok()) { if (nullptr == manifest_reader->get() || @@ -5514,7 +5902,8 @@ Status ReactiveVersionSet::MaybeSwitchManifest( std::unique_ptr manifest_file_reader; if (s.ok()) { manifest_file_reader.reset( - new SequentialFileReader(std::move(manifest_file), manifest_path)); + new SequentialFileReader(std::move(manifest_file), manifest_path, + db_options_->log_readahead_size)); manifest_reader->reset(new log::FragmentBufferedReader( nullptr, std::move(manifest_file_reader), reporter, true /* checksum */, 0 /* log_number */)); @@ -5523,8 +5912,8 @@ Status ReactiveVersionSet::MaybeSwitchManifest( // TODO (yanqin) every time we switch to a new MANIFEST, we clear the // active_version_builders_ map because we choose to construct the // versions from scratch, thanks to the first part of each MANIFEST - // written by VersionSet::WriteSnapshot. This is not necessary, but we - // choose this at present for the sake of simplicity. + // written by VersionSet::WriteCurrentStatetoManifest. This is not + // necessary, but we choose this at present for the sake of simplicity. active_version_builders_.clear(); } } while (s.IsPathNotFound()); diff --git a/db/version_set.h b/db/version_set.h index d82c5b47291..758bd5e5d32 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -29,8 +29,8 @@ #include #include "db/column_family.h" -#include "db/compaction.h" -#include "db/compaction_picker.h" +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_picker.h" #include "db/dbformat.h" #include "db/file_indexer.h" #include "db/log_reader.h" @@ -46,6 +46,7 @@ #include "rocksdb/env.h" #include "table/get_context.h" #include "table/multiget_context.h" +#include "trace_replay/block_cache_tracer.h" namespace rocksdb { @@ -62,9 +63,16 @@ class VersionSet; class WriteBufferManager; class MergeContext; class ColumnFamilySet; -class TableCache; class MergeIteratorBuilder; +// VersionEdit is always supposed to be valid and it is used to point at +// entries in Manifest. Ideally it should not be used as a container to +// carry around few of its fields as function params because it can cause +// readers to think it's a valid entry from Manifest. To avoid that confusion +// introducing VersionEditParams to simply carry around multiple VersionEdit +// params. It need not point to a valid record in Manifest. +using VersionEditParams = VersionEdit; + // Return the smallest index i such that file_level.files[i]->largest >= key. 
// Return file_level.num_files if there is no such file. // REQUIRES: "file_level.files" contains a sorted list of @@ -91,6 +99,9 @@ extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, const std::vector& files, Arena* arena); +// Information of the storage associated with each Version, including number of +// levels of LSM tree, files information at each level, files marked for +// compaction, etc. class VersionStorageInfo { public: VersionStorageInfo(const InternalKeyComparator* internal_comparator, @@ -98,6 +109,9 @@ class VersionStorageInfo { CompactionStyle compaction_style, VersionStorageInfo* src_vstorage, bool _force_consistency_checks); + // No copying allowed + VersionStorageInfo(const VersionStorageInfo&) = delete; + void operator=(const VersionStorageInfo&) = delete; ~VersionStorageInfo(); void Reserve(int level, size_t size) { files_[level].reserve(size); } @@ -298,6 +312,10 @@ class VersionStorageInfo { return files_marked_for_periodic_compaction_; } + void TEST_AddFileMarkedForPeriodicCompaction(int level, FileMetaData* f) { + files_marked_for_periodic_compaction_.emplace_back(level, f); + } + // REQUIRES: This version has been saved (see VersionSet::SaveTo) // REQUIRES: DB mutex held during access const autovector>& @@ -531,12 +549,11 @@ class VersionStorageInfo { friend class Version; friend class VersionSet; - // No copying allowed - VersionStorageInfo(const VersionStorageInfo&) = delete; - void operator=(const VersionStorageInfo&) = delete; }; using MultiGetRange = MultiGetContext::Range; +// A column family's version consists of the SST files owned by the column +// family at a certain point in time. class Version { public: // Append to *iters a sequence of iterators that will @@ -555,28 +572,33 @@ class Version { const Slice& largest_user_key, int level, bool* overlap); - // Lookup the value for key. If found, store it in *val and - // return OK. Else return a non-OK status. - // Uses *operands to store merge_operator operations to apply later. + // Lookup the value for key or get all merge operands for key. + // If do_merge = true (default) then lookup value for key. + // Behavior if do_merge = true: + // If found, store it in *value and + // return OK. Else return a non-OK status. + // Uses *operands to store merge_operator operations to apply later. // - // If the ReadOptions.read_tier is set to do a read-only fetch, then - // *value_found will be set to false if it cannot be determined whether - // this value exists without doing IO. + // If the ReadOptions.read_tier is set to do a read-only fetch, then + // *value_found will be set to false if it cannot be determined whether + // this value exists without doing IO. // - // If the key is Deleted, *status will be set to NotFound and + // If the key is Deleted, *status will be set to NotFound and // *key_exists will be set to true. - // If no key was found, *status will be set to NotFound and + // If no key was found, *status will be set to NotFound and // *key_exists will be set to false. - // If seq is non-null, *seq will be set to the sequence number found - // for the key if a key was found. - // + // If seq is non-null, *seq will be set to the sequence number found + // for the key if a key was found. 
+ // Behavior if do_merge = false + // If the key has any merge operands then store them in + // merge_context.operands_list and don't merge the operands // REQUIRES: lock is not held void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value, Status* status, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, bool* value_found = nullptr, bool* key_exists = nullptr, SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr, - bool* is_blob = nullptr); + bool* is_blob = nullptr, bool do_merge = true); void MultiGet(const ReadOptions&, MultiGetRange* range, ReadCallback* callback = nullptr, bool* is_blob = nullptr); @@ -620,6 +642,11 @@ class Version { Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n, TablePropertiesCollection* props) const; + // Print summary of range delete tombstones in SST files into out_str, + // with maximum max_entries_to_print entries printed out. + Status TablesRangeTombstoneSummary(int max_entries_to_print, + std::string* out_str); + // REQUIRES: lock is held // On success, "tp" will contains the aggregated table property among // the table properties of all sst files in this version. @@ -649,7 +676,11 @@ class Version { uint64_t GetSstFilesSize(); - MutableCFOptions GetMutableCFOptions() { return mutable_cf_options_; } + // Retrieves the file_creation_time of the oldest file in the DB. + // Prerequisite for this API is max_open_files = -1 + void GetCreationTimeOfOldestFile(uint64_t* creation_time); + + const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; } private: Env* env_; @@ -711,8 +742,8 @@ class Version { ~Version(); // No copying allowed - Version(const Version&); - void operator=(const Version&); + Version(const Version&) = delete; + void operator=(const Version&) = delete; }; struct ObsoleteFileInfo { @@ -747,12 +778,37 @@ struct ObsoleteFileInfo { class BaseReferencedVersionBuilder; +class AtomicGroupReadBuffer { + public: + Status AddEdit(VersionEdit* edit); + void Clear(); + bool IsFull() const; + bool IsEmpty() const; + + uint64_t TEST_read_edits_in_atomic_group() const { + return read_edits_in_atomic_group_; + } + std::vector& replay_buffer() { return replay_buffer_; } + + private: + uint64_t read_edits_in_atomic_group_ = 0; + std::vector replay_buffer_; +}; + +// VersionSet is the collection of versions of all the column families of the +// database. Each database owns one VersionSet. A VersionSet has access to all +// column families via ColumnFamilySet, i.e. set of the column families. class VersionSet { public: VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options, const EnvOptions& env_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller); + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer); + // No copying allowed + VersionSet(const VersionSet&) = delete; + void operator=(const VersionSet&) = delete; + virtual ~VersionSet(); // Apply *edit to the current version to form a new descriptor that @@ -807,13 +863,15 @@ class VersionSet { bool new_descriptor_log = false, const ColumnFamilyOptions* new_cf_options = nullptr); - Status GetCurrentManifestPath(std::string* manifest_filename); + static Status GetCurrentManifestPath(const std::string& dbname, Env* env, + std::string* manifest_filename, + uint64_t* manifest_file_number); // Recover the last saved descriptor from persistent storage. 
// If read_only == true, Recover() will not complain if some column families // are not opened Status Recover(const std::vector& column_families, - bool read_only = false); + bool read_only = false, std::string* db_id = nullptr); // Reads a manifest file and returns a list of column families in // column_families. @@ -952,10 +1010,12 @@ class VersionSet { void AddLiveFiles(std::vector* live_list); // Return the approximate size of data to be scanned for range [start, end) - // in levels [start_level, end_level). If end_level == 0 it will search + // in levels [start_level, end_level). If end_level == -1 it will search // through all non-empty levels - uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end, - int start_level = 0, int end_level = -1); + uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v, + const Slice& start, const Slice& end, + int start_level, int end_level, + TableReaderCaller caller); // Return the size of the current manifest file uint64_t manifest_file_size() const { return manifest_file_size_; } @@ -1003,21 +1063,33 @@ class VersionSet { } }; - // ApproximateSize helper - uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief, - const Slice& start, const Slice& end); + // Returns approximated offset of a key in a file for a given version. + uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, + const Slice& key, TableReaderCaller caller); + // Returns approximated data size between start and end keys in a file + // for a given version. uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f, - const Slice& key); + const Slice& start, const Slice& end, + TableReaderCaller caller); // Save current contents to *log - Status WriteSnapshot(log::Writer* log); + Status WriteCurrentStateToManifest(log::Writer* log); void AppendVersion(ColumnFamilyData* column_family_data, Version* v); ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, VersionEdit* edit); + Status ReadAndRecover( + log::Reader* reader, AtomicGroupReadBuffer* read_buffer, + const std::unordered_map& + name_to_options, + std::unordered_map& column_families_not_found, + std::unordered_map< + uint32_t, std::unique_ptr>& builders, + VersionEditParams* version_edit, std::string* db_id = nullptr); + // REQUIRES db mutex Status ApplyOneVersionEditToBuilder( VersionEdit& edit, @@ -1025,22 +1097,17 @@ class VersionSet { std::unordered_map& column_families_not_found, std::unordered_map< uint32_t, std::unique_ptr>& builders, - bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number, - uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, - bool* have_last_sequence, SequenceNumber* last_sequence, - uint64_t* min_log_number_to_keep, uint32_t* max_column_family); - - Status ExtractInfoFromVersionEdit( - ColumnFamilyData* cfd, const VersionEdit& edit, bool* have_log_number, - uint64_t* log_number, bool* have_prev_log_number, - uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, - bool* have_last_sequence, SequenceNumber* last_sequence, - uint64_t* min_log_number_to_keep, uint32_t* max_column_family); + VersionEditParams* version_edit); + + Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, + const VersionEdit& from_edit, + VersionEditParams* version_edit_params); std::unique_ptr column_family_set_; Env* const env_; const std::string dbname_; + std::string db_id_; const ImmutableDBOptions* const db_options_; std::atomic next_file_number_; // Any log 
number equal or lower than this should be ignored during recovery, @@ -1085,11 +1152,9 @@ class VersionSet { // env options for all reads and writes except compactions EnvOptions env_options_; - private: - // No copying allowed - VersionSet(const VersionSet&); - void operator=(const VersionSet&); + BlockCacheTracer* const block_cache_tracer_; + private: // REQUIRES db mutex at beginning. may release and re-acquire db mutex Status ProcessManifestWrites(std::deque& writers, InstrumentedMutex* mu, Directory* db_directory, @@ -1097,10 +1162,14 @@ class VersionSet { const ColumnFamilyOptions* new_cf_options); void LogAndApplyCFHelper(VersionEdit* edit); - void LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, - VersionEdit* edit, InstrumentedMutex* mu); + Status LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, + VersionEdit* edit, InstrumentedMutex* mu); }; +// ReactiveVersionSet represents a collection of versions of the column +// families of the database. Users of ReactiveVersionSet, e.g. DBImplSecondary, +// need to replay the MANIFEST (description log in older terms) in order to +// reconstruct and install versions. class ReactiveVersionSet : public VersionSet { public: ReactiveVersionSet(const std::string& dbname, @@ -1121,16 +1190,20 @@ class ReactiveVersionSet : public VersionSet { std::unique_ptr* manifest_reporter, std::unique_ptr* manifest_reader_status); + uint64_t TEST_read_edits_in_atomic_group() const { + return read_buffer_.TEST_read_edits_in_atomic_group(); + } + std::vector& replay_buffer() { + return read_buffer_.replay_buffer(); + } + protected: using VersionSet::ApplyOneVersionEditToBuilder; // REQUIRES db mutex Status ApplyOneVersionEditToBuilder( - VersionEdit& edit, bool* have_log_number, uint64_t* log_number, - bool* have_prev_log_number, uint64_t* previous_log_number, - bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, - SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, - uint32_t* max_column_family); + VersionEdit& edit, std::unordered_set* cfds_changed, + VersionEdit* version_edit); Status MaybeSwitchManifest( log::Reader::Reporter* reporter, @@ -1139,6 +1212,10 @@ class ReactiveVersionSet : public VersionSet { private: std::unordered_map> active_version_builders_; + AtomicGroupReadBuffer read_buffer_; + // Number of version edits to skip by ReadAndApply at the beginning of a new + // MANIFEST created by primary. + int number_of_edits_to_skip_; using VersionSet::LogAndApply; using VersionSet::Recover; diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 43924a3addd..66ad930f583 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -8,12 +8,13 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/version_set.h" +#include "db/db_impl/db_impl.h" #include "db/log_writer.h" +#include "logging/logging.h" #include "table/mock_table.h" -#include "util/logging.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -34,10 +35,12 @@ class GenerateLevelFilesBriefTest : public testing::Test { void Add(const char* smallest, const char* largest, SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100) { - FileMetaData* f = new FileMetaData; - f->fd = FileDescriptor(files_.size() + 1, 0, 0); - f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); - f->largest = InternalKey(largest, largest_seq, kTypeValue); + FileMetaData* f = new FileMetaData( + files_.size() + 1, 0, 0, + InternalKey(smallest, smallest_seq, kTypeValue), + InternalKey(largest, largest_seq, kTypeValue), smallest_seq, + largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); files_.push_back(f); } @@ -128,28 +131,24 @@ class VersionStorageInfoTest : public testing::Test { void Add(int level, uint32_t file_number, const char* smallest, const char* largest, uint64_t file_size = 0) { assert(level < vstorage_.num_levels()); - FileMetaData* f = new FileMetaData; - f->fd = FileDescriptor(file_number, 0, file_size); - f->smallest = GetInternalKey(smallest, 0); - f->largest = GetInternalKey(largest, 0); + FileMetaData* f = new FileMetaData( + file_number, 0, file_size, GetInternalKey(smallest, 0), + GetInternalKey(largest, 0), /* smallest_seq */ 0, /* largest_seq */ 0, + /* marked_for_compact */ false, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); f->compensated_file_size = file_size; - f->refs = 0; - f->num_entries = 0; - f->num_deletions = 0; vstorage_.AddFile(level, f); } void Add(int level, uint32_t file_number, const InternalKey& smallest, const InternalKey& largest, uint64_t file_size = 0) { assert(level < vstorage_.num_levels()); - FileMetaData* f = new FileMetaData; - f->fd = FileDescriptor(file_number, 0, file_size); - f->smallest = smallest; - f->largest = largest; + FileMetaData* f = new FileMetaData( + file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0, + /* largest_seq */ 0, /* marked_for_compact */ false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); f->compensated_file_size = file_size; - f->refs = 0; - f->num_entries = 0; - f->num_deletions = 0; vstorage_.AddFile(level, f); } @@ -607,6 +606,7 @@ class VersionSetTestBase { const static std::string kColumnFamilyName1; const static std::string kColumnFamilyName2; const static std::string kColumnFamilyName3; + int num_initial_edits_; VersionSetTestBase() : env_(Env::Default()), @@ -617,7 +617,11 @@ class VersionSetTestBase { write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)), + &write_controller_, + /*block_cache_tracer=*/nullptr)), + reactive_versions_(std::make_shared( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_)), shutting_down_(false), mock_table_factory_(std::make_shared()) { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); @@ -632,6 +636,12 @@ class VersionSetTestBase { assert(last_seqno != nullptr); assert(log_writer != nullptr); VersionEdit new_db; + if 
(db_options_.write_dbid_to_manifest) { + DBImpl* impl = new DBImpl(DBOptions(), dbname_); + std::string db_id; + impl->GetDbIdentityFromIdentityFile(&db_id); + new_db.SetDBId(db_id); + } new_db.SetLogNumber(0); new_db.SetNextFile(2); new_db.SetLastSequence(0); @@ -653,7 +663,7 @@ class VersionSetTestBase { new_cfs.emplace_back(new_cf); } *last_seqno = last_seq; - + num_initial_edits_ = static_cast(new_cfs.size() + 1); const std::string manifest = DescriptorFileName(dbname_, 1); std::unique_ptr file; Status s = env_->NewWritableFile( @@ -686,7 +696,7 @@ class VersionSetTestBase { std::vector column_families; SequenceNumber last_seqno; std::unique_ptr log_writer; - + SetIdentityFile(env_, dbname_); PrepareManifest(&column_families, &last_seqno, &log_writer); log_writer.reset(); // Make "CURRENT" file point to the new manifest file. @@ -708,6 +718,7 @@ class VersionSetTestBase { WriteController write_controller_; WriteBufferManager write_buffer_manager_; std::shared_ptr versions_; + std::shared_ptr reactive_versions_; InstrumentedMutex mutex_; std::atomic shutting_down_; std::shared_ptr mock_table_factory_; @@ -746,7 +757,7 @@ TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { SyncPoint::GetInstance()->SetCallBack( "VersionSet::ProcessManifestWrites:SameColumnFamily", [&](void* arg) { uint32_t* cf_id = reinterpret_cast(arg); - EXPECT_EQ(0, *cf_id); + EXPECT_EQ(0u, *cf_id); ++count; }); SyncPoint::GetInstance()->EnableProcessing(); @@ -758,216 +769,388 @@ TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { EXPECT_EQ(kGroupSize - 1, count); } -TEST_F(VersionSetTest, HandleValidAtomicGroup) { - std::vector column_families; - SequenceNumber last_seqno; - std::unique_ptr log_writer; - PrepareManifest(&column_families, &last_seqno, &log_writer); +class VersionSetAtomicGroupTest : public VersionSetTestBase, + public testing::Test { + public: + VersionSetAtomicGroupTest() : VersionSetTestBase() {} - // Append multiple version edits that form an atomic group - const int kAtomicGroupSize = 3; - std::vector edits(kAtomicGroupSize); - int remaining = kAtomicGroupSize; - for (size_t i = 0; i != edits.size(); ++i) { - edits[i].SetLogNumber(0); - edits[i].SetNextFile(2); - edits[i].MarkAtomicGroup(--remaining); - edits[i].SetLastSequence(last_seqno++); - } - Status s; - for (const auto& edit : edits) { - std::string record; - edit.EncodeTo(&record); - s = log_writer->AddRecord(record); - ASSERT_OK(s); + void SetUp() override { + PrepareManifest(&column_families_, &last_seqno_, &log_writer_); + SetupTestSyncPoints(); } - log_writer.reset(); - s = SetCurrentFile(env_, dbname_, 1, nullptr); - ASSERT_OK(s); - - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); + void SetupValidAtomicGroup(int atomic_group_size) { + edits_.resize(atomic_group_size); + int remaining = atomic_group_size; + for (size_t i = 0; i != edits_.size(); ++i) { + edits_[i].SetLogNumber(0); + edits_[i].SetNextFile(2); + edits_[i].MarkAtomicGroup(--remaining); + edits_[i].SetLastSequence(last_seqno_++); + } + ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + } - bool first_in_atomic_group = false; - bool last_in_atomic_group = false; + void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) { + edits_.resize(atomic_group_size); + int remaining = atomic_group_size; + for (size_t i = 0; i != edits_.size(); ++i) { + edits_[i].SetLogNumber(0); + edits_[i].SetNextFile(2); + edits_[i].MarkAtomicGroup(--remaining); + edits_[i].SetLastSequence(last_seqno_++); + } + 
ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + } - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:FirstInAtomicGroup", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits.front().DebugString(), - e->DebugString()); // compare based on value - first_in_atomic_group = true; - }); - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:LastInAtomicGroup", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits.back().DebugString(), - e->DebugString()); // compare based on value - EXPECT_TRUE(first_in_atomic_group); - last_in_atomic_group = true; - }); - SyncPoint::GetInstance()->EnableProcessing(); + void SetupCorruptedAtomicGroup(int atomic_group_size) { + edits_.resize(atomic_group_size); + int remaining = atomic_group_size; + for (size_t i = 0; i != edits_.size(); ++i) { + edits_[i].SetLogNumber(0); + edits_[i].SetNextFile(2); + if (i != ((size_t)atomic_group_size / 2)) { + edits_[i].MarkAtomicGroup(--remaining); + } + edits_[i].SetLastSequence(last_seqno_++); + } + ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + } - EXPECT_OK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), - versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); - EXPECT_TRUE(first_in_atomic_group); - EXPECT_TRUE(last_in_atomic_group); -} + void SetupIncorrectAtomicGroup(int atomic_group_size) { + edits_.resize(atomic_group_size); + int remaining = atomic_group_size; + for (size_t i = 0; i != edits_.size(); ++i) { + edits_[i].SetLogNumber(0); + edits_[i].SetNextFile(2); + if (i != 1) { + edits_[i].MarkAtomicGroup(--remaining); + } else { + edits_[i].MarkAtomicGroup(remaining--); + } + edits_[i].SetLastSequence(last_seqno_++); + } + ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + } -TEST_F(VersionSetTest, HandleIncompleteTrailingAtomicGroup) { - std::vector column_families; - SequenceNumber last_seqno; - std::unique_ptr log_writer; - PrepareManifest(&column_families, &last_seqno, &log_writer); + void SetupTestSyncPoints() { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", [&](void* arg) { + VersionEdit* e = reinterpret_cast(arg); + EXPECT_EQ(edits_.front().DebugString(), + e->DebugString()); // compare based on value + first_in_atomic_group_ = true; + }); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", [&](void* arg) { + VersionEdit* e = reinterpret_cast(arg); + EXPECT_EQ(edits_.back().DebugString(), + e->DebugString()); // compare based on value + EXPECT_TRUE(first_in_atomic_group_); + last_in_atomic_group_ = true; + }); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ReadAndRecover:RecoveredEdits", [&](void* arg) { + num_recovered_edits_ = *reinterpret_cast(arg); + }); + SyncPoint::GetInstance()->SetCallBack( + "ReactiveVersionSet::ReadAndApply:AppliedEdits", + [&](void* arg) { num_applied_edits_ = *reinterpret_cast(arg); }); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:AtomicGroup", + [&](void* /* arg */) { ++num_edits_in_atomic_group_; }); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", + [&](void* arg) { + corrupted_edit_ = *reinterpret_cast(arg); + }); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", + [&](void* arg) { + 
edit_with_incorrect_group_size_ = + *reinterpret_cast(arg); + }); + SyncPoint::GetInstance()->EnableProcessing(); + } - // Append multiple version edits that form an atomic group - const int kAtomicGroupSize = 4; - const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; - std::vector edits(kNumberOfPersistedVersionEdits); - int remaining = kAtomicGroupSize; - for (size_t i = 0; i != edits.size(); ++i) { - edits[i].SetLogNumber(0); - edits[i].SetNextFile(2); - edits[i].MarkAtomicGroup(--remaining); - edits[i].SetLastSequence(last_seqno++); + void AddNewEditsToLog(int num_edits) { + for (int i = 0; i < num_edits; i++) { + std::string record; + edits_[i].EncodeTo(&record); + ASSERT_OK(log_writer_->AddRecord(record)); + } } - Status s; - for (const auto& edit : edits) { - std::string record; - edit.EncodeTo(&record); - s = log_writer->AddRecord(record); - ASSERT_OK(s); + + void TearDown() override { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + log_writer_.reset(); } - log_writer.reset(); - s = SetCurrentFile(env_, dbname_, 1, nullptr); - ASSERT_OK(s); + protected: + std::vector column_families_; + SequenceNumber last_seqno_; + std::vector edits_; + bool first_in_atomic_group_ = false; + bool last_in_atomic_group_ = false; + int num_edits_in_atomic_group_ = 0; + int num_recovered_edits_ = 0; + int num_applied_edits_ = 0; + VersionEdit corrupted_edit_; + VersionEdit edit_with_incorrect_group_size_; + std::unique_ptr log_writer_; +}; - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); +TEST_F(VersionSetAtomicGroupTest, HandleValidAtomicGroupWithVersionSetRecover) { + const int kAtomicGroupSize = 3; + SetupValidAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + EXPECT_OK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_TRUE(last_in_atomic_group_); + EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); +} - bool first_in_atomic_group = false; - bool last_in_atomic_group = false; - size_t num = 0; +TEST_F(VersionSetAtomicGroupTest, + HandleValidAtomicGroupWithReactiveVersionSetRecover) { + const int kAtomicGroupSize = 3; + SetupValidAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_TRUE(last_in_atomic_group_); + // The recover should clean up the replay buffer. 
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); + EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); +} - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:FirstInAtomicGroup", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits.front().DebugString(), - e->DebugString()); // compare based on value - first_in_atomic_group = true; - }); - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:LastInAtomicGroup", - [&](void* /* arg */) { last_in_atomic_group = true; }); - SyncPoint::GetInstance()->SetCallBack("VersionSet::Recover:AtomicGroup", - [&](void* /* arg */) { ++num; }); - SyncPoint::GetInstance()->EnableProcessing(); +TEST_F(VersionSetAtomicGroupTest, + HandleValidAtomicGroupWithReactiveVersionSetReadAndApply) { + const int kAtomicGroupSize = 3; + SetupValidAtomicGroup(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + AddNewEditsToLog(kAtomicGroupSize); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_TRUE(last_in_atomic_group_); + // The recover should clean up the replay buffer. + EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); +} - EXPECT_OK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), +TEST_F(VersionSetAtomicGroupTest, + HandleIncompleteTrailingAtomicGroupWithVersionSetRecover) { + const int kAtomicGroupSize = 4; + const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; + SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kNumberOfPersistedVersionEdits); + EXPECT_OK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); - EXPECT_TRUE(first_in_atomic_group); - EXPECT_FALSE(last_in_atomic_group); - EXPECT_EQ(kNumberOfPersistedVersionEdits, num); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_FALSE(last_in_atomic_group_); + EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); } -TEST_F(VersionSetTest, HandleCorruptedAtomicGroup) { - std::vector column_families; - SequenceNumber last_seqno; - std::unique_ptr log_writer; - PrepareManifest(&column_families, &last_seqno, &log_writer); - - // Append multiple version edits that form an atomic group +TEST_F(VersionSetAtomicGroupTest, + HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetRecover) { const int kAtomicGroupSize = 4; - std::vector edits(kAtomicGroupSize); - int remaining = kAtomicGroupSize; - for (size_t i = 0; i != edits.size(); ++i) { - edits[i].SetLogNumber(0); - edits[i].SetNextFile(2); - if (i != (kAtomicGroupSize / 2)) { - edits[i].MarkAtomicGroup(--remaining); - } - edits[i].SetLastSequence(last_seqno++); - } - Status s; - for (const auto& edit : edits) { - std::string record; - 
edit.EncodeTo(&record); - s = log_writer->AddRecord(record); - ASSERT_OK(s); - } - log_writer.reset(); - - s = SetCurrentFile(env_, dbname_, 1, nullptr); - ASSERT_OK(s); + const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; + SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kNumberOfPersistedVersionEdits); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_FALSE(last_in_atomic_group_); + EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); + // Reactive version set should store the edits in the replay buffer. + EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == + kNumberOfPersistedVersionEdits); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize); + // Write the last record. The reactive version set should now apply all + // edits. + std::string last_record; + edits_[kAtomicGroupSize - 1].EncodeTo(&last_record); + EXPECT_OK(log_writer_->AddRecord(last_record)); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + // Reactive version set should be empty now. + EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); +} - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); +TEST_F(VersionSetAtomicGroupTest, + HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetReadAndApply) { + const int kAtomicGroupSize = 4; + const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; + SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + // No edits in an atomic group. + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + // Write a few edits in an atomic group. + AddNewEditsToLog(kNumberOfPersistedVersionEdits); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_FALSE(last_in_atomic_group_); + EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); + // Reactive version set should store the edits in the replay buffer. 
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == + kNumberOfPersistedVersionEdits); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); +} - bool mixed = false; - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:AtomicGroupMixedWithNormalEdits", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits[kAtomicGroupSize / 2].DebugString(), e->DebugString()); - mixed = true; - }); - SyncPoint::GetInstance()->EnableProcessing(); - EXPECT_NOK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), +TEST_F(VersionSetAtomicGroupTest, + HandleCorruptedAtomicGroupWithVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupCorruptedAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + EXPECT_NOK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); - EXPECT_TRUE(mixed); + EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), + corrupted_edit_.DebugString()); } -TEST_F(VersionSetTest, HandleIncorrectAtomicGroupSize) { - std::vector column_families; - SequenceNumber last_seqno; - std::unique_ptr log_writer; - PrepareManifest(&column_families, &last_seqno, &log_writer); +TEST_F(VersionSetAtomicGroupTest, + HandleCorruptedAtomicGroupWithReactiveVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupCorruptedAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), + corrupted_edit_.DebugString()); +} - // Append multiple version edits that form an atomic group +TEST_F(VersionSetAtomicGroupTest, + HandleCorruptedAtomicGroupWithReactiveVersionSetReadAndApply) { const int kAtomicGroupSize = 4; - std::vector edits(kAtomicGroupSize); - int remaining = kAtomicGroupSize; - for (size_t i = 0; i != edits.size(); ++i) { - edits[i].SetLogNumber(0); - edits[i].SetNextFile(2); - if (i != 1) { - edits[i].MarkAtomicGroup(--remaining); - } else { - edits[i].MarkAtomicGroup(remaining--); - } - edits[i].SetLastSequence(last_seqno++); - } - Status s; - for (const auto& edit : edits) { - std::string record; - edit.EncodeTo(&record); - s = log_writer->AddRecord(record); - ASSERT_OK(s); - } - log_writer.reset(); + SetupCorruptedAtomicGroup(kAtomicGroupSize); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + // Write the corrupted edits. 
+ AddNewEditsToLog(kAtomicGroupSize); + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), + corrupted_edit_.DebugString()); +} - s = SetCurrentFile(env_, dbname_, 1, nullptr); - ASSERT_OK(s); +TEST_F(VersionSetAtomicGroupTest, + HandleIncorrectAtomicGroupSizeWithVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupIncorrectAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + EXPECT_NOK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(edits_[1].DebugString(), + edit_with_incorrect_group_size_.DebugString()); +} - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); +TEST_F(VersionSetAtomicGroupTest, + HandleIncorrectAtomicGroupSizeWithReactiveVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupIncorrectAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(edits_[1].DebugString(), + edit_with_incorrect_group_size_.DebugString()); +} - bool incorrect_group_size = false; - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:IncorrectAtomicGroupSize", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits[1].DebugString(), e->DebugString()); - incorrect_group_size = true; - }); - SyncPoint::GetInstance()->EnableProcessing(); - EXPECT_NOK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), - versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); - EXPECT_TRUE(incorrect_group_size); +TEST_F(VersionSetAtomicGroupTest, + HandleIncorrectAtomicGroupSizeWithReactiveVersionSetReadAndApply) { + const int kAtomicGroupSize = 4; + SetupIncorrectAtomicGroup(kAtomicGroupSize); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + AddNewEditsToLog(kAtomicGroupSize); + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_EQ(edits_[1].DebugString(), + edit_with_incorrect_group_size_.DebugString()); } class VersionSetTestDropOneCF : public VersionSetTestBase, @@ -1088,7 +1271,6 @@ INSTANTIATE_TEST_CASE_P( testing::Values(VersionSetTestBase::kColumnFamilyName1, VersionSetTestBase::kColumnFamilyName2, VersionSetTestBase::kColumnFamilyName3)); - } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 62511819e4d..d3f59a10e3f 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -9,32 +9,28 @@ #include "db/wal_manager.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include -#include +#include #include +#include #include "db/log_reader.h" #include "db/log_writer.h" #include "db/transaction_log_impl.h" #include "db/write_batch_internal.h" +#include "file/file_util.h" 
+#include "file/filename.h" +#include "file/sequence_file_reader.h" +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/write_batch.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/coding.h" -#include "util/file_reader_writer.h" -#include "util/file_util.h" -#include "util/filename.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { @@ -191,7 +187,8 @@ void WalManager::PurgeObsoleteWALFiles() { continue; } if (now_seconds - file_m_time > db_options_.wal_ttl_seconds) { - s = DeleteDBFile(&db_options_, file_path, archival_dir, false); + s = DeleteDBFile(&db_options_, file_path, archival_dir, false, + /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); @@ -217,7 +214,8 @@ void WalManager::PurgeObsoleteWALFiles() { log_file_size = std::max(log_file_size, file_size); ++log_files_num; } else { - s = DeleteDBFile(&db_options_, file_path, archival_dir, false); + s = DeleteDBFile(&db_options_, file_path, archival_dir, false, + /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s", file_path.c_str(), @@ -257,7 +255,8 @@ void WalManager::PurgeObsoleteWALFiles() { for (size_t i = 0; i < files_del_num; ++i) { std::string const file_path = archived_logs[i]->PathName(); s = DeleteDBFile(&db_options_, db_options_.wal_dir + "/" + file_path, - db_options_.wal_dir, false); + db_options_.wal_dir, false, + /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); @@ -281,17 +280,6 @@ void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) { s.ToString().c_str()); } -namespace { -struct CompareLogByPointer { - bool operator()(const std::unique_ptr& a, - const std::unique_ptr& b) { - LogFileImpl* a_impl = static_cast_with_check(a.get()); - LogFileImpl* b_impl = static_cast_with_check(b.get()); - return *a_impl < *b_impl; - } -}; -} - Status WalManager::GetSortedWalsOfType(const std::string& path, VectorLogPtr& log_files, WalFileType log_type) { @@ -324,14 +312,15 @@ Status WalManager::GetSortedWalsOfType(const std::string& path, uint64_t size_bytes; s = env_->GetFileSize(LogFileName(path, number), &size_bytes); // re-try in case the alive log file has been moved to archive. - std::string archived_file = ArchivedLogFileName(path, number); - if (!s.ok() && log_type == kAliveLogFile && - env_->FileExists(archived_file).ok()) { - s = env_->GetFileSize(archived_file, &size_bytes); - if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) { - // oops, the file just got deleted from archived dir! move on - s = Status::OK(); - continue; + if (!s.ok() && log_type == kAliveLogFile) { + std::string archived_file = ArchivedLogFileName(path, number); + if (env_->FileExists(archived_file).ok()) { + s = env_->GetFileSize(archived_file, &size_bytes); + if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) { + // oops, the file just got deleted from archived dir! 
move on + s = Status::OK(); + continue; + } } } if (!s.ok()) { @@ -342,8 +331,15 @@ Status WalManager::GetSortedWalsOfType(const std::string& path, new LogFileImpl(number, log_type, sequence, size_bytes))); } } - CompareLogByPointer compare_log_files; - std::sort(log_files.begin(), log_files.end(), compare_log_files); + std::sort( + log_files.begin(), log_files.end(), + [](const std::unique_ptr& a, const std::unique_ptr& b) { + LogFileImpl* a_impl = + static_cast_with_check(a.get()); + LogFileImpl* b_impl = + static_cast_with_check(b.get()); + return *a_impl < *b_impl; + }); return status; } @@ -393,7 +389,7 @@ Status WalManager::ReadFirstRecord(const WalFileType type, if (type == kAliveLogFile) { std::string fname = LogFileName(db_options_.wal_dir, number); s = ReadFirstLine(fname, number, sequence); - if (env_->FileExists(fname).ok() && !s.ok()) { + if (!s.ok() && env_->FileExists(fname).ok()) { // return any error that is not caused by non-existing file return s; } @@ -419,6 +415,32 @@ Status WalManager::ReadFirstRecord(const WalFileType type, return s; } +Status WalManager::GetLiveWalFile(uint64_t number, + std::unique_ptr* log_file) { + if (!log_file) { + return Status::InvalidArgument("log_file not preallocated."); + } + + if (!number) { + return Status::PathNotFound("log file not available"); + } + + Status s; + + uint64_t size_bytes; + s = env_->GetFileSize(LogFileName(db_options_.wal_dir, number), &size_bytes); + + if (!s.ok()) { + return s; + } + + log_file->reset(new LogFileImpl(number, kAliveLogFile, + 0, // SequenceNumber + size_bytes)); + + return Status::OK(); +} + // the function returns status.ok() and sequence == 0 if the file exists, but is // empty Status WalManager::ReadFirstLine(const std::string& fname, diff --git a/db/wal_manager.h b/db/wal_manager.h index 6caf1640c06..97211f0003a 100644 --- a/db/wal_manager.h +++ b/db/wal_manager.h @@ -18,6 +18,7 @@ #include #include "db/version_set.h" +#include "file/file_util.h" #include "options/db_options.h" #include "port/port.h" #include "rocksdb/env.h" @@ -28,6 +29,10 @@ namespace rocksdb { #ifndef ROCKSDB_LITE + +// WAL manager provides the abstraction for reading the WAL files as a single +// unit. Internally, it opens and reads the files using Reader or Writer +// abstraction. class WalManager { public: WalManager(const ImmutableDBOptions& db_options, @@ -36,10 +41,13 @@ class WalManager { env_options_(env_options), env_(db_options.env), purge_wal_files_last_run_(0), - seq_per_batch_(seq_per_batch) {} + seq_per_batch_(seq_per_batch), + wal_in_db_path_(IsWalDirSameAsDBPath(&db_options)) {} Status GetSortedWalFiles(VectorLogPtr& files); + // Allow user to tail transaction log to find all recent changes to the + // database that are newer than `seq_number`. Status GetUpdatesSince( SequenceNumber seq_number, std::unique_ptr* iter, const TransactionLogIterator::ReadOptions& read_options, @@ -51,6 +59,8 @@ class WalManager { Status DeleteFile(const std::string& fname, uint64_t number); + Status GetLiveWalFile(uint64_t number, std::unique_ptr* log_file); + Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number, SequenceNumber* sequence) { return ReadFirstRecord(type, number, sequence); @@ -91,6 +101,8 @@ class WalManager { bool seq_per_batch_; + bool wal_in_db_path_; + // obsolete files will be deleted every this seconds if ttl deletion is // enabled and archive size_limit is disabled. 
static const uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600; diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index 379f12f52aa..4f15a064d1f 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -13,16 +13,16 @@ #include "rocksdb/write_buffer_manager.h" #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/log_writer.h" #include "db/version_set.h" #include "db/wal_manager.h" #include "env/mock_env.h" +#include "file/writable_file_writer.h" #include "table/mock_table.h" -#include "util/file_reader_writer.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -50,7 +50,8 @@ class WalManagerTest : public testing::Test { versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)); + &write_controller_, + /*block_cache_tracer=*/nullptr)); wal_manager_.reset(new WalManager(db_options_, env_options_)); } @@ -293,6 +294,29 @@ TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) { ASSERT_TRUE(!iter->Valid()); } +TEST_F(WalManagerTest, TransactionLogIteratorNewFileWhileScanning) { + Init(); + CreateArchiveLogs(2, 100); + auto iter = OpenTransactionLogIter(0); + CreateArchiveLogs(1, 100); + int i = 0; + for (; iter->Valid(); iter->Next()) { + i++; + } + ASSERT_EQ(i, 200); + // A new log file was added after the iterator was created. + // TryAgain indicates a new iterator is needed to fetch the new data + ASSERT_TRUE(iter->status().IsTryAgain()); + + iter = OpenTransactionLogIter(0); + i = 0; + for (; iter->Valid(); iter->Next()) { + i++; + } + ASSERT_EQ(i, 300); + ASSERT_TRUE(iter->status().ok()); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/write_batch.cc b/db/write_batch.cc index 939b595305b..350c1a1c072 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -39,20 +39,23 @@ #include #include #include +#include #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/flush_scheduler.h" #include "db/memtable.h" #include "db/merge_context.h" #include "db/snapshot_impl.h" +#include "db/trim_history_scheduler.h" #include "db/write_batch_internal.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/merge_operator.h" #include "util/autovector.h" +#include "util/cast_util.h" #include "util/coding.h" #include "util/duplicate_detector.h" #include "util/string_util.h" @@ -135,6 +138,105 @@ struct BatchContentClassifier : public WriteBatch::Handler { } }; +class TimestampAssigner : public WriteBatch::Handler { + public: + explicit TimestampAssigner(const Slice& ts) + : timestamp_(ts), timestamps_(kEmptyTimestampList) {} + explicit TimestampAssigner(const std::vector& ts_list) + : timestamps_(ts_list) { + SanityCheck(); + } + ~TimestampAssigner() override {} + + Status PutCF(uint32_t, const Slice& key, const Slice&) override { + AssignTimestamp(key); + ++idx_; + return Status::OK(); + } + + Status DeleteCF(uint32_t, const Slice& key) override { + AssignTimestamp(key); + ++idx_; + return Status::OK(); + } + + Status SingleDeleteCF(uint32_t, const Slice& key) override { + AssignTimestamp(key); + ++idx_; + return Status::OK(); + } + + Status DeleteRangeCF(uint32_t, const Slice& begin_key, + const Slice& end_key) override { + AssignTimestamp(begin_key); + 
AssignTimestamp(end_key);
+    ++idx_;
+    return Status::OK();
+  }
+
+  Status MergeCF(uint32_t, const Slice& key, const Slice&) override {
+    AssignTimestamp(key);
+    ++idx_;
+    return Status::OK();
+  }
+
+  Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override {
+    // TODO (yanqin): support blob db in the future.
+    return Status::OK();
+  }
+
+  Status MarkBeginPrepare(bool) override {
+    // TODO (yanqin): support in the future.
+    return Status::OK();
+  }
+
+  Status MarkEndPrepare(const Slice&) override {
+    // TODO (yanqin): support in the future.
+    return Status::OK();
+  }
+
+  Status MarkCommit(const Slice&) override {
+    // TODO (yanqin): support in the future.
+    return Status::OK();
+  }
+
+  Status MarkRollback(const Slice&) override {
+    // TODO (yanqin): support in the future.
+    return Status::OK();
+  }
+
+ private:
+  void SanityCheck() const {
+    assert(!timestamps_.empty());
+#ifndef NDEBUG
+    const size_t ts_sz = timestamps_[0].size();
+    for (size_t i = 1; i != timestamps_.size(); ++i) {
+      assert(ts_sz == timestamps_[i].size());
+    }
+#endif  // !NDEBUG
+  }
+
+  void AssignTimestamp(const Slice& key) {
+    assert(timestamps_.empty() || idx_ < timestamps_.size());
+    const Slice& ts = timestamps_.empty() ? timestamp_ : timestamps_[idx_];
+    size_t ts_sz = ts.size();
+    char* ptr = const_cast<char*>(key.data() + key.size() - ts_sz);
+    memcpy(ptr, ts.data(), ts_sz);
+  }
+
+  static const std::vector<Slice> kEmptyTimestampList;
+  const Slice timestamp_;
+  const std::vector<Slice>& timestamps_;
+  size_t idx_ = 0;
+
+  // No copy or move.
+  TimestampAssigner(const TimestampAssigner&) = delete;
+  TimestampAssigner(TimestampAssigner&&) = delete;
+  TimestampAssigner& operator=(const TimestampAssigner&) = delete;
+  TimestampAssigner&& operator=(TimestampAssigner&&) = delete;
+};
+const std::vector<Slice> TimestampAssigner::kEmptyTimestampList;
+
 }  // anon namespace
 
 struct SavePoints {
@@ -142,7 +244,15 @@
 };
 
 WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes)
-    : content_flags_(0), max_bytes_(max_bytes), rep_() {
+    : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(0) {
+  rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader)
+                   ? reserved_bytes
+                   : WriteBatchInternal::kHeader);
+  rep_.resize(WriteBatchInternal::kHeader);
+}
+
+WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz)
+    : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(ts_sz) {
   rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) ?
reserved_bytes : WriteBatchInternal::kHeader); rep_.resize(WriteBatchInternal::kHeader); @@ -151,18 +261,21 @@ WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes) WriteBatch::WriteBatch(const std::string& rep) : content_flags_(ContentFlags::DEFERRED), max_bytes_(0), - rep_(rep) {} + rep_(rep), + timestamp_size_(0) {} WriteBatch::WriteBatch(std::string&& rep) : content_flags_(ContentFlags::DEFERRED), max_bytes_(0), - rep_(std::move(rep)) {} + rep_(std::move(rep)), + timestamp_size_(0) {} WriteBatch::WriteBatch(const WriteBatch& src) : wal_term_point_(src.wal_term_point_), content_flags_(src.content_flags_.load(std::memory_order_relaxed)), max_bytes_(src.max_bytes_), - rep_(src.rep_) { + rep_(src.rep_), + timestamp_size_(src.timestamp_size_) { if (src.save_points_ != nullptr) { save_points_.reset(new SavePoints()); save_points_->stack = src.save_points_->stack; @@ -174,7 +287,8 @@ WriteBatch::WriteBatch(WriteBatch&& src) noexcept wal_term_point_(std::move(src.wal_term_point_)), content_flags_(src.content_flags_.load(std::memory_order_relaxed)), max_bytes_(src.max_bytes_), - rep_(std::move(src.rep_)) {} + rep_(std::move(src.rep_)), + timestamp_size_(src.timestamp_size_) {} WriteBatch& WriteBatch::operator=(const WriteBatch& src) { if (&src != this) { @@ -220,9 +334,7 @@ void WriteBatch::Clear() { wal_term_point_.clear(); } -int WriteBatch::Count() const { - return WriteBatchInternal::Count(this); -} +uint32_t WriteBatch::Count() const { return WriteBatchInternal::Count(this); } uint32_t WriteBatch::ComputeContentFlags() const { auto rv = content_flags_.load(std::memory_order_relaxed); @@ -400,19 +512,32 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag, } Status WriteBatch::Iterate(Handler* handler) const { - Slice input(rep_); - if (input.size() < WriteBatchInternal::kHeader) { + if (rep_.size() < WriteBatchInternal::kHeader) { return Status::Corruption("malformed WriteBatch (too small)"); } - input.remove_prefix(WriteBatchInternal::kHeader); + return WriteBatchInternal::Iterate(this, handler, WriteBatchInternal::kHeader, + rep_.size()); +} + +Status WriteBatchInternal::Iterate(const WriteBatch* wb, + WriteBatch::Handler* handler, size_t begin, + size_t end) { + if (begin > wb->rep_.size() || end > wb->rep_.size() || end < begin) { + return Status::Corruption("Invalid start/end bounds for Iterate"); + } + assert(begin <= end); + Slice input(wb->rep_.data() + begin, static_cast(end - begin)); + bool whole_batch = + (begin == WriteBatchInternal::kHeader) && (end == wb->rep_.size()); + Slice key, value, blob, xid; // Sometimes a sub-batch starts with a Noop. We want to exclude such Noops as // the batch boundary symbols otherwise we would mis-count the number of // batches. We do that by checking whether the accumulated batch is empty // before seeing the next Noop. 
bool empty_batch = true; - int found = 0; + uint32_t found = 0; Status s; char tag = 0; uint32_t column_family = 0; // default @@ -436,7 +561,7 @@ Status WriteBatch::Iterate(Handler* handler) const { } } else { assert(s.IsTryAgain()); - assert(!last_was_try_again); // to detect infinite loop bugs + assert(!last_was_try_again); // to detect infinite loop bugs if (UNLIKELY(last_was_try_again)) { return Status::Corruption( "two consecutive TryAgain in WriteBatch handler; this is either a " @@ -449,7 +574,7 @@ Status WriteBatch::Iterate(Handler* handler) const { switch (tag) { case kTypeColumnFamilyValue: case kTypeValue: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_PUT)); s = handler->PutCF(column_family, key, value); if (LIKELY(s.ok())) { @@ -459,7 +584,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilyDeletion: case kTypeDeletion: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE)); s = handler->DeleteCF(column_family, key); if (LIKELY(s.ok())) { @@ -469,7 +594,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilySingleDeletion: case kTypeSingleDeletion: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_SINGLE_DELETE)); s = handler->SingleDeleteCF(column_family, key); if (LIKELY(s.ok())) { @@ -479,7 +604,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilyRangeDeletion: case kTypeRangeDeletion: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE_RANGE)); s = handler->DeleteRangeCF(column_family, key, value); if (LIKELY(s.ok())) { @@ -489,7 +614,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilyMerge: case kTypeMerge: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_MERGE)); s = handler->MergeCF(column_family, key, value); if (LIKELY(s.ok())) { @@ -499,7 +624,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilyBlobIndex: case kTypeBlobIndex: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX)); s = handler->PutBlobIndexCF(column_family, key, value); if (LIKELY(s.ok())) { @@ -512,7 +637,7 @@ Status WriteBatch::Iterate(Handler* handler) const { empty_batch = false; break; case kTypeBeginPrepareXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); handler->MarkBeginPrepare(); empty_batch = false; @@ -531,7 +656,7 @@ Status WriteBatch::Iterate(Handler* handler) const { } break; case kTypeBeginPersistedPrepareXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); handler->MarkBeginPrepare(); empty_batch = false; @@ -544,7 +669,7 @@ Status 
WriteBatch::Iterate(Handler* handler) const { } break; case kTypeBeginUnprepareXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE)); handler->MarkBeginPrepare(true /* unprepared */); empty_batch = false; @@ -563,19 +688,19 @@ Status WriteBatch::Iterate(Handler* handler) const { } break; case kTypeEndPrepareXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE)); handler->MarkEndPrepare(xid); empty_batch = true; break; case kTypeCommitXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT)); handler->MarkCommit(xid); empty_batch = true; break; case kTypeRollbackXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK)); handler->MarkRollback(xid); empty_batch = true; @@ -591,7 +716,8 @@ Status WriteBatch::Iterate(Handler* handler) const { if (!s.ok()) { return s; } - if (handler_continue && found != WriteBatchInternal::Count(this)) { + if (handler_continue && whole_batch && + found != WriteBatchInternal::Count(wb)) { return Status::Corruption("WriteBatch has wrong count"); } else { return Status::OK(); @@ -606,11 +732,11 @@ void WriteBatchInternal::SetAsLastestPersistentState(WriteBatch* b) { b->is_latest_persistent_state_ = true; } -int WriteBatchInternal::Count(const WriteBatch* b) { +uint32_t WriteBatchInternal::Count(const WriteBatch* b) { return DecodeFixed32(b->rep_.data() + 8); } -void WriteBatchInternal::SetCount(WriteBatch* b, int n) { +void WriteBatchInternal::SetCount(WriteBatch* b, uint32_t n) { EncodeFixed32(&b->rep_[8], n); } @@ -643,7 +769,14 @@ Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); PutVarint32(&b->rep_, column_family_id); } - PutLengthPrefixedSlice(&b->rep_, key); + if (0 == b->timestamp_size_) { + PutLengthPrefixedSlice(&b->rep_, key); + } else { + PutVarint32(&b->rep_, + static_cast(key.size() + b->timestamp_size_)); + b->rep_.append(key.data(), key.size()); + b->rep_.append(b->timestamp_size_, '\0'); + } PutLengthPrefixedSlice(&b->rep_, value); b->content_flags_.store( b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, @@ -692,7 +825,11 @@ Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); PutVarint32(&b->rep_, column_family_id); } - PutLengthPrefixedSliceParts(&b->rep_, key); + if (0 == b->timestamp_size_) { + PutLengthPrefixedSliceParts(&b->rep_, key); + } else { + PutLengthPrefixedSlicePartsWithPadding(&b->rep_, key, b->timestamp_size_); + } PutLengthPrefixedSliceParts(&b->rep_, value); b->content_flags_.store( b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, @@ -1011,7 +1148,7 @@ Status WriteBatch::RollbackToSavePoint() { save_points_->stack.pop(); assert(savepoint.size <= rep_.size()); - assert(savepoint.count <= Count()); + assert(static_cast(savepoint.count) <= Count()); if (savepoint.size == rep_.size()) { // No changes to rollback @@ -1038,11 +1175,22 @@ Status WriteBatch::PopSavePoint() { return Status::OK(); } +Status WriteBatch::AssignTimestamp(const 
Slice& ts) { + TimestampAssigner ts_assigner(ts); + return Iterate(&ts_assigner); +} + +Status WriteBatch::AssignTimestamps(const std::vector& ts_list) { + TimestampAssigner ts_assigner(ts_list); + return Iterate(&ts_assigner); +} + class MemTableInserter : public WriteBatch::Handler { SequenceNumber sequence_; ColumnFamilyMemTables* const cf_mems_; FlushScheduler* const flush_scheduler_; + TrimHistoryScheduler* const trim_history_scheduler_; const bool ignore_missing_column_families_; const uint64_t recovering_log_number_; // log number that all Memtables inserted into should reference @@ -1076,6 +1224,22 @@ class MemTableInserter : public WriteBatch::Handler { DupDetector duplicate_detector_; bool dup_dectector_on_; + bool hint_per_batch_; + bool hint_created_; + // Hints for this batch + using HintMap = std::unordered_map; + using HintMapType = std::aligned_storage::type; + HintMapType hint_; + + HintMap& GetHintMap() { + assert(hint_per_batch_); + if (!hint_created_) { + new (&hint_) HintMap(); + hint_created_ = true; + } + return *reinterpret_cast(&hint_); + } + MemPostInfoMap& GetPostMap() { assert(concurrent_memtable_writes_); if(!post_info_created_) { @@ -1104,18 +1268,20 @@ class MemTableInserter : public WriteBatch::Handler { // cf_mems should not be shared with concurrent inserters MemTableInserter(SequenceNumber _sequence, ColumnFamilyMemTables* cf_mems, FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, bool ignore_missing_column_families, uint64_t recovering_log_number, DB* db, bool concurrent_memtable_writes, bool* has_valid_writes = nullptr, bool seq_per_batch = false, - bool batch_per_txn = true) + bool batch_per_txn = true, bool hint_per_batch = false) : sequence_(_sequence), cf_mems_(cf_mems), flush_scheduler_(flush_scheduler), + trim_history_scheduler_(trim_history_scheduler), ignore_missing_column_families_(ignore_missing_column_families), recovering_log_number_(recovering_log_number), log_number_ref_(0), - db_(reinterpret_cast(db)), + db_(static_cast_with_check(db)), concurrent_memtable_writes_(concurrent_memtable_writes), post_info_created_(false), has_valid_writes_(has_valid_writes), @@ -1131,7 +1297,9 @@ class MemTableInserter : public WriteBatch::Handler { write_before_prepare_(!batch_per_txn), unprepared_batch_(false), duplicate_detector_(), - dup_dectector_on_(false) { + dup_dectector_on_(false), + hint_per_batch_(hint_per_batch), + hint_created_(false) { assert(cf_mems_); } @@ -1144,6 +1312,12 @@ class MemTableInserter : public WriteBatch::Handler { reinterpret_cast (&mem_post_info_map_)->~MemPostInfoMap(); } + if (hint_created_) { + for (auto iter : GetHintMap()) { + delete[] reinterpret_cast(iter.second); + } + reinterpret_cast(&hint_)->~HintMap(); + } delete rebuilding_trx_; } @@ -1253,7 +1427,8 @@ class MemTableInserter : public WriteBatch::Handler { if (!moptions->inplace_update_support) { bool mem_res = mem->Add(sequence_, value_type, key, value, - concurrent_memtable_writes_, get_post_process_info(mem)); + concurrent_memtable_writes_, get_post_process_info(mem), + hint_per_batch_ ? &GetHintMap()[mem] : nullptr); if (UNLIKELY(!mem_res)) { assert(seq_per_batch_); ret_status = Status::TryAgain("key+seq exists"); @@ -1336,7 +1511,8 @@ class MemTableInserter : public WriteBatch::Handler { MemTable* mem = cf_mems_->GetMemTable(); bool mem_res = mem->Add(sequence_, delete_type, key, value, - concurrent_memtable_writes_, get_post_process_info(mem)); + concurrent_memtable_writes_, get_post_process_info(mem), + hint_per_batch_ ? 
&GetHintMap()[mem] : nullptr); if (UNLIKELY(!mem_res)) { assert(seq_per_batch_); ret_status = Status::TryAgain("key+seq exists"); @@ -1471,7 +1647,6 @@ class MemTableInserter : public WriteBatch::Handler { Status MergeCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { - assert(!concurrent_memtable_writes_); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value); @@ -1498,6 +1673,8 @@ class MemTableInserter : public WriteBatch::Handler { MemTable* mem = cf_mems_->GetMemTable(); auto* moptions = mem->GetImmutableMemTableOptions(); bool perform_merge = false; + assert(!concurrent_memtable_writes_ || + moptions->max_successive_merges == 0); // If we pass DB through and options.max_successive_merges is hit // during recovery, Get() will be issued which will try to acquire @@ -1505,6 +1682,7 @@ class MemTableInserter : public WriteBatch::Handler { // So we disable merge in recovery if (moptions->max_successive_merges > 0 && db_ != nullptr && recovering_log_number_ == 0) { + assert(!concurrent_memtable_writes_); LookupKey lkey(key, sequence_); // Count the number of successive merges at the head @@ -1550,6 +1728,7 @@ class MemTableInserter : public WriteBatch::Handler { perform_merge = false; } else { // 3) Add value to memtable + assert(!concurrent_memtable_writes_); bool mem_res = mem->Add(sequence_, kTypeValue, key, new_value); if (UNLIKELY(!mem_res)) { assert(seq_per_batch_); @@ -1562,7 +1741,9 @@ class MemTableInserter : public WriteBatch::Handler { if (!perform_merge) { // Add merge operator to memtable - bool mem_res = mem->Add(sequence_, kTypeMerge, key, value); + bool mem_res = + mem->Add(sequence_, kTypeMerge, key, value, + concurrent_memtable_writes_, get_post_process_info(mem)); if (UNLIKELY(!mem_res)) { assert(seq_per_batch_); ret_status = Status::TryAgain("key+seq exists"); @@ -1597,7 +1778,20 @@ class MemTableInserter : public WriteBatch::Handler { cfd->mem()->MarkFlushScheduled()) { // MarkFlushScheduled only returns true if we are the one that // should take action, so no need to dedup further - flush_scheduler_->ScheduleFlush(cfd); + flush_scheduler_->ScheduleWork(cfd); + } + } + // check if memtable_list size exceeds max_write_buffer_size_to_maintain + if (trim_history_scheduler_ != nullptr) { + auto* cfd = cf_mems_->current(); + assert(cfd != nullptr); + if (cfd->ioptions()->max_write_buffer_size_to_maintain > 0 && + cfd->mem()->ApproximateMemoryUsageFast() + + cfd->imm()->ApproximateMemoryUsageExcludingLast() >= + static_cast( + cfd->ioptions()->max_write_buffer_size_to_maintain) && + cfd->imm()->MarkTrimHistoryNeeded()) { + trim_history_scheduler_->ScheduleWork(cfd); } } } @@ -1757,12 +1951,14 @@ class MemTableInserter : public WriteBatch::Handler { Status WriteBatchInternal::InsertInto( WriteThread::WriteGroup& write_group, SequenceNumber sequence, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, bool ignore_missing_column_families, uint64_t recovery_log_number, DB* db, bool concurrent_memtable_writes, bool seq_per_batch, bool batch_per_txn) { MemTableInserter inserter( - sequence, memtables, flush_scheduler, ignore_missing_column_families, - recovery_log_number, db, concurrent_memtable_writes, - nullptr /*has_valid_writes*/, seq_per_batch, batch_per_txn); + sequence, memtables, flush_scheduler, trim_history_scheduler, + ignore_missing_column_families, 
recovery_log_number, db, + concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch, + batch_per_txn); for (auto w : write_group) { if (w->CallbackFailed()) { continue; @@ -1788,17 +1984,19 @@ Status WriteBatchInternal::InsertInto( Status WriteBatchInternal::InsertInto( WriteThread::Writer* writer, SequenceNumber sequence, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, bool ignore_missing_column_families, uint64_t log_number, DB* db, bool concurrent_memtable_writes, bool seq_per_batch, size_t batch_cnt, - bool batch_per_txn) { + bool batch_per_txn, bool hint_per_batch) { #ifdef NDEBUG (void)batch_cnt; #endif assert(writer->ShouldWriteToMemtable()); MemTableInserter inserter( - sequence, memtables, flush_scheduler, ignore_missing_column_families, - log_number, db, concurrent_memtable_writes, nullptr /*has_valid_writes*/, - seq_per_batch, batch_per_txn); + sequence, memtables, flush_scheduler, trim_history_scheduler, + ignore_missing_column_families, log_number, db, + concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch, + batch_per_txn, hint_per_batch); SetSequence(writer->batch, sequence); inserter.set_log_number_ref(writer->log_ref); Status s = writer->batch->Iterate(&inserter); @@ -1812,11 +2010,13 @@ Status WriteBatchInternal::InsertInto( Status WriteBatchInternal::InsertInto( const WriteBatch* batch, ColumnFamilyMemTables* memtables, - FlushScheduler* flush_scheduler, bool ignore_missing_column_families, - uint64_t log_number, DB* db, bool concurrent_memtable_writes, - SequenceNumber* next_seq, bool* has_valid_writes, bool seq_per_batch, - bool batch_per_txn) { + FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, + bool ignore_missing_column_families, uint64_t log_number, DB* db, + bool concurrent_memtable_writes, SequenceNumber* next_seq, + bool* has_valid_writes, bool seq_per_batch, bool batch_per_txn) { MemTableInserter inserter(Sequence(batch), memtables, flush_scheduler, + trim_history_scheduler, ignore_missing_column_families, log_number, db, concurrent_memtable_writes, has_valid_writes, seq_per_batch, batch_per_txn); diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index bae62bf0317..3810c672272 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -9,11 +9,13 @@ #pragma once #include +#include "db/flush_scheduler.h" +#include "db/trim_history_scheduler.h" #include "db/write_thread.h" -#include "rocksdb/types.h" -#include "rocksdb/write_batch.h" #include "rocksdb/db.h" #include "rocksdb/options.h" +#include "rocksdb/types.h" +#include "rocksdb/write_batch.h" #include "util/autovector.h" namespace rocksdb { @@ -113,10 +115,10 @@ class WriteBatchInternal { static Status InsertNoop(WriteBatch* batch); // Return the number of entries in the batch. - static int Count(const WriteBatch* batch); + static uint32_t Count(const WriteBatch* batch); // Set the count for the number of entries in the batch. - static void SetCount(WriteBatch* batch, int n); + static void SetCount(WriteBatch* batch, uint32_t n); // Return the sequence number for the start of this batch. 
static SequenceNumber Sequence(const WriteBatch* batch); @@ -162,6 +164,7 @@ class WriteBatchInternal { static Status InsertInto( WriteThread::WriteGroup& write_group, SequenceNumber sequence, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, bool ignore_missing_column_families = false, uint64_t log_number = 0, DB* db = nullptr, bool concurrent_memtable_writes = false, bool seq_per_batch = false, bool batch_per_txn = true); @@ -171,6 +174,7 @@ class WriteBatchInternal { static Status InsertInto( const WriteBatch* batch, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, bool ignore_missing_column_families = false, uint64_t log_number = 0, DB* db = nullptr, bool concurrent_memtable_writes = false, SequenceNumber* next_seq = nullptr, bool* has_valid_writes = nullptr, @@ -179,11 +183,13 @@ class WriteBatchInternal { static Status InsertInto(WriteThread::Writer* writer, SequenceNumber sequence, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, bool ignore_missing_column_families = false, uint64_t log_number = 0, DB* db = nullptr, bool concurrent_memtable_writes = false, bool seq_per_batch = false, size_t batch_cnt = 0, - bool batch_per_txn = true); + bool batch_per_txn = true, + bool hint_per_batch = false); static Status Append(WriteBatch* dst, const WriteBatch* src, const bool WAL_only = false); @@ -192,6 +198,10 @@ class WriteBatchInternal { // leftByteSize and a WriteBatch with ByteSize rightByteSize static size_t AppendedByteSize(size_t leftByteSize, size_t rightByteSize); + // Iterate over [begin, end) range of a write batch + static Status Iterate(const WriteBatch* wb, WriteBatch::Handler* handler, + size_t begin, size_t end); + // This write batch includes the latest state that should be persisted. Such // state meant to be used only during recovery. 
static void SetAsLastestPersistentState(WriteBatch* b); diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 322bd8945b0..869cfa8cb72 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -18,8 +18,8 @@ #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" +#include "test_util/testharness.h" #include "util/string_util.h" -#include "util/testharness.h" namespace rocksdb { @@ -35,8 +35,9 @@ static std::string PrintContents(WriteBatch* b) { mem->Ref(); std::string state; ColumnFamilyMemTablesDefault cf_mems_default(mem); - Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default, nullptr); - int count = 0; + Status s = + WriteBatchInternal::InsertInto(b, &cf_mems_default, nullptr, nullptr); + uint32_t count = 0; int put_count = 0; int delete_count = 0; int single_delete_count = 0; @@ -131,8 +132,8 @@ class WriteBatchTest : public testing::Test {}; TEST_F(WriteBatchTest, Empty) { WriteBatch batch; ASSERT_EQ("", PrintContents(&batch)); - ASSERT_EQ(0, WriteBatchInternal::Count(&batch)); - ASSERT_EQ(0, batch.Count()); + ASSERT_EQ(0u, WriteBatchInternal::Count(&batch)); + ASSERT_EQ(0u, batch.Count()); } TEST_F(WriteBatchTest, Multiple) { @@ -143,14 +144,14 @@ TEST_F(WriteBatchTest, Multiple) { batch.Put(Slice("baz"), Slice("boo")); WriteBatchInternal::SetSequence(&batch, 100); ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch)); - ASSERT_EQ(4, WriteBatchInternal::Count(&batch)); + ASSERT_EQ(4u, WriteBatchInternal::Count(&batch)); ASSERT_EQ( "Put(baz, boo)@103" "Delete(box)@101" "Put(foo, bar)@100" "DeleteRange(bar, foo)@102", PrintContents(&batch)); - ASSERT_EQ(4, batch.Count()); + ASSERT_EQ(4u, batch.Count()); } TEST_F(WriteBatchTest, Corruption) { @@ -173,19 +174,19 @@ TEST_F(WriteBatchTest, Append) { WriteBatchInternal::Append(&b1, &b2); ASSERT_EQ("", PrintContents(&b1)); - ASSERT_EQ(0, b1.Count()); + ASSERT_EQ(0u, b1.Count()); b2.Put("a", "va"); WriteBatchInternal::Append(&b1, &b2); ASSERT_EQ("Put(a, va)@200", PrintContents(&b1)); - ASSERT_EQ(1, b1.Count()); + ASSERT_EQ(1u, b1.Count()); b2.Clear(); b2.Put("b", "vb"); WriteBatchInternal::Append(&b1, &b2); ASSERT_EQ("Put(a, va)@200" "Put(b, vb)@201", PrintContents(&b1)); - ASSERT_EQ(2, b1.Count()); + ASSERT_EQ(2u, b1.Count()); b2.Delete("foo"); WriteBatchInternal::Append(&b1, &b2); ASSERT_EQ("Put(a, va)@200" @@ -193,7 +194,7 @@ TEST_F(WriteBatchTest, Append) { "Put(b, vb)@201" "Delete(foo)@203", PrintContents(&b1)); - ASSERT_EQ(4, b1.Count()); + ASSERT_EQ(4u, b1.Count()); b2.Clear(); b2.Put("c", "cc"); b2.Put("d", "dd"); @@ -208,29 +209,29 @@ TEST_F(WriteBatchTest, Append) { "Put(d, dd)@205" "Delete(foo)@203", PrintContents(&b1)); - ASSERT_EQ(6, b1.Count()); + ASSERT_EQ(6u, b1.Count()); ASSERT_EQ( "Put(c, cc)@0" "Put(d, dd)@1" "Put(e, ee)@2", PrintContents(&b2)); - ASSERT_EQ(3, b2.Count()); + ASSERT_EQ(3u, b2.Count()); } TEST_F(WriteBatchTest, SingleDeletion) { WriteBatch batch; WriteBatchInternal::SetSequence(&batch, 100); ASSERT_EQ("", PrintContents(&batch)); - ASSERT_EQ(0, batch.Count()); + ASSERT_EQ(0u, batch.Count()); batch.Put("a", "va"); ASSERT_EQ("Put(a, va)@100", PrintContents(&batch)); - ASSERT_EQ(1, batch.Count()); + ASSERT_EQ(1u, batch.Count()); batch.SingleDelete("a"); ASSERT_EQ( "SingleDelete(a)@101" "Put(a, va)@100", PrintContents(&batch)); - ASSERT_EQ(2, batch.Count()); + ASSERT_EQ(2u, batch.Count()); } namespace { @@ -316,7 +317,7 @@ namespace { TEST_F(WriteBatchTest, PutNotImplemented) { WriteBatch batch; 
batch.Put(Slice("k1"), Slice("v1")); - ASSERT_EQ(1, batch.Count()); + ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Put(k1, v1)@0", PrintContents(&batch)); WriteBatch::Handler handler; @@ -326,7 +327,7 @@ TEST_F(WriteBatchTest, PutNotImplemented) { TEST_F(WriteBatchTest, DeleteNotImplemented) { WriteBatch batch; batch.Delete(Slice("k2")); - ASSERT_EQ(1, batch.Count()); + ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Delete(k2)@0", PrintContents(&batch)); WriteBatch::Handler handler; @@ -336,7 +337,7 @@ TEST_F(WriteBatchTest, DeleteNotImplemented) { TEST_F(WriteBatchTest, SingleDeleteNotImplemented) { WriteBatch batch; batch.SingleDelete(Slice("k2")); - ASSERT_EQ(1, batch.Count()); + ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("SingleDelete(k2)@0", PrintContents(&batch)); WriteBatch::Handler handler; @@ -346,7 +347,7 @@ TEST_F(WriteBatchTest, SingleDeleteNotImplemented) { TEST_F(WriteBatchTest, MergeNotImplemented) { WriteBatch batch; batch.Merge(Slice("foo"), Slice("bar")); - ASSERT_EQ(1, batch.Count()); + ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Merge(foo, bar)@0", PrintContents(&batch)); WriteBatch::Handler handler; @@ -363,7 +364,7 @@ TEST_F(WriteBatchTest, Blob) { batch.SingleDelete(Slice("k3")); batch.PutLogData(Slice("blob2")); batch.Merge(Slice("foo"), Slice("bar")); - ASSERT_EQ(6, batch.Count()); + ASSERT_EQ(6u, batch.Count()); ASSERT_EQ( "Merge(foo, bar)@5" "Put(k1, v1)@0" @@ -398,7 +399,7 @@ TEST_F(WriteBatchTest, PrepareCommit) { ASSERT_EQ(s, Status::NotFound()); WriteBatchInternal::MarkCommit(&batch, Slice("xid1")); WriteBatchInternal::MarkRollback(&batch, Slice("xid1")); - ASSERT_EQ(2, batch.Count()); + ASSERT_EQ(2u, batch.Count()); TestHandler handler; batch.Iterate(&handler); @@ -488,7 +489,7 @@ TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) { batch.Put(raw, raw); } - ASSERT_EQ(2, batch.Count()); + ASSERT_EQ(2u, batch.Count()); struct NoopHandler : public WriteBatch::Handler { int num_seen = 0; @@ -599,7 +600,7 @@ TEST_F(WriteBatchTest, PutGatherSlices) { "Put(foo, bar)@100" "Put(keypart2part3, value)@102", PrintContents(&batch)); - ASSERT_EQ(3, batch.Count()); + ASSERT_EQ(3u, batch.Count()); } namespace { diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index cb880560efc..1ab97b04589 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -11,14 +11,14 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/write_callback.h" +#include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/write_batch.h" -#include "port/port.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" #include "util/random.h" -#include "util/sync_point.h" -#include "util/testharness.h" using std::string; @@ -124,6 +124,7 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { {false, false, true, false, true}, }; + for (auto& unordered_write : {true, false}) { for (auto& seq_per_batch : {true, false}) { for (auto& two_queues : {true, false}) { for (auto& allow_parallel : {true, false}) { @@ -133,15 +134,22 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { for (auto& write_group : write_scenarios) { Options options; options.create_if_missing = true; + options.unordered_write = unordered_write; options.allow_concurrent_memtable_write = allow_parallel; options.enable_pipelined_write = enable_pipelined_write; options.two_write_queues = two_queues; + // Skip unsupported combinations if (options.enable_pipelined_write && seq_per_batch) { - // This combination is not supported continue; } if (options.enable_pipelined_write && 
options.two_write_queues) { - // This combination is not supported + continue; + } + if (options.unordered_write && + !options.allow_concurrent_memtable_write) { + continue; + } + if (options.unordered_write && options.enable_pipelined_write) { continue; } @@ -296,7 +304,8 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { PublishSeqCallback(DBImpl* db_impl_in) : db_impl_(db_impl_in) {} Status Callback(SequenceNumber last_seq, bool /*not used*/, - uint64_t) override { + uint64_t, size_t /*index*/, + size_t /*total*/) override { db_impl_->SetLastPublishedSequence(last_seq); return Status::OK(); } @@ -358,8 +367,9 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { } } } -} -} + } + } + } } TEST_F(WriteCallbackTest, WriteCallBackTest) { diff --git a/db/write_controller_test.cc b/db/write_controller_test.cc index 55feb00a339..919c2c11808 100644 --- a/db/write_controller_test.cc +++ b/db/write_controller_test.cc @@ -8,7 +8,7 @@ #include "db/write_controller.h" #include "rocksdb/env.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/write_thread.cc b/db/write_thread.cc index 835992c8fce..1ded68fde3b 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -9,8 +9,8 @@ #include "db/column_family.h" #include "monitoring/perf_context_imp.h" #include "port/port.h" +#include "test_util/sync_point.h" #include "util/random.h" -#include "util/sync_point.h" namespace rocksdb { @@ -22,6 +22,8 @@ WriteThread::WriteThread(const ImmutableDBOptions& db_options) allow_concurrent_memtable_write_( db_options.allow_concurrent_memtable_write), enable_pipelined_write_(db_options.enable_pipelined_write), + max_write_batch_group_size_bytes( + db_options.max_write_batch_group_size_bytes), newest_writer_(nullptr), newest_memtable_writer_(nullptr), last_sequence_(0), @@ -406,9 +408,10 @@ size_t WriteThread::EnterAsBatchGroupLeader(Writer* leader, // Allow the group to grow up to a maximum size, but if the // original write is small, limit the growth so we do not slow // down the small write too much. - size_t max_size = 1 << 20; - if (size <= (128 << 10)) { - max_size = size + (128 << 10); + size_t max_size = max_write_batch_group_size_bytes; + const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8; + if (size <= min_batch_size_bytes) { + max_size = size + min_batch_size_bytes; } leader->write_group = write_group; @@ -485,9 +488,10 @@ void WriteThread::EnterAsMemTableWriter(Writer* leader, // Allow the group to grow up to a maximum size, but if the // original write is small, limit the growth so we do not slow // down the small write too much. - size_t max_size = 1 << 20; - if (size <= (128 << 10)) { - max_size = size + (128 << 10); + size_t max_size = max_write_batch_group_size_bytes; + const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8; + if (size <= min_batch_size_bytes) { + max_size = size + min_batch_size_bytes; } leader->write_group = write_group; diff --git a/db/write_thread.h b/db/write_thread.h index dc9c22ff87e..e1db970663b 100644 --- a/db/write_thread.h +++ b/db/write_thread.h @@ -360,6 +360,11 @@ class WriteThread { // Enable pipelined write to WAL and memtable. const bool enable_pipelined_write_; + // The maximum limit of number of bytes that are written in a single batch + // of WAL or memtable write. It is followed when the leader write size + // is larger than 1/8 of this limit. + const uint64_t max_write_batch_group_size_bytes; + // Points to the newest pending writer. 
Only leader can remove
  // elements, adding can be done lock-free by anybody.
  std::atomic<Writer*> newest_writer_;
diff --git a/defs.bzl b/defs.bzl
index f3e8339783e..83e9a579f9e 100644
--- a/defs.bzl
+++ b/defs.bzl
@@ -1,4 +1,7 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# defs.bzl - Definitions for Facebook-specific buck build integration
+# in TARGETS

load("@fbcode_macros//build_defs:cpp_binary.bzl", "cpp_binary")
load("@fbcode_macros//build_defs:custom_unittest.bzl", "custom_unittest")
@@ -8,9 +11,13 @@ def test_binary(
        test_cc,
        parallelism,
        rocksdb_arch_preprocessor_flags,
+        rocksdb_os_preprocessor_flags,
        rocksdb_compiler_flags,
        rocksdb_preprocessor_flags,
-        rocksdb_external_deps):
+        rocksdb_external_deps,
+        rocksdb_os_deps,
+        extra_deps,
+        extra_compiler_flags):
    TEST_RUNNER = native.package_name() + "/buckifier/rocks_test_runner.sh"

    ttype = "gtest" if parallelism == "parallel" else "simple"
@@ -20,9 +27,11 @@ def test_binary(
        name = test_bin,
        srcs = [test_cc],
        arch_preprocessor_flags = rocksdb_arch_preprocessor_flags,
-        compiler_flags = rocksdb_compiler_flags,
+        os_preprocessor_flags = rocksdb_os_preprocessor_flags,
+        compiler_flags = rocksdb_compiler_flags + extra_compiler_flags,
        preprocessor_flags = rocksdb_preprocessor_flags,
-        deps = [":rocksdb_test_lib"],
+        deps = [":rocksdb_test_lib"] + extra_deps,
+        os_deps = rocksdb_os_deps,
        external_deps = rocksdb_external_deps,
    )
diff --git a/docs/.gitignore b/docs/.gitignore
index e48dc98be89..3938549cbe6 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -6,4 +6,3 @@ _site
.sass-cache
*.psd
*~
-
diff --git a/docs/_posts/2019-08-15-unordered-write.markdown b/docs/_posts/2019-08-15-unordered-write.markdown
new file mode 100644
index 00000000000..5f0eb2880a4
--- /dev/null
+++ b/docs/_posts/2019-08-15-unordered-write.markdown
@@ -0,0 +1,56 @@
+---
+title: Higher write throughput with `unordered_write` feature
+layout: post
+author: maysamyabandeh
+category: blog
+---
+
+Since RocksDB 6.3, the `unordered_write=true` option together with WritePrepared transactions offers 34-42% higher write throughput compared to vanilla RocksDB. If the application can handle more relaxed ordering guarantees, the gain in throughput increases to 63-131%.
+
+### Background
+
+Currently the RocksDB API delivers the following powerful guarantees:
+- Atomic reads: Either all of a write batch is visible to reads or none of it.
+- Read-your-own-writes: When a write thread returns to the user, a subsequent read by the same thread will be able to see its own writes.
+- Immutable Snapshots: The reads visible to the snapshot are immutable in the sense that they will not be affected by any in-flight or future writes.
+
+### `unordered_write`
+
+The `unordered_write` feature, when turned on, relaxes the default guarantees of RocksDB. While it still provides the read-your-own-writes property, neither atomic reads nor the immutable-snapshot properties are provided any longer. However, RocksDB users can still get read-your-own-writes and immutable snapshots when using this feature in conjunction with a TransactionDB configured with WritePrepared transactions and `two_write_queues`. You can read [here](https://github.com/facebook/rocksdb/wiki/unordered_write) to learn about the design of `unordered_write` and [here](https://github.com/facebook/rocksdb/wiki/WritePrepared-Transactions) to learn more about WritePrepared transactions.
+
+### How to use it?
+
+To get the same guarantees as vanilla RocksDB:
+
+    DBOptions db_options;
+    db_options.unordered_write = true;
+    db_options.two_write_queues = true;
+    DB* db;
+    {
+      TransactionDBOptions txn_db_options;
+      txn_db_options.write_policy = TxnDBWritePolicy::WRITE_PREPARED;
+      txn_db_options.skip_concurrency_control = true;
+      TransactionDB* txn_db;
+      TransactionDB::Open(db_options, txn_db_options, kDBPath, &txn_db);
+      db = txn_db;
+    }
+    db->Write(...);
+
+To get relaxed guarantees:
+
+    DBOptions db_options;
+    db_options.unordered_write = true;
+    DB* db;
+    DB::Open(db_options, kDBPath, &db);
+    db->Write(...);
+
+### Benchmarks
+
+    TEST_TMPDIR=/dev/shm/ ~/db_bench --benchmarks=fillrandom --threads=32 --num=10000000 -max_write_buffer_number=16 --max_background_jobs=64 --batch_size=8 --writes=3000000 -level0_file_num_compaction_trigger=99999 --level0_slowdown_writes_trigger=99999 --level0_stop_writes_trigger=99999 -enable_pipelined_write=false -disable_auto_compactions --transaction_db=true --unordered_write=1 --disable_wal=0
+
+Throughput with `unordered_write`=true and using WritePrepared transactions:
+- WAL: +42%
+- No-WAL: +34%
+
+Throughput with `unordered_write`=true:
+- WAL: +63%
+- No-WAL: +131%
diff --git a/env/env.cc b/env/env.cc
index dcf79fb7fe7..6aad4a53e81 100644
--- a/env/env.cc
+++ b/env/env.cc
@@ -10,11 +10,13 @@
#include "rocksdb/env.h"

#include <thread>
+#include "logging/env_logger.h"
+#include "memory/arena.h"
#include "options/db_options.h"
#include "port/port.h"
#include "port/sys_time.h"
#include "rocksdb/options.h"
-#include "util/arena.h"
+#include "rocksdb/utilities/object_registry.h"
#include "util/autovector.h"

namespace rocksdb {
@@ -22,6 +24,55 @@ namespace rocksdb {

Env::~Env() {
}
+ options.writable_file_max_buffer_size = 1024 * 1024; + std::unique_ptr writable_file; + const auto status = env->NewWritableFile(fname, &writable_file, options); + if (!status.ok()) { + return status; + } + + *result = std::make_shared(std::move(writable_file), fname, + options, env); + return Status::OK(); +} } // namespace rocksdb diff --git a/env/env_basic_test.cc b/env/env_basic_test.cc index 93764d945f9..c955bdb7141 100644 --- a/env/env_basic_test.cc +++ b/env/env_basic_test.cc @@ -11,8 +11,7 @@ #include "env/mock_env.h" #include "rocksdb/env.h" -#include "rocksdb/utilities/object_registry.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { @@ -104,13 +103,12 @@ namespace { // ValuesIn() will skip running tests when given an empty collection. std::vector GetCustomEnvs() { static Env* custom_env; - static std::unique_ptr custom_env_guard; static bool init = false; if (!init) { init = true; const char* uri = getenv("TEST_ENV_URI"); if (uri != nullptr) { - custom_env = NewCustomObject(uri, &custom_env_guard); + Env::LoadEnv(uri, &custom_env); } } diff --git a/env/env_encryption.cc b/env/env_encryption.cc index df1b0011a01..b7095c0f579 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -195,23 +195,26 @@ class EncryptedWritableFile : public WritableFileWrapper { EncryptedWritableFile(WritableFile* f, BlockAccessCipherStream* s, size_t prefixLength) : WritableFileWrapper(f), file_(f), stream_(s), prefixLength_(prefixLength) { } - Status Append(const Slice& data) override { + Status Append(const Slice& data) override { AlignedBuffer buf; Status status; - Slice dataToAppend(data); + Slice dataToAppend(data); if (data.size() > 0) { auto offset = file_->GetFileSize(); // size including prefix // Encrypt in cloned buffer buf.Alignment(GetRequiredBufferAlignment()); buf.AllocateNewBuffer(data.size()); + // TODO (sagar0): Modify AlignedBuffer.Append to allow doing a memmove + // so that the next two lines can be replaced with buf.Append(). 
memmove(buf.BufferStart(), data.data(), data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), data.size()); + buf.Size(data.size()); + status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); if (!status.ok()) { return status; } - dataToAppend = Slice(buf.BufferStart(), data.size()); + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); } - status = file_->Append(dataToAppend); + status = file_->Append(dataToAppend); if (!status.ok()) { return status; } @@ -221,18 +224,19 @@ class EncryptedWritableFile : public WritableFileWrapper { Status PositionedAppend(const Slice& data, uint64_t offset) override { AlignedBuffer buf; Status status; - Slice dataToAppend(data); + Slice dataToAppend(data); offset += prefixLength_; if (data.size() > 0) { // Encrypt in cloned buffer buf.Alignment(GetRequiredBufferAlignment()); buf.AllocateNewBuffer(data.size()); memmove(buf.BufferStart(), data.data(), data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), data.size()); + buf.Size(data.size()); + status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); if (!status.ok()) { return status; } - dataToAppend = Slice(buf.BufferStart(), data.size()); + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); } status = file_->PositionedAppend(dataToAppend, offset); if (!status.ok()) { @@ -325,18 +329,19 @@ class EncryptedRandomRWFile : public RandomRWFile { Status Write(uint64_t offset, const Slice& data) override { AlignedBuffer buf; Status status; - Slice dataToWrite(data); + Slice dataToWrite(data); offset += prefixLength_; if (data.size() > 0) { // Encrypt in cloned buffer buf.Alignment(GetRequiredBufferAlignment()); buf.AllocateNewBuffer(data.size()); memmove(buf.BufferStart(), data.data(), data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), data.size()); + buf.Size(data.size()); + status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); if (!status.ok()) { return status; } - dataToWrite = Slice(buf.BufferStart(), data.size()); + dataToWrite = Slice(buf.BufferStart(), buf.CurrentSize()); } status = file_->Write(offset, dataToWrite); return status; @@ -393,13 +398,14 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Read prefix + // Read prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); status = underlying->Read(prefixLength, &prefixSlice, prefixBuf.BufferStart()); if (!status.ok()) { return status; } + prefixBuf.Size(prefixLength); } // Create cipher stream std::unique_ptr stream; @@ -430,13 +436,14 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Read prefix + // Read prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); status = underlying->Read(0, prefixLength, &prefixSlice, prefixBuf.BufferStart()); if (!status.ok()) { return status; } + prefixBuf.Size(prefixLength); } // Create cipher stream std::unique_ptr stream; @@ -467,12 +474,13 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Initialize prefix + // Initialize prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - 
prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Append(prefixSlice); if (!status.ok()) { return status; @@ -513,12 +521,13 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Initialize prefix + // Initialize prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Append(prefixSlice); if (!status.ok()) { return status; @@ -554,12 +563,13 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Initialize prefix + // Initialize prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Append(prefixSlice); if (!status.ok()) { return status; @@ -609,11 +619,13 @@ class EncryptedEnv : public EnvWrapper { if (!status.ok()) { return status; } + prefixBuf.Size(prefixLength); } else { - // File is new, initialize & write prefix + // File is new, initialize & write prefix provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Write(0, prefixSlice); if (!status.ok()) { return status; @@ -630,7 +642,7 @@ class EncryptedEnv : public EnvWrapper { return Status::OK(); } - // Store in *result the attributes of the children of the specified directory. + // Store in *result the attributes of the children of the specified directory. // In case the implementation lists the directory prior to iterating the files // and files are concurrently deleted, the deleted files will be omitted from // result. @@ -670,8 +682,7 @@ class EncryptedEnv : public EnvWrapper { EncryptionProvider *provider_; }; - -// Returns an Env that encrypts data when stored on disk and decrypts data when +// Returns an Env that encrypts data when stored on disk and decrypts data when // read from disk. Env* NewEncryptedEnv(Env* base_env, EncryptionProvider* provider) { return new EncryptedEnv(base_env, provider); @@ -694,14 +705,14 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char *data, size_t char *block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { - // We're not encrypting a full block. + // We're not encrypting a full block. 
// Copy data to blockBuffer if (!blockBuffer.get()) { // Allocate buffer blockBuffer = std::unique_ptr(new char[blockSize]); } block = blockBuffer.get(); - // Copy plain data to block buffer + // Copy plain data to block buffer memmove(block + blockOffset, data, n); } auto status = EncryptBlock(blockIndex, block, (char*)scratch.data()); @@ -734,21 +745,19 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char *data, size_t std::string scratch; AllocateScratch(scratch); - assert(fileOffset < dataSize); - // Decrypt individual blocks. while (1) { char *block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { - // We're not decrypting a full block. + // We're not decrypting a full block. // Copy data to blockBuffer if (!blockBuffer.get()) { // Allocate buffer blockBuffer = std::unique_ptr(new char[blockSize]); } block = blockBuffer.get(); - // Copy encrypted data to block buffer + // Copy encrypted data to block buffer memmove(block + blockOffset, data, n); } auto status = DecryptBlock(blockIndex, block, (char*)scratch.data()); @@ -807,7 +816,7 @@ Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char *data, char* scra memmove(scratch, iv_.data(), blockSize); EncodeFixed64(scratch, blockIndex + initialCounter_); - // Encrypt nonce+counter + // Encrypt nonce+counter auto status = cipher_.Encrypt(scratch); if (!status.ok()) { return status; @@ -823,13 +832,13 @@ Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char *data, char* scra // Decrypt a block of data at the given block index. // Length of data is equal to BlockSize(); Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char *data, char* scratch) { - // For CTR decryption & encryption are the same + // For CTR decryption & encryption are the same return EncryptBlock(blockIndex, data, scratch); } // GetPrefixLength returns the length of the prefix that is added to every file // and used for storing encryption options. -// For optimal performance, the prefix length should be a multiple of +// For optimal performance, the prefix length should be a multiple of // the page size. size_t CTREncryptionProvider::GetPrefixLength() { return defaultPrefixLength; @@ -844,7 +853,7 @@ static void decodeCTRParameters(const char *prefix, size_t blockSize, uint64_t & iv = Slice(prefix + blockSize, blockSize); } -// CreateNewPrefix initialized an allocated block of prefix memory +// CreateNewPrefix initialized an allocated block of prefix memory // for a new file. Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, char* prefix, @@ -873,7 +882,7 @@ Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, return Status::OK(); } -// PopulateSecretPrefixPart initializes the data into a new prefix block +// PopulateSecretPrefixPart initializes the data into a new prefix block // in plain text. // Returns the amount of space (starting from the start of the prefix) // that has been initialized. 
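[Editor's note] The encrypted-Env hunks above and below mainly make the prefix and cipher-stream code track buffer lengths through `AlignedBuffer::Size()`/`CurrentSize()`. For reviewers unfamiliar with this code path, here is a minimal sketch of how an encrypted Env is typically wired together, assuming the `ROT13BlockCipher` test cipher and `CTREncryptionProvider` declared in `rocksdb/env_encryption.h`; the database path is hypothetical and the snippet is illustrative only, not part of this diff.

```cpp
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/env_encryption.h"

int main() {
  // Test-only cipher; a real deployment would plug in its own BlockCipher.
  rocksdb::ROT13BlockCipher cipher(/*blockSize=*/32);
  // The CTR provider stores a per-file prefix holding the counter/IV,
  // which is what GetPrefixLength()/CreateNewPrefix() above manage.
  rocksdb::CTREncryptionProvider provider(cipher);
  std::unique_ptr<rocksdb::Env> encrypted_env(
      rocksdb::NewEncryptedEnv(rocksdb::Env::Default(), &provider));

  rocksdb::Options options;
  options.create_if_missing = true;
  options.env = encrypted_env.get();  // route all file I/O through EncryptedEnv

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/encrypted_db", &db);
  if (s.ok()) {
    db->Put(rocksdb::WriteOptions(), "key", "value");
    delete db;
  }
  return 0;
}
```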
@@ -908,7 +917,7 @@ Status CTREncryptionProvider::CreateCipherStream( return status; } - // Create cipher stream + // Create cipher stream return CreateCipherStreamFromPrefix(fname, options, initialCounter, iv, prefix, result); } diff --git a/env/env_hdfs.cc b/env/env_hdfs.cc index 9d0354cced8..207f0815bc4 100644 --- a/env/env_hdfs.cc +++ b/env/env_hdfs.cc @@ -17,8 +17,8 @@ #include #include #include +#include "logging/logging.h" #include "rocksdb/status.h" -#include "util/logging.h" #include "util/string_util.h" #define HDFS_EXISTS 0 @@ -420,7 +420,7 @@ Status HdfsEnv::NewRandomAccessFile(const std::string& fname, // create a new file for writing Status HdfsEnv::NewWritableFile(const std::string& fname, std::unique_ptr* result, - const EnvOptions& /*options*/) { + const EnvOptions& options) { result->reset(); Status s; HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname, options); @@ -590,6 +590,11 @@ Status HdfsEnv::UnlockFile(FileLock* /*lock*/) { return Status::OK(); } Status HdfsEnv::NewLogger(const std::string& fname, std::shared_ptr* result) { + // EnvOptions is used exclusively for its `strict_bytes_per_sync` value. That + // option is only intended for WAL/flush/compaction writes, so turn it off in + // the logger. + EnvOptions options; + options.strict_bytes_per_sync = false; HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname, options); if (f == nullptr || !f->isValid()) { delete f; diff --git a/env/env_posix.cc b/env/env_posix.cc index 387c0279397..83e209bf1f8 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -7,8 +7,12 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors #include +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +#include +#endif #include #include + #if defined(OS_LINUX) #include #endif @@ -33,6 +37,7 @@ // Get nano time includes #if defined(OS_LINUX) || defined(OS_FREEBSD) #elif defined(__MACH__) +#include #include #include #else @@ -43,18 +48,18 @@ #include #include "env/io_posix.h" -#include "env/posix_logger.h" +#include "logging/logging.h" +#include "logging/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_updater.h" #include "port/port.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/compression_context_cache.h" -#include "util/logging.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" #include "util/thread_local.h" #include "util/threadpool_imp.h" @@ -69,6 +74,17 @@ #endif namespace rocksdb { +#if defined(OS_WIN) +static const std::string kSharedLibExt = ".dll"; +static const char kPathSeparator = ';'; +#else +static const char kPathSeparator = ':'; +#if defined(OS_MACOSX) +static const std::string kSharedLibExt = ".dylib"; +#else +static const std::string kSharedLibExt = ".so"; +#endif +#endif namespace { @@ -115,6 +131,33 @@ int cloexec_flags(int flags, const EnvOptions* options) { return flags; } +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +class PosixDynamicLibrary : public DynamicLibrary { + public: + PosixDynamicLibrary(const std::string& name, void* handle) + : name_(name), handle_(handle) {} + ~PosixDynamicLibrary() override { dlclose(handle_); } + + Status LoadSymbol(const std::string& sym_name, void** func) override { + assert(nullptr != func); + dlerror(); // Clear any old error + *func = dlsym(handle_, sym_name.c_str()); + if (*func != nullptr) { + return Status::OK(); + } 
else { + char* err = dlerror(); + return Status::NotFound("Error finding symbol: " + sym_name, err); + } + } + + const char* Name() const override { return name_.c_str(); } + + private: + std::string name_; + void* handle_; +}; +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + class PosixEnv : public Env { public: PosixEnv(); @@ -729,6 +772,62 @@ class PosixEnv : public Env { return result; } +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION + // Loads the named library into the result. + // If the input name is empty, the current executable is loaded + // On *nix systems, a "lib" prefix is added to the name if one is not supplied + // Comparably, the appropriate shared library extension is added to the name + // if not supplied. If search_path is not specified, the shared library will + // be loaded using the default path (LD_LIBRARY_PATH) If search_path is + // specified, the shared library will be searched for in the directories + // provided by the search path + Status LoadLibrary(const std::string& name, const std::string& path, + std::shared_ptr* result) override { + Status status; + assert(result != nullptr); + if (name.empty()) { + void* hndl = dlopen(NULL, RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(name, hndl)); + return Status::OK(); + } + } else { + std::string library_name = name; + if (library_name.find(kSharedLibExt) == std::string::npos) { + library_name = library_name + kSharedLibExt; + } +#if !defined(OS_WIN) + if (library_name.find('/') == std::string::npos && + library_name.compare(0, 3, "lib") != 0) { + library_name = "lib" + library_name; + } +#endif + if (path.empty()) { + void* hndl = dlopen(library_name.c_str(), RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(library_name, hndl)); + return Status::OK(); + } + } else { + std::string local_path; + std::stringstream ss(path); + while (getline(ss, local_path, kPathSeparator)) { + if (!path.empty()) { + std::string full_name = local_path + "/" + library_name; + void* hndl = dlopen(full_name.c_str(), RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(full_name, hndl)); + return Status::OK(); + } + } + } + } + } + return Status::IOError( + IOErrorMsg("Failed to open shared library: xs", name), dlerror()); + } +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + void Schedule(void (*function)(void* arg1), void* arg, Priority pri = LOW, void* tag = nullptr, void (*unschedFunction)(void* arg) = nullptr) override; @@ -789,13 +888,14 @@ class PosixEnv : public Env { FILE* f; { IOSTATS_TIMER_GUARD(open_nanos); - f = fopen(fname.c_str(), "w" + f = fopen(fname.c_str(), + "w" #ifdef __GLIBC_PREREQ #if __GLIBC_PREREQ(2, 7) - "e" // glibc extension to enable O_CLOEXEC + "e" // glibc extension to enable O_CLOEXEC #endif #endif - ); + ); } if (f == nullptr) { result->reset(); @@ -839,7 +939,7 @@ class PosixEnv : public Env { uint64_t NowCPUNanos() override { #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_AIX) || \ - defined(__MACH__) + (defined(__MACH__) && defined(__MAC_10_12)) struct timespec ts; clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; diff --git a/env/env_test.cc b/env/env_test.cc index 47800928499..004adf26e5a 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -11,13 +11,6 @@ #include #endif -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef OS_FREEBSD -#include -#else -#include -#endif -#endif #include #include @@ -38,15 +31,16 @@ #endif #include "env/env_chroot.h" +#include "logging/log_buffer.h" +#include 
"port/malloc.h" #include "port/port.h" #include "rocksdb/env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" -#include "util/log_buffer.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" #ifdef OS_LINUX static const size_t kPageSize = sysconf(_SC_PAGESIZE); @@ -247,6 +241,52 @@ TEST_F(EnvPosixTest, MemoryMappedFileBuffer) { ASSERT_EQ(expected_data, actual_data); } +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +TEST_F(EnvPosixTest, LoadRocksDBLibrary) { + std::shared_ptr library; + std::function function; + Status status = env_->LoadLibrary("no-such-library", "", &library); + ASSERT_NOK(status); + ASSERT_EQ(nullptr, library.get()); + status = env_->LoadLibrary("rocksdb", "", &library); + if (status.ok()) { // If we have can find a rocksdb shared library + ASSERT_NE(nullptr, library.get()); + ASSERT_OK(library->LoadFunction("rocksdb_create_default_env", + &function)); // from C definition + ASSERT_NE(nullptr, function); + ASSERT_NOK(library->LoadFunction("no-such-method", &function)); + ASSERT_EQ(nullptr, function); + ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library)); + } else { + ASSERT_EQ(nullptr, library.get()); + } +} +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + +#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION) +TEST_F(EnvPosixTest, LoadRocksDBLibraryWithSearchPath) { + std::shared_ptr library; + std::function function; + ASSERT_NOK(env_->LoadLibrary("no-such-library", "/tmp", &library)); + ASSERT_EQ(nullptr, library.get()); + ASSERT_NOK(env_->LoadLibrary("dl", "/tmp", &library)); + ASSERT_EQ(nullptr, library.get()); + Status status = env_->LoadLibrary("rocksdb", "/tmp:./", &library); + if (status.ok()) { + ASSERT_NE(nullptr, library.get()); + ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library)); + } + char buff[1024]; + std::string cwd = getcwd(buff, sizeof(buff)); + + status = env_->LoadLibrary("rocksdb", "/tmp:" + cwd, &library); + if (status.ok()) { + ASSERT_NE(nullptr, library.get()); + ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library)); + } +} +#endif // !OS_WIN && !ROCKSDB_NO_DYNAMIC_EXTENSION + TEST_P(EnvPosixTestWithParam, UnSchedule) { std::atomic called(false); env_->SetBackgroundThreads(1, Env::LOW); @@ -843,7 +883,9 @@ TEST_F(EnvPosixTest, PositionedAppend) { } #endif // !ROCKSDB_LITE -// Only works in linux platforms +// `GetUniqueId()` temporarily returns zero on Windows. `BlockBasedTable` can +// handle a return value of zero but this test case cannot. +#ifndef OS_WIN TEST_P(EnvPosixTestWithParam, RandomAccessUniqueID) { // Create file. if (env_ == Env::Default()) { @@ -886,6 +928,7 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueID) { env_->DeleteFile(fname); } } +#endif // !defined(OS_WIN) // only works in linux platforms #ifdef ROCKSDB_FALLOCATE_PRESENT @@ -976,7 +1019,9 @@ bool HasPrefix(const std::unordered_set& ss) { return false; } -// Only works in linux and WIN platforms +// `GetUniqueId()` temporarily returns zero on Windows. `BlockBasedTable` can +// handle a return value of zero but this test case cannot. +#ifndef OS_WIN TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDConcurrent) { if (env_ == Env::Default()) { // Check whether a bunch of concurrently existing files have unique IDs. 
@@ -1018,7 +1063,6 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDConcurrent) { } } -// Only works in linux and WIN platforms TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDDeletes) { if (env_ == Env::Default()) { EnvOptions soptions; @@ -1058,6 +1102,62 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDDeletes) { ASSERT_TRUE(!HasPrefix(ids)); } } +#endif // !defined(OS_WIN) + +TEST_P(EnvPosixTestWithParam, MultiRead) { + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + const size_t kSectorSize = 4096; + const size_t kNumSectors = 8; + + // Create file. + { + std::unique_ptr wfile; +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) + if (soptions.use_direct_writes) { + soptions.use_direct_writes = false; + } +#endif + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + for (size_t i = 0; i < kNumSectors; ++i) { + auto data = NewAligned(kSectorSize * 8, static_cast(i + 1)); + Slice slice(data.get(), kSectorSize); + ASSERT_OK(wfile->Append(slice)); + } + ASSERT_OK(wfile->Close()); + } + + // Random Read + { + std::unique_ptr file; + std::vector reqs(3); + std::vector> data; + uint64_t offset = 0; + for (size_t i = 0; i < reqs.size(); ++i) { + reqs[i].offset = offset; + offset += 2 * kSectorSize; + reqs[i].len = kSectorSize; + data.emplace_back(NewAligned(kSectorSize, 0)); + reqs[i].scratch = data.back().get(); + } +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) + if (soptions.use_direct_reads) { + soptions.use_direct_reads = false; + } +#endif + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + ASSERT_OK(file->MultiRead(reqs.data(), reqs.size())); + for (size_t i = 0; i < reqs.size(); ++i) { + auto buf = NewAligned(kSectorSize * 8, static_cast(i * 2 + 1)); + ASSERT_OK(reqs[i].status); + ASSERT_EQ(memcmp(reqs[i].scratch, buf.get(), kSectorSize), 0); + } + } +} // Only works in linux platforms #ifdef OS_WIN diff --git a/env/io_posix.cc b/env/io_posix.cc index 0f86c3ff93f..3572d7cc9a4 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -14,6 +14,9 @@ #include #if defined(OS_LINUX) #include +#ifndef FALLOC_FL_KEEP_SIZE +#include +#endif #endif #include #include @@ -27,17 +30,16 @@ #include #include #endif -#include "env/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" #include "rocksdb/slice.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/string_util.h" -#include "util/sync_point.h" #if defined(OS_LINUX) && !defined(F_SET_RW_HINT) #define F_LINUX_SPECIFIC_BASE 1024 -#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) #endif namespace rocksdb { @@ -58,6 +60,57 @@ int Fadvise(int fd, off_t offset, size_t len, int advice) { namespace { +// On MacOS (and probably *BSD), the posix write and pwrite calls do not support +// buffers larger than 2^31-1 bytes. These two wrappers fix this issue by +// cutting the buffer in 1GB chunks. We use this chunk size to be sure to keep +// the writes aligned. 
+ +bool PosixWrite(int fd, const char* buf, size_t nbyte) { + const size_t kLimit1Gb = 1UL << 30; + + const char* src = buf; + size_t left = nbyte; + + while (left != 0) { + size_t bytes_to_write = std::min(left, kLimit1Gb); + + ssize_t done = write(fd, src, bytes_to_write); + if (done < 0) { + if (errno == EINTR) { + continue; + } + return false; + } + left -= done; + src += done; + } + return true; +} + +bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) { + const size_t kLimit1Gb = 1UL << 30; + + const char* src = buf; + size_t left = nbyte; + + while (left != 0) { + size_t bytes_to_write = std::min(left, kLimit1Gb); + + ssize_t done = pwrite(fd, src, bytes_to_write, offset); + if (done < 0) { + if (errno == EINTR) { + continue; + } + return false; + } + left -= done; + offset += done; + src += done; + } + + return true; +} + size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) { #ifdef OS_LINUX struct stat buf; @@ -135,28 +188,34 @@ size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) { #define ZFS_SUPER_MAGIC 0x2fc12fc1 #endif -bool IsSyncFileRangeSupported(int __attribute__((__unused__)) fd) { - // `fstatfs` is only available on Linux, but so is `sync_file_range`, so - // `defined(ROCKSDB_RANGESYNC_PRESENT)` should imply `defined(OS_LINUX)`. +bool IsSyncFileRangeSupported(int fd) { + // The approach taken in this function is to build a blacklist of cases where + // we know `sync_file_range` definitely will not work properly despite passing + // the compile-time check (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or + // if any of the checks fail in unexpected ways, we allow `sync_file_range` to + // be used. This way should minimize risk of impacting existing use cases. struct statfs buf; int ret = fstatfs(fd, &buf); assert(ret == 0); - if (ret != 0) { - // We don't know whether the filesystem properly supports `sync_file_range`. - // Even if it doesn't, we don't know of any safety issue with trying to call - // it anyways. So, to preserve the same behavior as before this `fstatfs` - // check was introduced, we assume `sync_file_range` is usable. - return true; - } - if (buf.f_type == ZFS_SUPER_MAGIC) { + if (ret == 0 && buf.f_type == ZFS_SUPER_MAGIC) { // Testing on ZFS showed the writeback did not happen asynchronously when // `sync_file_range` was called, even though it returned success. Avoid it // and use `fdatasync` instead to preserve the contract of `bytes_per_sync`, // even though this'll incur extra I/O for metadata. return false; } - // No known problems with other filesystems' implementations of - // `sync_file_range`, so allow them to use it. + + ret = sync_file_range(fd, 0 /* offset */, 0 /* nbytes */, 0 /* flags */); + assert(!(ret == -1 && errno != ENOSYS)); + if (ret == -1 && errno == ENOSYS) { + // `sync_file_range` is not implemented on all platforms even if + // compile-time checks pass and a supported filesystem is in-use. For + // example, using ext4 on WSL (Windows Subsystem for Linux), + // `sync_file_range()` returns `ENOSYS` + // ("Function not implemented"). + return false; + } + // None of the cases on the blacklist matched, so allow `sync_file_range` use. 
return true; } @@ -180,7 +239,7 @@ bool IsSectorAligned(const void* ptr, size_t sector_size) { return uintptr_t(ptr) % sector_size == 0; } -} +} // namespace #endif /* @@ -747,14 +806,14 @@ Status PosixMmapFile::InvalidateCache(size_t offset, size_t length) { #ifdef ROCKSDB_FALLOCATE_PRESENT Status PosixMmapFile::Allocate(uint64_t offset, uint64_t len) { - assert(offset <= std::numeric_limits::max()); - assert(len <= std::numeric_limits::max()); + assert(offset <= static_cast(std::numeric_limits::max())); + assert(len <= static_cast(std::numeric_limits::max())); TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds); int alloc_status = 0; if (allow_fallocate_) { - alloc_status = fallocate( - fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, - static_cast(offset), static_cast(len)); + alloc_status = + fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, + static_cast(offset), static_cast(len)); } if (alloc_status == 0) { return Status::OK(); @@ -801,19 +860,13 @@ Status PosixWritableFile::Append(const Slice& data) { assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment())); } const char* src = data.data(); - size_t left = data.size(); - while (left != 0) { - ssize_t done = write(fd_, src, left); - if (done < 0) { - if (errno == EINTR) { - continue; - } - return IOError("While appending to file", filename_, errno); - } - left -= done; - src += done; + size_t nbytes = data.size(); + + if (!PosixWrite(fd_, src, nbytes)) { + return IOError("While appending to file", filename_, errno); } - filesize_ += data.size(); + + filesize_ += nbytes; return Status::OK(); } @@ -823,23 +876,14 @@ Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment())); assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment())); } - assert(offset <= std::numeric_limits::max()); + assert(offset <= static_cast(std::numeric_limits::max())); const char* src = data.data(); - size_t left = data.size(); - while (left != 0) { - ssize_t done = pwrite(fd_, src, left, static_cast(offset)); - if (done < 0) { - if (errno == EINTR) { - continue; - } - return IOError("While pwrite to file at offset " + ToString(offset), - filename_, errno); - } - left -= done; - offset += done; - src += done; + size_t nbytes = data.size(); + if (!PosixPositionedWrite(fd_, src, nbytes, static_cast(offset))) { + return IOError("While pwrite to file at offset " + ToString(offset), + filename_, errno); } - filesize_ = offset; + filesize_ = offset + nbytes; return Status::OK(); } @@ -891,8 +935,8 @@ Status PosixWritableFile::Close() { // If not, we should hack it with FALLOC_FL_PUNCH_HOLE if (result == 0 && (file_stats.st_size + file_stats.st_blksize - 1) / - file_stats.st_blksize != - file_stats.st_blocks / (file_stats.st_blksize / 512)) { + file_stats.st_blksize != + file_stats.st_blocks / (file_stats.st_blksize / 512)) { IOSTATS_TIMER_GUARD(allocate_nanos); if (allow_fallocate_) { fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_, @@ -942,10 +986,10 @@ void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) { } #else (void)hint; -#endif // ROCKSDB_VALGRIND_RUN +#endif // ROCKSDB_VALGRIND_RUN #else (void)hint; -#endif // OS_LINUX +#endif // OS_LINUX } Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) { @@ -968,15 +1012,15 @@ Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) { #ifdef ROCKSDB_FALLOCATE_PRESENT Status 
PosixWritableFile::Allocate(uint64_t offset, uint64_t len) { - assert(offset <= std::numeric_limits::max()); - assert(len <= std::numeric_limits::max()); + assert(offset <= static_cast(std::numeric_limits::max())); + assert(len <= static_cast(std::numeric_limits::max())); TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds); IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = 0; if (allow_fallocate_) { - alloc_status = fallocate( - fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, - static_cast(offset), static_cast(len)); + alloc_status = + fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, + static_cast(offset), static_cast(len)); } if (alloc_status == 0) { return Status::OK(); @@ -990,8 +1034,8 @@ Status PosixWritableFile::Allocate(uint64_t offset, uint64_t len) { Status PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes) { #ifdef ROCKSDB_RANGESYNC_PRESENT - assert(offset <= std::numeric_limits::max()); - assert(nbytes <= std::numeric_limits::max()); + assert(offset <= static_cast(std::numeric_limits::max())); + assert(nbytes <= static_cast(std::numeric_limits::max())); if (sync_file_range_supported_) { int ret; if (strict_bytes_per_sync_) { @@ -1037,24 +1081,11 @@ PosixRandomRWFile::~PosixRandomRWFile() { Status PosixRandomRWFile::Write(uint64_t offset, const Slice& data) { const char* src = data.data(); - size_t left = data.size(); - while (left != 0) { - ssize_t done = pwrite(fd_, src, left, offset); - if (done < 0) { - // error while writing to file - if (errno == EINTR) { - // write was interrupted, try again. - continue; - } - return IOError( - "While write random read/write file at offset " + ToString(offset), - filename_, errno); - } - - // Wrote `done` bytes - left -= done; - offset += done; - src += done; + size_t nbytes = data.size(); + if (!PosixPositionedWrite(fd_, src, nbytes, static_cast(offset))) { + return IOError( + "While write random read/write file at offset " + ToString(offset), + filename_, errno); } return Status::OK(); diff --git a/env/mock_env.cc b/env/mock_env.cc index 793a0837ab8..6d3adc808ed 100644 --- a/env/mock_env.cc +++ b/env/mock_env.cc @@ -31,6 +31,9 @@ class MemFile { rnd_(static_cast( MurmurHash(fn.data(), static_cast(fn.size()), 0))), fsynced_bytes_(0) {} + // No copying allowed. + MemFile(const MemFile&) = delete; + void operator=(const MemFile&) = delete; void Ref() { MutexLock lock(&mutex_); @@ -154,10 +157,6 @@ class MemFile { // Private since only Unref() should be used to delete it. ~MemFile() { assert(refs_ == 0); } - // No copying allowed. - MemFile(const MemFile&); - void operator=(const MemFile&); - Env* env_; const std::string fn_; mutable port::Mutex mutex_; diff --git a/env/mock_env_test.cc b/env/mock_env_test.cc index 97c49b5f516..b21b953b568 100644 --- a/env/mock_env_test.cc +++ b/env/mock_env_test.cc @@ -10,7 +10,7 @@ #include #include "rocksdb/env.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/examples/multi_processes_example.cc b/examples/multi_processes_example.cc index b1c1d02ba25..921041a576e 100644 --- a/examples/multi_processes_example.cc +++ b/examples/multi_processes_example.cc @@ -14,8 +14,8 @@ // run for a while, tailing the logs of the primary. After process with primary // instance exits, this process will keep running until you hit 'CTRL+C'. 
-#include #include +#include #include #include #include diff --git a/examples/rocksdb_option_file_example.ini b/examples/rocksdb_option_file_example.ini index 351f1ed0107..dcbc9a308a8 100644 --- a/examples/rocksdb_option_file_example.ini +++ b/examples/rocksdb_option_file_example.ini @@ -104,7 +104,7 @@ compression=kSnappyCompression level0_file_num_compaction_trigger=4 purge_redundant_kvs_while_flush=true - max_write_buffer_number_to_maintain=0 + max_write_buffer_size_to_maintain=0 memtable_factory=SkipListFactory max_grandparent_overlap_factor=8 expanded_compaction_factor=25 diff --git a/examples/transaction_example.cc b/examples/transaction_example.cc index 7274cf7ec07..6d12651ada9 100644 --- a/examples/transaction_example.cc +++ b/examples/transaction_example.cc @@ -50,17 +50,33 @@ int main() { // Read a key OUTSIDE this transaction. Does not affect txn. s = txn_db->Get(read_options, "abc", &value); + assert(s.IsNotFound()); // Write a key OUTSIDE of this transaction. - // Does not affect txn since this is an unrelated key. If we wrote key 'abc' - // here, the transaction would fail to commit. + // Does not affect txn since this is an unrelated key. s = txn_db->Put(write_options, "xyz", "zzz"); + assert(s.ok()); + + // Write a key OUTSIDE of this transaction. + // Fail because the key conflicts with the key written in txn. + s = txn_db->Put(write_options, "abc", "def"); + assert(s.subcode() == Status::kLockTimeout); + + // Value for key "xyz" has been committed, can be read in txn. + s = txn->Get(read_options, "xyz", &value); + assert(s.ok()); + assert(value == "zzz"); // Commit transaction s = txn->Commit(); assert(s.ok()); delete txn; + // Value is committed, can be read now. + s = txn_db->Get(read_options, "abc", &value); + assert(s.ok()); + assert(value == "def"); + //////////////////////////////////////////////////////// // // "Repeatable Read" (Snapshot Isolation) Example diff --git a/util/delete_scheduler.cc b/file/delete_scheduler.cc similarity index 98% rename from util/delete_scheduler.cc rename to file/delete_scheduler.cc index f5ee2844896..b66956ca08c 100644 --- a/util/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -5,17 +5,17 @@ #ifndef ROCKSDB_LITE -#include "util/delete_scheduler.h" +#include "file/delete_scheduler.h" #include #include +#include "file/sst_file_manager_impl.h" +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/env.h" -#include "util/logging.h" +#include "test_util/sync_point.h" #include "util/mutexlock.h" -#include "util/sst_file_manager_impl.h" -#include "util/sync_point.h" namespace rocksdb { diff --git a/util/delete_scheduler.h b/file/delete_scheduler.h similarity index 100% rename from util/delete_scheduler.h rename to file/delete_scheduler.h diff --git a/util/delete_scheduler_test.cc b/file/delete_scheduler_test.cc similarity index 99% rename from util/delete_scheduler_test.cc rename to file/delete_scheduler_test.cc index 0d8e354b9c0..b6d4b903c16 100644 --- a/util/delete_scheduler_test.cc +++ b/file/delete_scheduler_test.cc @@ -3,23 +3,19 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include +#include #include #include +#include "file/delete_scheduler.h" +#include "file/sst_file_manager_impl.h" #include "rocksdb/env.h" #include "rocksdb/options.h" -#include "util/delete_scheduler.h" -#include "util/sst_file_manager_impl.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" #ifndef ROCKSDB_LITE diff --git a/file/file_prefetch_buffer.cc b/file/file_prefetch_buffer.cc new file mode 100644 index 00000000000..89f32c6ff0b --- /dev/null +++ b/file/file_prefetch_buffer.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "file/file_prefetch_buffer.h" + +#include +#include + +#include "file/random_access_file_reader.h" +#include "monitoring/histogram.h" +#include "monitoring/iostats_context_imp.h" +#include "port/port.h" +#include "test_util/sync_point.h" +#include "util/random.h" +#include "util/rate_limiter.h" + +namespace rocksdb { +Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, + uint64_t offset, size_t n, + bool for_compaction) { + size_t alignment = reader->file()->GetRequiredBufferAlignment(); + size_t offset_ = static_cast(offset); + uint64_t rounddown_offset = Rounddown(offset_, alignment); + uint64_t roundup_end = Roundup(offset_ + n, alignment); + uint64_t roundup_len = roundup_end - rounddown_offset; + assert(roundup_len >= alignment); + assert(roundup_len % alignment == 0); + + // Check if requested bytes are in the existing buffer_. + // If all bytes exist -- return. + // If only a few bytes exist -- reuse them & read only what is really needed. + // This is typically the case of incremental reading of data. + // If no bytes exist in buffer -- full pread. + + Status s; + uint64_t chunk_offset_in_buffer = 0; + uint64_t chunk_len = 0; + bool copy_data_to_new_buffer = false; + if (buffer_.CurrentSize() > 0 && offset >= buffer_offset_ && + offset <= buffer_offset_ + buffer_.CurrentSize()) { + if (offset + n <= buffer_offset_ + buffer_.CurrentSize()) { + // All requested bytes are already in the buffer. So no need to Read + // again. + return s; + } else { + // Only a few requested bytes are in the buffer. memmove those chunk of + // bytes to the beginning, and memcpy them back into the new buffer if a + // new buffer is created. + chunk_offset_in_buffer = + Rounddown(static_cast(offset - buffer_offset_), alignment); + chunk_len = buffer_.CurrentSize() - chunk_offset_in_buffer; + assert(chunk_offset_in_buffer % alignment == 0); + assert(chunk_len % alignment == 0); + assert(chunk_offset_in_buffer + chunk_len <= + buffer_offset_ + buffer_.CurrentSize()); + if (chunk_len > 0) { + copy_data_to_new_buffer = true; + } else { + // this reset is not necessary, but just to be safe. 
+ chunk_offset_in_buffer = 0; + } + } + } + + // Create a new buffer only if current capacity is not sufficient, and memcopy + // bytes from old buffer if needed (i.e., if chunk_len is greater than 0). + if (buffer_.Capacity() < roundup_len) { + buffer_.Alignment(alignment); + buffer_.AllocateNewBuffer(static_cast(roundup_len), + copy_data_to_new_buffer, chunk_offset_in_buffer, + static_cast(chunk_len)); + } else if (chunk_len > 0) { + // New buffer not needed. But memmove bytes from tail to the beginning since + // chunk_len is greater than 0. + buffer_.RefitTail(static_cast(chunk_offset_in_buffer), + static_cast(chunk_len)); + } + + Slice result; + s = reader->Read(rounddown_offset + chunk_len, + static_cast(roundup_len - chunk_len), &result, + buffer_.BufferStart() + chunk_len, for_compaction); + if (s.ok()) { + buffer_offset_ = rounddown_offset; + buffer_.Size(static_cast(chunk_len) + result.size()); + } + return s; +} + +bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n, + Slice* result, bool for_compaction) { + if (track_min_offset_ && offset < min_offset_read_) { + min_offset_read_ = static_cast(offset); + } + if (!enable_ || offset < buffer_offset_) { + return false; + } + + // If the buffer contains only a few of the requested bytes: + // If readahead is enabled: prefetch the remaining bytes + readadhead bytes + // and satisfy the request. + // If readahead is not enabled: return false. + if (offset + n > buffer_offset_ + buffer_.CurrentSize()) { + if (readahead_size_ > 0) { + assert(file_reader_ != nullptr); + assert(max_readahead_size_ >= readahead_size_); + Status s; + if (for_compaction) { + s = Prefetch(file_reader_, offset, std::max(n, readahead_size_), + for_compaction); + } else { + s = Prefetch(file_reader_, offset, n + readahead_size_, for_compaction); + } + if (!s.ok()) { + return false; + } + readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2); + } else { + return false; + } + } + + uint64_t offset_in_buffer = offset - buffer_offset_; + *result = Slice(buffer_.BufferStart() + offset_in_buffer, n); + return true; +} +} // namespace rocksdb diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h new file mode 100644 index 00000000000..c3cacf1020e --- /dev/null +++ b/file/file_prefetch_buffer.h @@ -0,0 +1,97 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include +#include "file/random_access_file_reader.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "util/aligned_buffer.h" + +namespace rocksdb { + +// FilePrefetchBuffer is a smart buffer to store and read data from a file. +class FilePrefetchBuffer { + public: + // Constructor. + // + // All arguments are optional. + // file_reader : the file reader to use. Can be a nullptr. + // readahead_size : the initial readahead size. + // max_readahead_size : the maximum readahead size. + // If max_readahead_size > readahead_size, the readahead size will be + // doubled on every IO until max_readahead_size is hit. + // Typically this is set as a multiple of readahead_size. 
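+  // (Illustrative numbers only: with readahead_size = 8 KB and
+  // max_readahead_size = 256 KB, the readahead component grows
+  // 8 KB -> 16 KB -> 32 KB ... and is capped at 256 KB.)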
+ // max_readahead_size should be greater than or equal to readahead_size. + // enable : controls whether reading from the buffer is enabled. + // If false, TryReadFromCache() always returns false, and we only take stats + // for the minimum offset if track_min_offset = true. + // track_min_offset : Track the minimum offset ever read and collect stats on + // it. Used for adaptable readahead of the file footer/metadata. + // + // Automatic readahead is enabled for a file if file_reader, readahead_size, + // and max_readahead_size are passed in. + // If file_reader is a nullptr, setting readahead_size and max_readahead_size + // does not make any sense. So it does nothing. + // A user can construct a FilePrefetchBuffer without any arguments, but then + // must use `Prefetch` to load data into the buffer. + FilePrefetchBuffer(RandomAccessFileReader* file_reader = nullptr, + size_t readahead_size = 0, size_t max_readahead_size = 0, + bool enable = true, bool track_min_offset = false) + : buffer_offset_(0), + file_reader_(file_reader), + readahead_size_(readahead_size), + max_readahead_size_(max_readahead_size), + min_offset_read_(port::kMaxSizet), + enable_(enable), + track_min_offset_(track_min_offset) {} + + // Load data into the buffer from a file. + // reader : the file reader. + // offset : the file offset to start reading from. + // n : the number of bytes to read. + // for_compaction : if prefetch is done for compaction read. + Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n, + bool for_compaction = false); + + // Tries returning the data for a file read from this buffer, if that data is + // in the buffer. + // It handles tracking the minimum read offset if track_min_offset = true. + // It also does the exponential readahead when readahead_size is set as part + // of the constructor. + // + // offset : the file offset. + // n : the number of bytes. + // result : output buffer to put the data into. + // for_compaction : if cache read is done for compaction read. + bool TryReadFromCache(uint64_t offset, size_t n, Slice* result, + bool for_compaction = false); + + // The minimum `offset` ever passed to TryReadFromCache(). This will only be + // tracked if track_min_offset = true. + size_t min_offset_read() const { return min_offset_read_; } + + private: + AlignedBuffer buffer_; + uint64_t buffer_offset_; + RandomAccessFileReader* file_reader_; + size_t readahead_size_; + size_t max_readahead_size_; + // The minimum `offset` ever passed to TryReadFromCache(). + size_t min_offset_read_; + // if false, TryReadFromCache() always returns false, and we only take stats + // for track_min_offset_ if track_min_offset_ = true + bool enable_; + // If true, track minimum `offset` ever passed to TryReadFromCache(), which + // can be fetched from min_offset_read(). + bool track_min_offset_; +}; +} // namespace rocksdb diff --git a/util/file_util.cc b/file/file_util.cc similarity index 79% rename from util/file_util.cc rename to file/file_util.cc index ba1b4744bbb..f1bf6596ba6 100644 --- a/util/file_util.cc +++ b/file/file_util.cc @@ -3,14 +3,16 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory).
// -#include "util/file_util.h" +#include "file/file_util.h" #include #include +#include "file/random_access_file_reader.h" +#include "file/sequence_file_reader.h" +#include "file/sst_file_manager_impl.h" +#include "file/writable_file_writer.h" #include "rocksdb/env.h" -#include "util/sst_file_manager_impl.h" -#include "util/file_reader_writer.h" namespace rocksdb { @@ -88,12 +90,12 @@ Status CreateFile(Env* env, const std::string& destination, } Status DeleteDBFile(const ImmutableDBOptions* db_options, - const std::string& fname, const std::string& dir_to_sync, - const bool force_bg) { + const std::string& fname, const std::string& dir_to_sync, + const bool force_bg, const bool force_fg) { #ifndef ROCKSDB_LITE SstFileManagerImpl* sfm = static_cast(db_options->sst_file_manager.get()); - if (sfm) { + if (sfm && !force_fg) { return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg); } else { return db_options->env->DeleteFile(fname); @@ -101,10 +103,22 @@ Status DeleteDBFile(const ImmutableDBOptions* db_options, #else (void)dir_to_sync; (void)force_bg; + (void)force_fg; // SstFileManager is not supported in ROCKSDB_LITE // Delete file immediately return db_options->env->DeleteFile(fname); #endif } +bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options) { + bool same = false; + assert(!db_options->db_paths.empty()); + Status s = db_options->env->AreFilesSame(db_options->wal_dir, + db_options->db_paths[0].path, &same); + if (s.IsNotSupported()) { + same = db_options->wal_dir == db_options->db_paths[0].path; + } + return same; +} + } // namespace rocksdb diff --git a/util/file_util.h b/file/file_util.h similarity index 81% rename from util/file_util.h rename to file/file_util.h index c3b365c8bc3..75d6d7eb9fe 100644 --- a/util/file_util.h +++ b/file/file_util.h @@ -6,11 +6,11 @@ #pragma once #include +#include "file/filename.h" #include "options/db_options.h" #include "rocksdb/env.h" #include "rocksdb/status.h" #include "rocksdb/types.h" -#include "util/filename.h" namespace rocksdb { // use_fsync maps to options.use_fsync, which determines the way that @@ -24,7 +24,9 @@ extern Status CreateFile(Env* env, const std::string& destination, extern Status DeleteDBFile(const ImmutableDBOptions* db_options, const std::string& fname, - const std::string& path_to_sync, - const bool force_bg = false); + const std::string& path_to_sync, const bool force_bg, + const bool force_fg); + +extern bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options); } // namespace rocksdb diff --git a/util/filename.cc b/file/filename.cc similarity index 88% rename from util/filename.cc rename to file/filename.cc index 32289aecb4b..5a3fa290226 100644 --- a/util/filename.cc +++ b/file/filename.cc @@ -6,22 +6,18 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include "util/filename.h" -#include +#include "file/filename.h" +#include #include #include #include +#include "file/writable_file_writer.h" +#include "logging/logging.h" #include "rocksdb/env.h" -#include "util/file_reader_writer.h" -#include "util/logging.h" +#include "test_util/sync_point.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { @@ -61,13 +57,16 @@ static size_t GetInfoLogPrefix(const std::string& path, char* dest, int len) { return write_idx; } +static std::string MakeFileName(uint64_t number, const char* suffix) { + char buf[100]; + snprintf(buf, sizeof(buf), "%06llu.%s", + static_cast(number), suffix); + return buf; +} + static std::string MakeFileName(const std::string& name, uint64_t number, const char* suffix) { - char buf[100]; - snprintf(buf, sizeof(buf), "/%06llu.%s", - static_cast(number), - suffix); - return name + buf; + return name + "/" + MakeFileName(number, suffix); } std::string LogFileName(const std::string& name, uint64_t number) { @@ -75,6 +74,11 @@ std::string LogFileName(const std::string& name, uint64_t number) { return MakeFileName(name, number, "log"); } +std::string LogFileName(uint64_t number) { + assert(number > 0); + return MakeFileName(number, "log"); +} + std::string BlobFileName(const std::string& blobdirname, uint64_t number) { assert(number > 0); return MakeFileName(blobdirname, number, kRocksDBBlobFileExt.c_str()); @@ -99,6 +103,10 @@ std::string MakeTableFileName(const std::string& path, uint64_t number) { return MakeFileName(path, number, kRocksDbTFileExt.c_str()); } +std::string MakeTableFileName(uint64_t number) { + return MakeFileName(number, kRocksDbTFileExt.c_str()); +} + std::string Rocks2LevelTableFileName(const std::string& fullname) { assert(fullname.size() > kRocksDbTFileExt.size() + 1); if (fullname.size() <= kRocksDbTFileExt.size() + 1) { @@ -385,8 +393,14 @@ Status SetCurrentFile(Env* env, const std::string& dbname, return s; } -Status SetIdentityFile(Env* env, const std::string& dbname) { - std::string id = env->GenerateUniqueId(); +Status SetIdentityFile(Env* env, const std::string& dbname, + const std::string& db_id) { + std::string id; + if (db_id.empty()) { + id = env->GenerateUniqueId(); + } else { + id = db_id; + } assert(!id.empty()); // Reserve the filename dbname/000000.dbtmp for the temporary identity file std::string tmp = TempFileName(dbname, 0); @@ -407,4 +421,36 @@ Status SyncManifest(Env* env, const ImmutableDBOptions* db_options, return file->Sync(db_options->use_fsync); } +Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, + const std::string& dbname, std::string* parent_dir, + std::vector* info_log_list) { + assert(parent_dir != nullptr); + assert(info_log_list != nullptr); + uint64_t number = 0; + FileType type = kLogFile; + + if (!db_log_dir.empty()) { + *parent_dir = db_log_dir; + } else { + *parent_dir = dbname; + } + + InfoLogPrefix info_log_prefix(!db_log_dir.empty(), dbname); + + std::vector file_names; + Status s = env->GetChildren(*parent_dir, &file_names); + + if (!s.ok()) { + return s; + } + + for (auto& f : file_names) { + if (ParseFileName(f, &number, info_log_prefix.prefix, &type) && + (type == kInfoLogFile)) { + info_log_list->push_back(f); + } + } + return Status::OK(); +} + } // namespace rocksdb diff --git a/util/filename.h b/file/filename.h similarity index 91% rename from util/filename.h rename to file/filename.h index eea6b1b02fd..ad19d389594 
100644 --- a/util/filename.h +++ b/file/filename.h @@ -47,6 +47,8 @@ enum FileType { // "dbname". extern std::string LogFileName(const std::string& dbname, uint64_t number); +extern std::string LogFileName(uint64_t number); + extern std::string BlobFileName(const std::string& bdirname, uint64_t number); extern std::string BlobFileName(const std::string& dbname, @@ -63,6 +65,8 @@ extern std::string ArchivedLogFileName(const std::string& dbname, extern std::string MakeTableFileName(const std::string& name, uint64_t number); +extern std::string MakeTableFileName(uint64_t number); + // Return the name of sstable with LevelDB suffix // created from RocksDB sstable suffixed name extern std::string Rocks2LevelTableFileName(const std::string& fullname); @@ -163,10 +167,19 @@ extern Status SetCurrentFile(Env* env, const std::string& dbname, Directory* directory_to_fsync); // Make the IDENTITY file for the db -extern Status SetIdentityFile(Env* env, const std::string& dbname); +extern Status SetIdentityFile(Env* env, const std::string& dbname, + const std::string& db_id = {}); // Sync manifest file `file`. extern Status SyncManifest(Env* env, const ImmutableDBOptions* db_options, WritableFileWriter* file); +// Return list of file names of info logs in `file_names`. +// The list only contains file name. The parent directory name is stored +// in `parent_dir`. +// `db_log_dir` should be the one as in options.db_log_dir +extern Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, + const std::string& dbname, + std::string* parent_dir, + std::vector* file_names); } // namespace rocksdb diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc new file mode 100644 index 00000000000..5b5a19ff862 --- /dev/null +++ b/file/random_access_file_reader.cc @@ -0,0 +1,188 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "file/random_access_file_reader.h" + +#include +#include + +#include "monitoring/histogram.h" +#include "monitoring/iostats_context_imp.h" +#include "port/port.h" +#include "test_util/sync_point.h" +#include "util/random.h" +#include "util/rate_limiter.h" + +namespace rocksdb { +Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, + char* scratch, bool for_compaction) const { + Status s; + uint64_t elapsed = 0; + { + StopWatch sw(env_, stats_, hist_type_, + (stats_ != nullptr) ? 
&elapsed : nullptr, true /*overwrite*/, + true /*delay_enabled*/); + auto prev_perf_level = GetPerfLevel(); + IOSTATS_TIMER_GUARD(read_nanos); + if (use_direct_io()) { +#ifndef ROCKSDB_LITE + size_t alignment = file_->GetRequiredBufferAlignment(); + size_t aligned_offset = + TruncateToPageBoundary(alignment, static_cast(offset)); + size_t offset_advance = static_cast(offset) - aligned_offset; + size_t read_size = + Roundup(static_cast(offset + n), alignment) - aligned_offset; + AlignedBuffer buf; + buf.Alignment(alignment); + buf.AllocateNewBuffer(read_size); + while (buf.CurrentSize() < read_size) { + size_t allowed; + if (for_compaction && rate_limiter_ != nullptr) { + allowed = rate_limiter_->RequestToken( + buf.Capacity() - buf.CurrentSize(), buf.Alignment(), + Env::IOPriority::IO_LOW, stats_, RateLimiter::OpType::kRead); + } else { + assert(buf.CurrentSize() == 0); + allowed = read_size; + } + Slice tmp; + + FileOperationInfo::TimePoint start_ts; + uint64_t orig_offset = 0; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + orig_offset = aligned_offset + buf.CurrentSize(); + } + { + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); + s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, &tmp, + buf.Destination()); + } + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts, + s); + } + + buf.Size(buf.CurrentSize() + tmp.size()); + if (!s.ok() || tmp.size() < allowed) { + break; + } + } + size_t res_len = 0; + if (s.ok() && offset_advance < buf.CurrentSize()) { + res_len = buf.Read(scratch, offset_advance, + std::min(buf.CurrentSize() - offset_advance, n)); + } + *result = Slice(scratch, res_len); +#endif // !ROCKSDB_LITE + } else { + size_t pos = 0; + const char* res_scratch = nullptr; + while (pos < n) { + size_t allowed; + if (for_compaction && rate_limiter_ != nullptr) { + if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) { + sw.DelayStart(); + } + allowed = rate_limiter_->RequestToken(n - pos, 0 /* alignment */, + Env::IOPriority::IO_LOW, stats_, + RateLimiter::OpType::kRead); + if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) { + sw.DelayStop(); + } + } else { + allowed = n; + } + Slice tmp_result; + +#ifndef ROCKSDB_LITE + FileOperationInfo::TimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + } +#endif + { + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); + s = file_->Read(offset + pos, allowed, &tmp_result, scratch + pos); + } +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts, + finish_ts, s); + } +#endif + + if (res_scratch == nullptr) { + // we can't simply use `scratch` because reads of mmap'd files return + // data in a different buffer. + res_scratch = tmp_result.data(); + } else { + // make sure chunks are inserted contiguously into `res_scratch`. + assert(tmp_result.data() == res_scratch + pos); + } + pos += tmp_result.size(); + if (!s.ok() || tmp_result.size() < allowed) { + break; + } + } + *result = Slice(res_scratch, s.ok() ? 
pos : 0); + } + IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size()); + SetPerfLevel(prev_perf_level); + } + if (stats_ != nullptr && file_read_hist_ != nullptr) { + file_read_hist_->Add(elapsed); + } + + return s; +} + +Status RandomAccessFileReader::MultiRead(ReadRequest* read_reqs, + size_t num_reqs) const { + Status s; + uint64_t elapsed = 0; + assert(!use_direct_io()); + { + StopWatch sw(env_, stats_, hist_type_, + (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/, + true /*delay_enabled*/); + auto prev_perf_level = GetPerfLevel(); + IOSTATS_TIMER_GUARD(read_nanos); + +#ifndef ROCKSDB_LITE + FileOperationInfo::TimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + } +#endif // ROCKSDB_LITE + { + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); + s = file_->MultiRead(read_reqs, num_reqs); + } + for (size_t i = 0; i < num_reqs; ++i) { +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileReadFinish(read_reqs[i].offset, read_reqs[i].result.size(), + start_ts, finish_ts, read_reqs[i].status); + } +#endif // ROCKSDB_LITE + IOSTATS_ADD_IF_POSITIVE(bytes_read, read_reqs[i].result.size()); + } + SetPerfLevel(prev_perf_level); + } + if (stats_ != nullptr && file_read_hist_ != nullptr) { + file_read_hist_->Add(elapsed); + } + + return s; +} +} // namespace rocksdb diff --git a/file/random_access_file_reader.h b/file/random_access_file_reader.h new file mode 100644 index 00000000000..abbc71ff112 --- /dev/null +++ b/file/random_access_file_reader.h @@ -0,0 +1,120 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/listener.h" +#include "rocksdb/rate_limiter.h" +#include "util/aligned_buffer.h" + +namespace rocksdb { + +class Statistics; +class HistogramImpl; + +// RandomAccessFileReader is a wrapper on top of Env::RandomAccessFile. It is +// responsible for: +// - Handling Buffered and Direct reads appropriately. +// - Rate limiting compaction reads. +// - Notifying any interested listeners on the completion of a read. +// - Updating IO stats.
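+// A minimal usage sketch (illustrative only; `fname`, `env_options` and the
+// local variables are hypothetical) based on the constructor and Read()
+// declared below:
+//
+//   std::unique_ptr<RandomAccessFile> raf;
+//   env->NewRandomAccessFile(fname, &raf, env_options);
+//   RandomAccessFileReader reader(std::move(raf), fname, env);
+//   char scratch[4096];
+//   Slice result;
+//   Status s = reader.Read(/*offset=*/0, sizeof(scratch), &result, scratch);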
+class RandomAccessFileReader { + private: +#ifndef ROCKSDB_LITE + void NotifyOnFileReadFinish(uint64_t offset, size_t length, + const FileOperationInfo::TimePoint& start_ts, + const FileOperationInfo::TimePoint& finish_ts, + const Status& status) const { + FileOperationInfo info(file_name_, start_ts, finish_ts); + info.offset = offset; + info.length = length; + info.status = status; + + for (auto& listener : listeners_) { + listener->OnFileReadFinish(info); + } + } +#endif // ROCKSDB_LITE + + bool ShouldNotifyListeners() const { return !listeners_.empty(); } + + std::unique_ptr file_; + std::string file_name_; + Env* env_; + Statistics* stats_; + uint32_t hist_type_; + HistogramImpl* file_read_hist_; + RateLimiter* rate_limiter_; + std::vector> listeners_; + + public: + explicit RandomAccessFileReader( + std::unique_ptr&& raf, std::string _file_name, + Env* env = nullptr, Statistics* stats = nullptr, uint32_t hist_type = 0, + HistogramImpl* file_read_hist = nullptr, + RateLimiter* rate_limiter = nullptr, + const std::vector>& listeners = {}) + : file_(std::move(raf)), + file_name_(std::move(_file_name)), + env_(env), + stats_(stats), + hist_type_(hist_type), + file_read_hist_(file_read_hist), + rate_limiter_(rate_limiter), + listeners_() { +#ifndef ROCKSDB_LITE + std::for_each(listeners.begin(), listeners.end(), + [this](const std::shared_ptr& e) { + if (e->ShouldBeNotifiedOnFileIO()) { + listeners_.emplace_back(e); + } + }); +#else // !ROCKSDB_LITE + (void)listeners; +#endif + } + + RandomAccessFileReader(RandomAccessFileReader&& o) ROCKSDB_NOEXCEPT { + *this = std::move(o); + } + + RandomAccessFileReader& operator=(RandomAccessFileReader&& o) + ROCKSDB_NOEXCEPT { + file_ = std::move(o.file_); + env_ = std::move(o.env_); + stats_ = std::move(o.stats_); + hist_type_ = std::move(o.hist_type_); + file_read_hist_ = std::move(o.file_read_hist_); + rate_limiter_ = std::move(o.rate_limiter_); + return *this; + } + + RandomAccessFileReader(const RandomAccessFileReader&) = delete; + RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete; + + Status Read(uint64_t offset, size_t n, Slice* result, char* scratch, + bool for_compaction = false) const; + + Status MultiRead(ReadRequest* reqs, size_t num_reqs) const; + + Status Prefetch(uint64_t offset, size_t n) const { + return file_->Prefetch(offset, n); + } + + RandomAccessFile* file() { return file_.get(); } + + std::string file_name() const { return file_name_; } + + bool use_direct_io() const { return file_->use_direct_io(); } +}; +} // namespace rocksdb diff --git a/file/read_write_util.cc b/file/read_write_util.cc new file mode 100644 index 00000000000..892499b8cfa --- /dev/null +++ b/file/read_write_util.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "file/read_write_util.h" + +#include +#include "test_util/sync_point.h" + +namespace rocksdb { +Status NewWritableFile(Env* env, const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + Status s = env->NewWritableFile(fname, result, options); + TEST_KILL_RANDOM("NewWritableFile:0", rocksdb_kill_odds * REDUCE_ODDS2); + return s; +} + +bool ReadOneLine(std::istringstream* iss, SequentialFile* seq_file, + std::string* output, bool* has_data, Status* result) { + const int kBufferSize = 8192; + char buffer[kBufferSize + 1]; + Slice input_slice; + + std::string line; + bool has_complete_line = false; + while (!has_complete_line) { + if (std::getline(*iss, line)) { + has_complete_line = !iss->eof(); + } else { + has_complete_line = false; + } + if (!has_complete_line) { + // if we're not sure whether we have a complete line, + // further read from the file. + if (*has_data) { + *result = seq_file->Read(kBufferSize, &input_slice, buffer); + } + if (input_slice.size() == 0) { + // meaning we have read all the data + *has_data = false; + break; + } else { + iss->str(line + input_slice.ToString()); + // reset the internal state of iss so that we can keep reading it. + iss->clear(); + *has_data = (input_slice.size() == kBufferSize); + continue; + } + } + } + *output = line; + return *has_data || has_complete_line; +} + +#ifndef NDEBUG +bool IsFileSectorAligned(const size_t off, size_t sector_size) { + return off % sector_size == 0; +} +#endif // NDEBUG +} // namespace rocksdb diff --git a/file/read_write_util.h b/file/read_write_util.h new file mode 100644 index 00000000000..be975e854ff --- /dev/null +++ b/file/read_write_util.h @@ -0,0 +1,32 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include "rocksdb/env.h" + +namespace rocksdb { +// Returns a WritableFile. +// +// env : the Env. +// fname : the file name. +// result : output arg. A WritableFile based on `fname` returned. +// options : the Env Options. +extern Status NewWritableFile(Env* env, const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options); + +// Read a single line from a file. +bool ReadOneLine(std::istringstream* iss, SequentialFile* seq_file, + std::string* output, bool* has_data, Status* result); + +#ifndef NDEBUG +bool IsFileSectorAligned(const size_t off, size_t sector_size); +#endif // NDEBUG +} // namespace rocksdb diff --git a/file/readahead_raf.cc b/file/readahead_raf.cc new file mode 100644 index 00000000000..dc005b900b6 --- /dev/null +++ b/file/readahead_raf.cc @@ -0,0 +1,162 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "file/readahead_raf.h" + +#include +#include +#include "file/read_write_util.h" +#include "util/aligned_buffer.h" +#include "util/rate_limiter.h" + +namespace rocksdb { +namespace { +class ReadaheadRandomAccessFile : public RandomAccessFile { + public: + ReadaheadRandomAccessFile(std::unique_ptr&& file, + size_t readahead_size) + : file_(std::move(file)), + alignment_(file_->GetRequiredBufferAlignment()), + readahead_size_(Roundup(readahead_size, alignment_)), + buffer_(), + buffer_offset_(0) { + buffer_.Alignment(alignment_); + buffer_.AllocateNewBuffer(readahead_size_); + } + + ReadaheadRandomAccessFile(const ReadaheadRandomAccessFile&) = delete; + + ReadaheadRandomAccessFile& operator=(const ReadaheadRandomAccessFile&) = + delete; + + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + // Read-ahead only make sense if we have some slack left after reading + if (n + alignment_ >= readahead_size_) { + return file_->Read(offset, n, result, scratch); + } + + std::unique_lock lk(lock_); + + size_t cached_len = 0; + // Check if there is a cache hit, meaning that [offset, offset + n) is + // either completely or partially in the buffer. If it's completely cached, + // including end of file case when offset + n is greater than EOF, then + // return. + if (TryReadFromCache(offset, n, &cached_len, scratch) && + (cached_len == n || buffer_.CurrentSize() < readahead_size_)) { + // We read exactly what we needed, or we hit end of file - return. + *result = Slice(scratch, cached_len); + return Status::OK(); + } + size_t advanced_offset = static_cast(offset + cached_len); + // In the case of cache hit advanced_offset is already aligned, means that + // chunk_offset equals to advanced_offset + size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset); + + Status s = ReadIntoBuffer(chunk_offset, readahead_size_); + if (s.ok()) { + // The data we need is now in cache, so we can safely read it + size_t remaining_len; + TryReadFromCache(advanced_offset, n - cached_len, &remaining_len, + scratch + cached_len); + *result = Slice(scratch, cached_len + remaining_len); + } + return s; + } + + Status Prefetch(uint64_t offset, size_t n) override { + if (n < readahead_size_) { + // Don't allow smaller prefetches than the configured `readahead_size_`. + // `Read()` assumes a smaller prefetch buffer indicates EOF was reached. + return Status::OK(); + } + + std::unique_lock lk(lock_); + + size_t offset_ = static_cast(offset); + size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset_); + if (prefetch_offset == buffer_offset_) { + return Status::OK(); + } + return ReadIntoBuffer(prefetch_offset, + Roundup(offset_ + n, alignment_) - prefetch_offset); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return file_->GetUniqueId(id, max_size); + } + + void Hint(AccessPattern pattern) override { file_->Hint(pattern); } + + Status InvalidateCache(size_t offset, size_t length) override { + std::unique_lock lk(lock_); + buffer_.Clear(); + return file_->InvalidateCache(offset, length); + } + + bool use_direct_io() const override { return file_->use_direct_io(); } + + private: + // Tries to read from buffer_ n bytes starting at offset. If anything was read + // from the cache, it sets cached_len to the number of bytes actually read, + // copies these number of bytes to scratch and returns true. + // If nothing was read sets cached_len to 0 and returns false. 
+ bool TryReadFromCache(uint64_t offset, size_t n, size_t* cached_len, + char* scratch) const { + if (offset < buffer_offset_ || + offset >= buffer_offset_ + buffer_.CurrentSize()) { + *cached_len = 0; + return false; + } + uint64_t offset_in_buffer = offset - buffer_offset_; + *cached_len = std::min( + buffer_.CurrentSize() - static_cast(offset_in_buffer), n); + memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len); + return true; + } + + // Reads into buffer_ the next n bytes from file_ starting at offset. + // Can actually read less if EOF was reached. + // Returns the status of the read operastion on the file. + Status ReadIntoBuffer(uint64_t offset, size_t n) const { + if (n > buffer_.Capacity()) { + n = buffer_.Capacity(); + } + assert(IsFileSectorAligned(offset, alignment_)); + assert(IsFileSectorAligned(n, alignment_)); + Slice result; + Status s = file_->Read(offset, n, &result, buffer_.BufferStart()); + if (s.ok()) { + buffer_offset_ = offset; + buffer_.Size(result.size()); + assert(result.size() == 0 || buffer_.BufferStart() == result.data()); + } + return s; + } + + const std::unique_ptr file_; + const size_t alignment_; + const size_t readahead_size_; + + mutable std::mutex lock_; + // The buffer storing the prefetched data + mutable AlignedBuffer buffer_; + // The offset in file_, corresponding to data stored in buffer_ + mutable uint64_t buffer_offset_; +}; +} // namespace + +std::unique_ptr NewReadaheadRandomAccessFile( + std::unique_ptr&& file, size_t readahead_size) { + std::unique_ptr result( + new ReadaheadRandomAccessFile(std::move(file), readahead_size)); + return result; +} +} // namespace rocksdb diff --git a/file/readahead_raf.h b/file/readahead_raf.h new file mode 100644 index 00000000000..f6d64e77ac5 --- /dev/null +++ b/file/readahead_raf.h @@ -0,0 +1,27 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include "rocksdb/env.h" + +namespace rocksdb { +// This file provides the following main abstractions: +// SequentialFileReader : wrapper over Env::SequentialFile +// RandomAccessFileReader : wrapper over Env::RandomAccessFile +// WritableFileWriter : wrapper over Env::WritableFile +// In addition, it also exposed NewReadaheadRandomAccessFile, NewWritableFile, +// and ReadOneLine primitives. + +// NewReadaheadRandomAccessFile provides a wrapper over RandomAccessFile to +// always prefetch additional data with every read. This is mainly used in +// Compaction Table Readers. +std::unique_ptr NewReadaheadRandomAccessFile( + std::unique_ptr&& file, size_t readahead_size); +} // namespace rocksdb diff --git a/file/sequence_file_reader.cc b/file/sequence_file_reader.cc new file mode 100644 index 00000000000..be766a68b38 --- /dev/null +++ b/file/sequence_file_reader.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "file/sequence_file_reader.h" + +#include +#include + +#include "file/read_write_util.h" +#include "monitoring/histogram.h" +#include "monitoring/iostats_context_imp.h" +#include "port/port.h" +#include "test_util/sync_point.h" +#include "util/aligned_buffer.h" +#include "util/random.h" +#include "util/rate_limiter.h" + +namespace rocksdb { +Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) { + Status s; + if (use_direct_io()) { +#ifndef ROCKSDB_LITE + size_t offset = offset_.fetch_add(n); + size_t alignment = file_->GetRequiredBufferAlignment(); + size_t aligned_offset = TruncateToPageBoundary(alignment, offset); + size_t offset_advance = offset - aligned_offset; + size_t size = Roundup(offset + n, alignment) - aligned_offset; + size_t r = 0; + AlignedBuffer buf; + buf.Alignment(alignment); + buf.AllocateNewBuffer(size); + Slice tmp; + s = file_->PositionedRead(aligned_offset, size, &tmp, buf.BufferStart()); + if (s.ok() && offset_advance < tmp.size()) { + buf.Size(tmp.size()); + r = buf.Read(scratch, offset_advance, + std::min(tmp.size() - offset_advance, n)); + } + *result = Slice(scratch, r); +#endif // !ROCKSDB_LITE + } else { + s = file_->Read(n, result, scratch); + } + IOSTATS_ADD(bytes_read, result->size()); + return s; +} + +Status SequentialFileReader::Skip(uint64_t n) { +#ifndef ROCKSDB_LITE + if (use_direct_io()) { + offset_ += static_cast(n); + return Status::OK(); + } +#endif // !ROCKSDB_LITE + return file_->Skip(n); +} + +namespace { +// This class wraps a SequentialFile, exposing the same API, with the difference +// of being able to prefetch up to readahead_size bytes and then serve them +// from memory, avoiding the entire round-trip if, for example, the data for the +// file is actually remote. +class ReadaheadSequentialFile : public SequentialFile { + public: + ReadaheadSequentialFile(std::unique_ptr&& file, + size_t readahead_size) + : file_(std::move(file)), + alignment_(file_->GetRequiredBufferAlignment()), + readahead_size_(Roundup(readahead_size, alignment_)), + buffer_(), + buffer_offset_(0), + read_offset_(0) { + buffer_.Alignment(alignment_); + buffer_.AllocateNewBuffer(readahead_size_); + } + + ReadaheadSequentialFile(const ReadaheadSequentialFile&) = delete; + + ReadaheadSequentialFile& operator=(const ReadaheadSequentialFile&) = delete; + + Status Read(size_t n, Slice* result, char* scratch) override { + std::unique_lock lk(lock_); + + size_t cached_len = 0; + // Check if there is a cache hit, meaning that [offset, offset + n) is + // either completely or partially in the buffer. If it's completely cached, + // including the end-of-file case when offset + n is greater than EOF, then + // return. + if (TryReadFromCache(n, &cached_len, scratch) && + (cached_len == n || buffer_.CurrentSize() < readahead_size_)) { + // We read exactly what we needed, or we hit end of file - return.
+ *result = Slice(scratch, cached_len); + return Status::OK(); + } + n -= cached_len; + + Status s; + // Read-ahead only makes sense if we have some slack left after reading + if (n + alignment_ >= readahead_size_) { + s = file_->Read(n, result, scratch + cached_len); + if (s.ok()) { + read_offset_ += result->size(); + *result = Slice(scratch, cached_len + result->size()); + } + buffer_.Clear(); + return s; + } + + s = ReadIntoBuffer(readahead_size_); + if (s.ok()) { + // The data we need is now in cache, so we can safely read it + size_t remaining_len; + TryReadFromCache(n, &remaining_len, scratch + cached_len); + *result = Slice(scratch, cached_len + remaining_len); + } + return s; + } + + Status Skip(uint64_t n) override { + std::unique_lock lk(lock_); + Status s = Status::OK(); + // First check if we need to skip already cached data + if (buffer_.CurrentSize() > 0) { + // Do we need to skip beyond cached data? + if (read_offset_ + n >= buffer_offset_ + buffer_.CurrentSize()) { + // Yes. Skip whatever is in memory and adjust the offset accordingly + n -= buffer_offset_ + buffer_.CurrentSize() - read_offset_; + read_offset_ = buffer_offset_ + buffer_.CurrentSize(); + } else { + // No. The section to be skipped is entirely in cache. + read_offset_ += n; + n = 0; + } + } + if (n > 0) { + // We still need to skip more, so call the file API for skipping + s = file_->Skip(n); + if (s.ok()) { + read_offset_ += n; + } + buffer_.Clear(); + } + return s; + } + + Status PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) override { + return file_->PositionedRead(offset, n, result, scratch); + } + + Status InvalidateCache(size_t offset, size_t length) override { + std::unique_lock lk(lock_); + buffer_.Clear(); + return file_->InvalidateCache(offset, length); + } + + bool use_direct_io() const override { return file_->use_direct_io(); } + + private: + // Tries to read n bytes from buffer_. If anything was read from the cache, it + // sets cached_len to the number of bytes actually read, copies that many + // bytes to scratch and returns true. + // If nothing was read, sets cached_len to 0 and returns false. + bool TryReadFromCache(size_t n, size_t* cached_len, char* scratch) { + if (read_offset_ < buffer_offset_ || + read_offset_ >= buffer_offset_ + buffer_.CurrentSize()) { + *cached_len = 0; + return false; + } + uint64_t offset_in_buffer = read_offset_ - buffer_offset_; + *cached_len = std::min( + buffer_.CurrentSize() - static_cast(offset_in_buffer), n); + memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len); + read_offset_ += *cached_len; + return true; + } + + // Reads into buffer_ the next n bytes from file_. + // Can actually read less if EOF was reached. + // Returns the status of the read operation on the file. + Status ReadIntoBuffer(size_t n) { + if (n > buffer_.Capacity()) { + n = buffer_.Capacity(); + } + assert(IsFileSectorAligned(n, alignment_)); + Slice result; + Status s = file_->Read(n, &result, buffer_.BufferStart()); + if (s.ok()) { + buffer_offset_ = read_offset_; + buffer_.Size(result.size()); + assert(result.size() == 0 || buffer_.BufferStart() == result.data()); + } + return s; + } + + const std::unique_ptr file_; + const size_t alignment_; + const size_t readahead_size_; + + std::mutex lock_; + // The buffer storing the prefetched data + AlignedBuffer buffer_; + // The offset in file_, corresponding to data stored in buffer_ + uint64_t buffer_offset_; + // The offset up to which data was read from file_.
In fact, it can be larger + // than the actual file size, since the file_->Skip(n) call doesn't return the + // actual number of bytes that were skipped, which can be less than n. + // This is not a problem since read_offset_ is monotonically increasing and + // its only use is to figure out if the next piece of data should be read from + // buffer_ or file_ directly. + uint64_t read_offset_; +}; +} // namespace + +std::unique_ptr +SequentialFileReader::NewReadaheadSequentialFile( + std::unique_ptr&& file, size_t readahead_size) { + if (file->GetRequiredBufferAlignment() >= readahead_size) { + // Short-circuit and return the original file if readahead_size is + // too small and hence doesn't make sense to be used for prefetching. + return std::move(file); + } + std::unique_ptr result( + new ReadaheadSequentialFile(std::move(file), readahead_size)); + return result; +} +} // namespace rocksdb diff --git a/file/sequence_file_reader.h b/file/sequence_file_reader.h new file mode 100644 index 00000000000..6a6350e1d69 --- /dev/null +++ b/file/sequence_file_reader.h @@ -0,0 +1,66 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include "port/port.h" +#include "rocksdb/env.h" + +namespace rocksdb { + +// SequentialFileReader is a wrapper on top of Env::SequentialFile. It handles +// Buffered (i.e. when page cache is enabled) and Direct (with O_DIRECT / page +// cache disabled) reads appropriately, and also updates the IO stats. +class SequentialFileReader { + private: + std::unique_ptr file_; + std::string file_name_; + std::atomic offset_{0}; // read offset + + public: + explicit SequentialFileReader(std::unique_ptr&& _file, + const std::string& _file_name) + : file_(std::move(_file)), file_name_(_file_name) {} + + explicit SequentialFileReader(std::unique_ptr&& _file, + const std::string& _file_name, + size_t _readahead_size) + : file_(NewReadaheadSequentialFile(std::move(_file), _readahead_size)), + file_name_(_file_name) {} + + SequentialFileReader(SequentialFileReader&& o) ROCKSDB_NOEXCEPT { + *this = std::move(o); + } + + SequentialFileReader& operator=(SequentialFileReader&& o) ROCKSDB_NOEXCEPT { + file_ = std::move(o.file_); + return *this; + } + + SequentialFileReader(const SequentialFileReader&) = delete; + SequentialFileReader& operator=(const SequentialFileReader&) = delete; + + Status Read(size_t n, Slice* result, char* scratch); + + Status Skip(uint64_t n); + + SequentialFile* file() { return file_.get(); } + + std::string file_name() { return file_name_; } + + bool use_direct_io() const { return file_->use_direct_io(); } + + private: + // NewReadaheadSequentialFile provides a wrapper over SequentialFile to + // always prefetch additional data with every read.
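+  // If readahead_size is not larger than the file's required buffer
+  // alignment, the original file is returned unchanged (see the
+  // short-circuit in sequence_file_reader.cc).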
+ static std::unique_ptr NewReadaheadSequentialFile( + std::unique_ptr&& file, size_t readahead_size); +}; +} // namespace rocksdb diff --git a/util/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc similarity index 98% rename from util/sst_file_manager_impl.cc rename to file/sst_file_manager_impl.cc index 6a770b106e8..08ea873258a 100644 --- a/util/sst_file_manager_impl.cc +++ b/file/sst_file_manager_impl.cc @@ -3,21 +3,17 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/sst_file_manager_impl.h" +#include "file/sst_file_manager_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/sst_file_manager.h" +#include "test_util/sync_point.h" #include "util/mutexlock.h" -#include "util/sync_point.h" namespace rocksdb { @@ -264,8 +260,11 @@ void SstFileManagerImpl::ClearError() { return; } - uint64_t free_space; + uint64_t free_space = 0; Status s = env_->GetFreeSpace(path_, &free_space); + free_space = max_allowed_space_ > 0 + ? std::min(max_allowed_space_, free_space) + : free_space; if (s.ok()) { // In case of multi-DB instances, some of them may have experienced a // soft error and some a hard error. In the SstFileManagerImpl, a hard diff --git a/util/sst_file_manager_impl.h b/file/sst_file_manager_impl.h similarity index 99% rename from util/sst_file_manager_impl.h rename to file/sst_file_manager_impl.h index 211b4fa7160..89304227807 100644 --- a/util/sst_file_manager_impl.h +++ b/file/sst_file_manager_impl.h @@ -11,10 +11,10 @@ #include "port/port.h" -#include "db/compaction.h" +#include "db/compaction/compaction.h" #include "db/error_handler.h" +#include "file/delete_scheduler.h" #include "rocksdb/sst_file_manager.h" -#include "util/delete_scheduler.h" namespace rocksdb { diff --git a/file/writable_file_writer.cc b/file/writable_file_writer.cc new file mode 100644 index 00000000000..277e55500c1 --- /dev/null +++ b/file/writable_file_writer.cc @@ -0,0 +1,405 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
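The writable_file_writer.cc added below (its header appears later in this diff) carries the buffered and direct write paths. As a hedged sketch of how the class is typically driven, with a hypothetical path and payload:

#include <memory>
#include "file/writable_file_writer.h"
#include "rocksdb/env.h"

// Appends a payload through WritableFileWriter's aligned buffer, then syncs
// and closes. Under direct I/O the same calls go through WriteDirect().
rocksdb::Status WriteThroughWriter(rocksdb::Env* env) {
  std::unique_ptr<rocksdb::WritableFile> file;
  rocksdb::EnvOptions env_options;
  rocksdb::Status s = env->NewWritableFile(
      "/tmp/example.dat" /* hypothetical */, &file, env_options);
  if (!s.ok()) {
    return s;
  }
  rocksdb::WritableFileWriter writer(std::move(file), "/tmp/example.dat",
                                     env_options, env);
  s = writer.Append(rocksdb::Slice("some payload"));
  if (s.ok()) {
    s = writer.Sync(false /* use_fsync */);
  }
  if (s.ok()) {
    s = writer.Close();
  }
  return s;
}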
+ +#include "file/writable_file_writer.h" + +#include +#include + +#include "monitoring/histogram.h" +#include "monitoring/iostats_context_imp.h" +#include "port/port.h" +#include "test_util/sync_point.h" +#include "util/random.h" +#include "util/rate_limiter.h" + +namespace rocksdb { +Status WritableFileWriter::Append(const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + Status s; + pending_sync_ = true; + + TEST_KILL_RANDOM("WritableFileWriter::Append:0", + rocksdb_kill_odds * REDUCE_ODDS2); + + { + IOSTATS_TIMER_GUARD(prepare_write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite"); + writable_file_->PrepareWrite(static_cast(GetFileSize()), left); + } + + // See whether we need to enlarge the buffer to avoid the flush + if (buf_.Capacity() - buf_.CurrentSize() < left) { + for (size_t cap = buf_.Capacity(); + cap < max_buffer_size_; // There is still room to increase + cap *= 2) { + // See whether the next available size is large enough. + // Buffer will never be increased to more than max_buffer_size_. + size_t desired_capacity = std::min(cap * 2, max_buffer_size_); + if (desired_capacity - buf_.CurrentSize() >= left || + (use_direct_io() && desired_capacity == max_buffer_size_)) { + buf_.AllocateNewBuffer(desired_capacity, true); + break; + } + } + } + + // Flush only when buffered I/O + if (!use_direct_io() && (buf_.Capacity() - buf_.CurrentSize()) < left) { + if (buf_.CurrentSize() > 0) { + s = Flush(); + if (!s.ok()) { + return s; + } + } + assert(buf_.CurrentSize() == 0); + } + + // We never write directly to disk with direct I/O on. + // or we simply use it for its original purpose to accumulate many small + // chunks + if (use_direct_io() || (buf_.Capacity() >= left)) { + while (left > 0) { + size_t appended = buf_.Append(src, left); + left -= appended; + src += appended; + + if (left > 0) { + s = Flush(); + if (!s.ok()) { + break; + } + } + } + } else { + // Writing directly to file bypassing the buffer + assert(buf_.CurrentSize() == 0); + s = WriteBuffered(src, left); + } + + TEST_KILL_RANDOM("WritableFileWriter::Append:1", rocksdb_kill_odds); + if (s.ok()) { + filesize_ += data.size(); + } + return s; +} + +Status WritableFileWriter::Pad(const size_t pad_bytes) { + assert(pad_bytes < kDefaultPageSize); + size_t left = pad_bytes; + size_t cap = buf_.Capacity() - buf_.CurrentSize(); + + // Assume pad_bytes is small compared to buf_ capacity. So we always + // use buf_ rather than write directly to file in certain cases like + // Append() does. + while (left) { + size_t append_bytes = std::min(cap, left); + buf_.PadWith(append_bytes, 0); + left -= append_bytes; + if (left > 0) { + Status s = Flush(); + if (!s.ok()) { + return s; + } + } + cap = buf_.Capacity() - buf_.CurrentSize(); + } + pending_sync_ = true; + filesize_ += pad_bytes; + return Status::OK(); +} + +Status WritableFileWriter::Close() { + // Do not quit immediately on failure the file MUST be closed + Status s; + + // Possible to close it twice now as we MUST close + // in __dtor, simply flushing is not enough + // Windows when pre-allocating does not fill with zeros + // also with unbuffered access we also set the end of data. + if (!writable_file_) { + return s; + } + + s = Flush(); // flush cache to OS + + Status interim; + // In direct I/O mode we write whole pages so + // we need to let the file know where data ends. 
+ if (use_direct_io()) { + interim = writable_file_->Truncate(filesize_); + if (interim.ok()) { + interim = writable_file_->Fsync(); + } + if (!interim.ok() && s.ok()) { + s = interim; + } + } + + TEST_KILL_RANDOM("WritableFileWriter::Close:0", rocksdb_kill_odds); + interim = writable_file_->Close(); + if (!interim.ok() && s.ok()) { + s = interim; + } + + writable_file_.reset(); + TEST_KILL_RANDOM("WritableFileWriter::Close:1", rocksdb_kill_odds); + + return s; +} + +// write out the cached data to the OS cache or storage if direct I/O +// enabled +Status WritableFileWriter::Flush() { + Status s; + TEST_KILL_RANDOM("WritableFileWriter::Flush:0", + rocksdb_kill_odds * REDUCE_ODDS2); + + if (buf_.CurrentSize() > 0) { + if (use_direct_io()) { +#ifndef ROCKSDB_LITE + if (pending_sync_) { + s = WriteDirect(); + } +#endif // !ROCKSDB_LITE + } else { + s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize()); + } + if (!s.ok()) { + return s; + } + } + + s = writable_file_->Flush(); + + if (!s.ok()) { + return s; + } + + // sync OS cache to disk for every bytes_per_sync_ + // TODO: give log file and sst file different options (log + // files could be potentially cached in OS for their whole + // life time, thus we might not want to flush at all). + + // We try to avoid sync to the last 1MB of data. For two reasons: + // (1) avoid rewrite the same page that is modified later. + // (2) for older version of OS, write can block while writing out + // the page. + // Xfs does neighbor page flushing outside of the specified ranges. We + // need to make sure sync range is far from the write offset. + if (!use_direct_io() && bytes_per_sync_) { + const uint64_t kBytesNotSyncRange = + 1024 * 1024; // recent 1MB is not synced. + const uint64_t kBytesAlignWhenSync = 4 * 1024; // Align 4KB. 
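// Illustrative walk-through of the range-sync logic below, with hypothetical
// numbers: if bytes_per_sync_ is 1MB, last_sync_size_ is 4,194,304 and
// filesize_ is 10,400,000, then offset_sync_to starts at
// 10,400,000 - 1,048,576 = 9,351,424, is rounded down to the 4KB boundary
// 9,351,168, and since 9,351,168 - 4,194,304 >= 1MB the writer calls
// RangeSync(4,194,304, 5,156,864) and records last_sync_size_ = 9,351,168.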
+ if (filesize_ > kBytesNotSyncRange) { + uint64_t offset_sync_to = filesize_ - kBytesNotSyncRange; + offset_sync_to -= offset_sync_to % kBytesAlignWhenSync; + assert(offset_sync_to >= last_sync_size_); + if (offset_sync_to > 0 && + offset_sync_to - last_sync_size_ >= bytes_per_sync_) { + s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_); + last_sync_size_ = offset_sync_to; + } + } + } + + return s; +} + +Status WritableFileWriter::Sync(bool use_fsync) { + Status s = Flush(); + if (!s.ok()) { + return s; + } + TEST_KILL_RANDOM("WritableFileWriter::Sync:0", rocksdb_kill_odds); + if (!use_direct_io() && pending_sync_) { + s = SyncInternal(use_fsync); + if (!s.ok()) { + return s; + } + } + TEST_KILL_RANDOM("WritableFileWriter::Sync:1", rocksdb_kill_odds); + pending_sync_ = false; + return Status::OK(); +} + +Status WritableFileWriter::SyncWithoutFlush(bool use_fsync) { + if (!writable_file_->IsSyncThreadSafe()) { + return Status::NotSupported( + "Can't WritableFileWriter::SyncWithoutFlush() because " + "WritableFile::IsSyncThreadSafe() is false"); + } + TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1"); + Status s = SyncInternal(use_fsync); + TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2"); + return s; +} + +Status WritableFileWriter::SyncInternal(bool use_fsync) { + Status s; + IOSTATS_TIMER_GUARD(fsync_nanos); + TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0"); + auto prev_perf_level = GetPerfLevel(); + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_); + if (use_fsync) { + s = writable_file_->Fsync(); + } else { + s = writable_file_->Sync(); + } + SetPerfLevel(prev_perf_level); + return s; +} + +Status WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { + IOSTATS_TIMER_GUARD(range_sync_nanos); + TEST_SYNC_POINT("WritableFileWriter::RangeSync:0"); + return writable_file_->RangeSync(offset, nbytes); +} + +// This method writes to disk the specified data and makes use of the rate +// limiter if available +Status WritableFileWriter::WriteBuffered(const char* data, size_t size) { + Status s; + assert(!use_direct_io()); + const char* src = data; + size_t left = size; + + while (left > 0) { + size_t allowed; + if (rate_limiter_ != nullptr) { + allowed = rate_limiter_->RequestToken( + left, 0 /* alignment */, writable_file_->GetIOPriority(), stats_, + RateLimiter::OpType::kWrite); + } else { + allowed = left; + } + + { + IOSTATS_TIMER_GUARD(write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + +#ifndef ROCKSDB_LITE + FileOperationInfo::TimePoint start_ts; + uint64_t old_size = writable_file_->GetFileSize(); + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + old_size = next_write_offset_; + } +#endif + { + auto prev_perf_level = GetPerfLevel(); + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_); + s = writable_file_->Append(Slice(src, allowed)); + SetPerfLevel(prev_perf_level); + } +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s); + } +#endif + if (!s.ok()) { + return s; + } + } + + IOSTATS_ADD(bytes_written, allowed); + TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0", rocksdb_kill_odds); + + left -= allowed; + src += allowed; + } + buf_.Size(0); + return s; +} + +// This flushes the accumulated data in the buffer. We pad data with zeros if +// necessary to the whole page. +// However, during automatic flushes padding would not be necessary. 
+// We always use RateLimiter if available. We move (Refit) any buffer bytes +// that are left over the +// whole number of pages to be written again on the next flush because we can +// only write on aligned +// offsets. +#ifndef ROCKSDB_LITE +Status WritableFileWriter::WriteDirect() { + assert(use_direct_io()); + Status s; + const size_t alignment = buf_.Alignment(); + assert((next_write_offset_ % alignment) == 0); + + // Calculate whole page final file advance if all writes succeed + size_t file_advance = TruncateToPageBoundary(alignment, buf_.CurrentSize()); + + // Calculate the leftover tail, we write it here padded with zeros BUT we + // will write + // it again in the future either on Close() OR when the current whole page + // fills out + size_t leftover_tail = buf_.CurrentSize() - file_advance; + + // Round up and pad + buf_.PadToAlignmentWith(0); + + const char* src = buf_.BufferStart(); + uint64_t write_offset = next_write_offset_; + size_t left = buf_.CurrentSize(); + + while (left > 0) { + // Check how much is allowed + size_t size; + if (rate_limiter_ != nullptr) { + size = rate_limiter_->RequestToken(left, buf_.Alignment(), + writable_file_->GetIOPriority(), + stats_, RateLimiter::OpType::kWrite); + } else { + size = left; + } + + { + IOSTATS_TIMER_GUARD(write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + FileOperationInfo::TimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + } + // direct writes must be positional + s = writable_file_->PositionedAppend(Slice(src, size), write_offset); + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s); + } + if (!s.ok()) { + buf_.Size(file_advance + leftover_tail); + return s; + } + } + + IOSTATS_ADD(bytes_written, size); + left -= size; + src += size; + write_offset += size; + assert((next_write_offset_ % alignment) == 0); + } + + if (s.ok()) { + // Move the tail to the beginning of the buffer + // This never happens during normal Append but rather during + // explicit call to Flush()/Sync() or Close() + buf_.RefitTail(file_advance, leftover_tail); + // This is where we start writing next time which may or not be + // the actual file size on disk. They match if the buffer size + // is a multiple of whole pages otherwise filesize_ is leftover_tail + // behind + next_write_offset_ += file_advance; + } + return s; +} +#endif // !ROCKSDB_LITE +} // namespace rocksdb diff --git a/file/writable_file_writer.h b/file/writable_file_writer.h new file mode 100644 index 00000000000..09d612233af --- /dev/null +++ b/file/writable_file_writer.h @@ -0,0 +1,155 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/listener.h" +#include "rocksdb/rate_limiter.h" +#include "test_util/sync_point.h" +#include "util/aligned_buffer.h" + +namespace rocksdb { + +class Statistics; + +// WritableFileWriter is a wrapper on top of Env::WritableFile. 
It provides +// facilities to: +// - Handle Buffered and Direct writes. +// - Rate limit writes. +// - Flush and Sync the data to the underlying filesystem. +// - Notify any interested listeners on the completion of a write. +// - Update IO stats. +class WritableFileWriter { + private: +#ifndef ROCKSDB_LITE + void NotifyOnFileWriteFinish(uint64_t offset, size_t length, + const FileOperationInfo::TimePoint& start_ts, + const FileOperationInfo::TimePoint& finish_ts, + const Status& status) { + FileOperationInfo info(file_name_, start_ts, finish_ts); + info.offset = offset; + info.length = length; + info.status = status; + + for (auto& listener : listeners_) { + listener->OnFileWriteFinish(info); + } + } +#endif // ROCKSDB_LITE + + bool ShouldNotifyListeners() const { return !listeners_.empty(); } + + std::unique_ptr writable_file_; + std::string file_name_; + Env* env_; + AlignedBuffer buf_; + size_t max_buffer_size_; + // Actually written data size can be used for truncate + // not counting padding data + uint64_t filesize_; +#ifndef ROCKSDB_LITE + // This is necessary when we use unbuffered access + // and writes must happen on aligned offsets + // so we need to go back and write that page again + uint64_t next_write_offset_; +#endif // ROCKSDB_LITE + bool pending_sync_; + uint64_t last_sync_size_; + uint64_t bytes_per_sync_; + RateLimiter* rate_limiter_; + Statistics* stats_; + std::vector> listeners_; + + public: + WritableFileWriter( + std::unique_ptr&& file, const std::string& _file_name, + const EnvOptions& options, Env* env = nullptr, + Statistics* stats = nullptr, + const std::vector>& listeners = {}) + : writable_file_(std::move(file)), + file_name_(_file_name), + env_(env), + buf_(), + max_buffer_size_(options.writable_file_max_buffer_size), + filesize_(0), +#ifndef ROCKSDB_LITE + next_write_offset_(0), +#endif // ROCKSDB_LITE + pending_sync_(false), + last_sync_size_(0), + bytes_per_sync_(options.bytes_per_sync), + rate_limiter_(options.rate_limiter), + stats_(stats), + listeners_() { + TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0", + reinterpret_cast(max_buffer_size_)); + buf_.Alignment(writable_file_->GetRequiredBufferAlignment()); + buf_.AllocateNewBuffer(std::min((size_t)65536, max_buffer_size_)); +#ifndef ROCKSDB_LITE + std::for_each(listeners.begin(), listeners.end(), + [this](const std::shared_ptr& e) { + if (e->ShouldBeNotifiedOnFileIO()) { + listeners_.emplace_back(e); + } + }); +#else // !ROCKSDB_LITE + (void)listeners; +#endif + } + + WritableFileWriter(const WritableFileWriter&) = delete; + + WritableFileWriter& operator=(const WritableFileWriter&) = delete; + + ~WritableFileWriter() { Close(); } + + std::string file_name() const { return file_name_; } + + Status Append(const Slice& data); + + Status Pad(const size_t pad_bytes); + + Status Flush(); + + Status Close(); + + Status Sync(bool use_fsync); + + // Sync only the data that was already Flush()ed. Safe to call concurrently + // with Append() and Flush(). If !writable_file_->IsSyncThreadSafe(), + // returns NotSupported status. 
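  // For example (illustrative, not from this change): a background thread can
  // periodically call SyncWithoutFlush(false) to bound the amount of unsynced
  // data while a foreground thread keeps calling Append() and Flush(); that is
  // the concurrent usage the thread-safety note above refers to.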
+ Status SyncWithoutFlush(bool use_fsync); + + uint64_t GetFileSize() const { return filesize_; } + + Status InvalidateCache(size_t offset, size_t length) { + return writable_file_->InvalidateCache(offset, length); + } + + WritableFile* writable_file() const { return writable_file_.get(); } + + bool use_direct_io() { return writable_file_->use_direct_io(); } + + bool TEST_BufferIsEmpty() { return buf_.CurrentSize() == 0; } + + private: + // Used when os buffering is OFF and we are writing + // DMA such as in Direct I/O mode +#ifndef ROCKSDB_LITE + Status WriteDirect(); +#endif // !ROCKSDB_LITE + // Normal write + Status WriteBuffered(const char* data, size_t size); + Status RangeSync(uint64_t offset, uint64_t nbytes); + Status SyncInternal(bool use_fsync); +}; +} // namespace rocksdb diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index c88a6c17df2..d4e986a110a 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -175,11 +175,26 @@ struct AdvancedColumnFamilyOptions { // individual write buffers. Default: 1 int min_write_buffer_number_to_merge = 1; + // DEPRECATED // The total maximum number of write buffers to maintain in memory including // copies of buffers that have already been flushed. Unlike // max_write_buffer_number, this parameter does not affect flushing. - // This controls the minimum amount of write history that will be available - // in memory for conflict checking when Transactions are used. + // This parameter is being replaced by max_write_buffer_size_to_maintain. + // If both parameters are set to non-zero values, this parameter will be + // ignored. + int max_write_buffer_number_to_maintain = 0; + + // The total maximum size(bytes) of write buffers to maintain in memory + // including copies of buffers that have already been flushed. This parameter + // only affects trimming of flushed buffers and does not affect flushing. + // This controls the maximum amount of write history that will be available + // in memory for conflict checking when Transactions are used. The actual + // size of write history (flushed Memtables) might be higher than this limit + // if further trimming will reduce write history total size below this + // limit. For example, if max_write_buffer_size_to_maintain is set to 64MB, + // and there are three flushed Memtables, with sizes of 32MB, 20MB, 20MB. + // Because trimming the next Memtable of size 20MB will reduce total memory + // usage to 52MB which is below the limit, RocksDB will stop trimming. // // When using an OptimisticTransactionDB: // If this value is too low, some transactions may fail at commit time due @@ -192,14 +207,14 @@ struct AdvancedColumnFamilyOptions { // done for conflict detection. // // Setting this value to 0 will cause write buffers to be freed immediately - // after they are flushed. - // If this value is set to -1, 'max_write_buffer_number' will be used. + // after they are flushed. If this value is set to -1, + // 'max_write_buffer_number * write_buffer_size' will be used. // // Default: // If using a TransactionDB/OptimisticTransactionDB, the default value will - // be set to the value of 'max_write_buffer_number' if it is not explicitly - // set by the user. Otherwise, the default is 0. - int max_write_buffer_number_to_maintain = 0; + // be set to the value of 'max_write_buffer_number * write_buffer_size' + // if it is not explicitly set by the user. Otherwise, the default is 0. 
+ int64_t max_write_buffer_size_to_maintain = 0; // Allows thread-safe inplace updates. If this is true, there is no way to // achieve point-in-time consistency using snapshot or iterator (assuming @@ -632,17 +647,22 @@ struct AdvancedColumnFamilyOptions { bool report_bg_io_stats = false; // Files older than TTL will go through the compaction process. - // Supported in Level and FIFO compaction. // Pre-req: This needs max_open_files to be set to -1. // In Level: Non-bottom-level files older than TTL will go through the // compation process. // In FIFO: Files older than TTL will be deleted. // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60 + // In FIFO, this option will have the same meaning as + // periodic_compaction_seconds. Whichever stricter will be used. + // 0 means disabling. + // UINT64_MAX - 1 (0xfffffffffffffffe) is special flag to allow RocksDB to + // pick default. // - // Default: 0 (disabled) + // Default: 30 days for leveled compaction + block based table. disable + // otherwise. // // Dynamically changeable through SetOptions() API - uint64_t ttl = 0; + uint64_t ttl = 0xfffffffffffffffe; // Files older than this value will be picked up for compaction, and // re-written to the same level as they were before. @@ -652,13 +672,26 @@ struct AdvancedColumnFamilyOptions { // age is based on the file's last modified time (given by the underlying // Env). // - // Only supported in Level compaction. + // Supported in Level and FIFO compaction. + // In FIFO compaction, this option has the same meaning as TTL and whichever + // stricter will be used. // Pre-req: max_open_file == -1. // unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60 - // Default: 0 (disabled) + // + // Values: + // 0: Turn off Periodic compactions. + // UINT64_MAX - 1 (i.e 0xfffffffffffffffe): Let RocksDB control this feature + // as needed. For now, RocksDB will change this value to 30 days + // (i.e 30 * 24 * 60 * 60) so that every file goes through the compaction + // process at least once every 30 days if not compacted sooner. + // In FIFO compaction, since the option has the same meaning as ttl, + // when this value is left default, and ttl is left to 0, 30 days will be + // used. Otherwise, min(ttl, periodic_compaction_seconds) will be used. + // + // Default: UINT64_MAX - 1 (allow RocksDB to auto-tune) // // Dynamically changeable through SetOptions() API - uint64_t periodic_compaction_seconds = 0; + uint64_t periodic_compaction_seconds = 0xfffffffffffffffe; // If this option is set then 1 in N blocks are compressed // using a fast (lz4) and slow (zstd) compression algorithm. 
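A hedged sketch of opting into the options documented in this hunk; the values are illustrative, not recommendations, and the helper name is hypothetical:

#include "rocksdb/options.h"

rocksdb::ColumnFamilyOptions MakeCfOptions() {
  rocksdb::ColumnFamilyOptions cf_opts;
  // Successor of max_write_buffer_number_to_maintain: cap the flushed
  // memtable history kept for conflict checking at 64MB.
  cf_opts.max_write_buffer_size_to_maintain = 64 * 1024 * 1024;
  // Leaving ttl and periodic_compaction_seconds at their defaults
  // (UINT64_MAX - 1) lets RocksDB pick the 30-day behavior; an explicit
  // value overrides it, e.g. recompact files older than 7 days:
  cf_opts.periodic_compaction_seconds = 7 * 24 * 60 * 60;
  return cf_opts;
}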
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index a0ae7ca7785..ba54085080a 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -138,6 +138,10 @@ extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only( const rocksdb_options_t* options, const char* name, unsigned char error_if_log_file_exist, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary( + const rocksdb_options_t* options, const char* name, + const char* secondary_path, char** errptr); + extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open( const rocksdb_options_t* options, const char* path, char** errptr); @@ -218,6 +222,13 @@ rocksdb_open_for_read_only_column_families( rocksdb_column_family_handle_t** column_family_handles, unsigned char error_if_log_file_exist, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary_column_families( + const rocksdb_options_t* options, const char* name, + const char* secondary_path, int num_column_families, + const char** column_family_names, + const rocksdb_options_t** column_family_options, + rocksdb_column_family_handle_t** colummn_family_handles, char** errptr); + extern ROCKSDB_LIBRARY_API char** rocksdb_list_column_families( const rocksdb_options_t* options, const char* name, size_t* lencf, char** errptr); @@ -816,8 +827,6 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_multiplier( rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_base( rocksdb_options_t*, uint64_t); -extern ROCKSDB_LIBRARY_API void rocksdb_options_set_snap_refresh_nanos( - rocksdb_options_t*, uint64_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level_compaction_dynamic_level_bytes(rocksdb_options_t*, unsigned char); @@ -843,8 +852,13 @@ rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_write_buffer_size_to_maintain(rocksdb_options_t*, + int64_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_pipelined_write( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_unordered_write( + rocksdb_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_subcompactions( rocksdb_options_t*, uint32_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_jobs( @@ -1247,6 +1261,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_no_slowdown( rocksdb_writeoptions_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_low_pri( rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_writeoptions_set_memtable_insert_hint_per_batch(rocksdb_writeoptions_t*, + unsigned char); /* Compact range options */ @@ -1301,6 +1318,11 @@ extern ROCKSDB_LIBRARY_API void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n); extern ROCKSDB_LIBRARY_API void rocksdb_env_join_all_threads( rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* 
env); + extern ROCKSDB_LIBRARY_API void rocksdb_env_destroy(rocksdb_env_t*); extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create(); @@ -1368,6 +1390,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file_cf( const char* const* file_list, const size_t list_len, const rocksdb_ingestexternalfileoptions_t* opt, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_try_catch_up_with_primary( + rocksdb_t* db, char** errptr); + /* SliceTransform */ extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index ed7790aebb5..27b4a6f6432 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -36,6 +36,13 @@ class Cache; extern const bool kDefaultToAdaptiveMutex; +enum CacheMetadataChargePolicy { + kDontChargeCacheMetadata, + kFullChargeCacheMetadata +}; +const CacheMetadataChargePolicy kDefaultCacheMetadataChargePolicy = + kFullChargeCacheMetadata; + struct LRUCacheOptions { // Capacity of the cache. size_t capacity = 0; @@ -59,7 +66,7 @@ struct LRUCacheOptions { // // See also // BlockBasedTableOptions::cache_index_and_filter_blocks_with_high_priority. - double high_pri_pool_ratio = 0.0; + double high_pri_pool_ratio = 0.5; // If non-nullptr will use this allocator instead of system allocator when // allocating memory for cache blocks. Call this method before you start using @@ -76,17 +83,23 @@ struct LRUCacheOptions { // -DROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX, false otherwise. bool use_adaptive_mutex = kDefaultToAdaptiveMutex; + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy; + LRUCacheOptions() {} LRUCacheOptions(size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, double _high_pri_pool_ratio, std::shared_ptr _memory_allocator = nullptr, - bool _use_adaptive_mutex = kDefaultToAdaptiveMutex) + bool _use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy _metadata_charge_policy = + kDefaultCacheMetadataChargePolicy) : capacity(_capacity), num_shard_bits(_num_shard_bits), strict_capacity_limit(_strict_capacity_limit), high_pri_pool_ratio(_high_pri_pool_ratio), memory_allocator(std::move(_memory_allocator)), - use_adaptive_mutex(_use_adaptive_mutex) {} + use_adaptive_mutex(_use_adaptive_mutex), + metadata_charge_policy(_metadata_charge_policy) {} }; // Create a new cache with a fixed size capacity. The cache is sharded @@ -99,9 +112,11 @@ struct LRUCacheOptions { // will be at least 512KB and number of shard bits will not exceed 6. extern std::shared_ptr NewLRUCache( size_t capacity, int num_shard_bits = -1, - bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.0, + bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5, std::shared_ptr memory_allocator = nullptr, - bool use_adaptive_mutex = kDefaultToAdaptiveMutex); + bool use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy); extern std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts); @@ -110,10 +125,11 @@ extern std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts); // more detail. // // Return nullptr if it is not supported. 
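Before the NewClockCache signature change that follows, a hedged sketch of the LRUCacheOptions fields introduced above; the capacity and shard count are illustrative:

#include "rocksdb/cache.h"

std::shared_ptr<rocksdb::Cache> MakeBlockCache() {
  rocksdb::LRUCacheOptions cache_opts;
  cache_opts.capacity = 512 * 1024 * 1024;  // 512MB, illustrative
  cache_opts.num_shard_bits = 6;
  // high_pri_pool_ratio now defaults to 0.5; set it explicitly if desired.
  cache_opts.high_pri_pool_ratio = 0.5;
  // Metadata is charged against capacity by default; opt out like this:
  cache_opts.metadata_charge_policy = rocksdb::kDontChargeCacheMetadata;
  return rocksdb::NewLRUCache(cache_opts);
}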
-extern std::shared_ptr NewClockCache(size_t capacity, - int num_shard_bits = -1, - bool strict_capacity_limit = false); - +extern std::shared_ptr NewClockCache( + size_t capacity, int num_shard_bits = -1, + bool strict_capacity_limit = false, + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy); class Cache { public: // Depending on implementation, cache entries with high priority could be less @@ -122,6 +138,9 @@ class Cache { Cache(std::shared_ptr allocator = nullptr) : memory_allocator_(std::move(allocator)) {} + // No copying allowed + Cache(const Cache&) = delete; + Cache& operator=(const Cache&) = delete; // Destroys all existing entries by calling the "deleter" // function that was passed via the Insert() function. @@ -226,6 +245,9 @@ class Cache { // returns the memory size for the entries in use by the system virtual size_t GetPinnedUsage() const = 0; + // returns the charge for the specific entry in the cache. + virtual size_t GetCharge(Handle* handle) const = 0; + // Call this on shutdown if you want to speed it up. Cache will disown // any underlying data and will not free it on delete. This call will leak // memory - call this only if you're shutting down the process. @@ -247,18 +269,9 @@ class Cache { virtual std::string GetPrintableOptions() const { return ""; } - // Mark the last inserted object as being a raw data block. This will be used - // in tests. The default implementation does nothing. - virtual void TEST_mark_as_data_block(const Slice& /*key*/, - size_t /*charge*/) {} - MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); } private: - // No copying allowed - Cache(const Cache&); - Cache& operator=(const Cache&); - std::shared_ptr memory_allocator_; }; diff --git a/include/rocksdb/cleanable.h b/include/rocksdb/cleanable.h index 6dba8d9531c..3a111d545e2 100644 --- a/include/rocksdb/cleanable.h +++ b/include/rocksdb/cleanable.h @@ -23,12 +23,12 @@ namespace rocksdb { class Cleanable { public: Cleanable(); - ~Cleanable(); - // No copy constructor and copy assignment allowed. Cleanable(Cleanable&) = delete; Cleanable& operator=(Cleanable&) = delete; + ~Cleanable(); + // Move constructor and move assignment is allowed. Cleanable(Cleanable&&); Cleanable& operator=(Cleanable&&); diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index 46279f9a693..e30a9d01459 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -20,8 +20,22 @@ class Slice; // from multiple threads. class Comparator { public: + Comparator() : timestamp_size_(0) {} + + Comparator(size_t ts_sz) : timestamp_size_(ts_sz) {} + + Comparator(const Comparator& orig) : timestamp_size_(orig.timestamp_size_) {} + + Comparator& operator=(const Comparator& rhs) { + if (this != &rhs) { + timestamp_size_ = rhs.timestamp_size_; + } + return *this; + } + virtual ~Comparator() {} + static const char* Type() { return "Comparator"; } // Three-way comparison. Returns value: // < 0 iff "a" < "b", // == 0 iff "a" == "b", @@ -78,6 +92,20 @@ class Comparator { // The major use case is to determine if DataBlockHashIndex is compatible // with the customized comparator. 
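  // For example (illustrative): a comparator that ignores ASCII case must keep
  // the default of true below, because "foo" and "FOO" compare equal despite
  // having different bytes, whereas a plain byte-wise comparator can return
  // false.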
virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; } + + inline size_t timestamp_size() const { return timestamp_size_; } + + virtual int CompareWithoutTimestamp(const Slice& a, const Slice& b) const { + return Compare(a, b); + } + + virtual int CompareTimestamp(const Slice& /*ts1*/, + const Slice& /*ts2*/) const { + return 0; + } + + private: + size_t timestamp_size_; }; // Return a builtin comparator that uses lexicographic byte-wise diff --git a/include/rocksdb/convenience.h b/include/rocksdb/convenience.h index d3cbe6016ac..db26948a432 100644 --- a/include/rocksdb/convenience.h +++ b/include/rocksdb/convenience.h @@ -339,6 +339,13 @@ Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, const std::string& file_path); + +// Verify the checksum of file +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const ReadOptions& read_options, + const std::string& file_path); + #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 8bec4a56f94..bc93aeda164 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -59,6 +59,7 @@ class CompactionJobInfo; #endif extern const std::string kDefaultColumnFamilyName; +extern const std::string kPersistentStatsColumnFamilyName; struct ColumnFamilyDescriptor { std::string name; ColumnFamilyOptions options; @@ -115,6 +116,10 @@ struct IngestExternalFileArg { IngestExternalFileOptions options; }; +struct GetMergeOperandsOptions { + int expected_max_number_of_operands = 0; +}; + // A collections of table properties objects, where // key: is the table's file name. // value: the table properties object of the given table. @@ -232,9 +237,13 @@ class DB { // status in case there are any errors. This will not fsync the WAL files. // If syncing is required, the caller must first call SyncWAL(), or Write() // using an empty write batch with WriteOptions.sync=true. - // Regardless of the return status, the DB must be freed. If the return - // status is NotSupported(), then the DB implementation does cleanup in the - // destructor + // Regardless of the return status, the DB must be freed. + // If the return status is Aborted(), closing fails because there is + // unreleased snapshot in the system. In this case, users can release + // the unreleased snapshots and try again and expect it to succeed. For + // other status, recalling Close() will be no-op. + // If the return status is NotSupported(), then the DB implementation does + // cleanup in the destructor virtual Status Close() { return Status::NotSupported(); } // ListColumnFamilies will open the DB specified by argument name @@ -246,6 +255,10 @@ class DB { std::vector* column_families); DB() {} + // No copying allowed + DB(const DB&) = delete; + void operator=(const DB&) = delete; + virtual ~DB(); // Create a column_family and return the handle of column family @@ -398,6 +411,22 @@ class DB { return Get(options, DefaultColumnFamily(), key, value); } + // Returns all the merge operands corresponding to the key. If the + // number of merge operands in DB is greater than + // merge_operands_options.expected_max_number_of_operands + // no merge operands are returned and status is Incomplete. Merge operands + // returned are in the order of insertion. 
+ // merge_operands- Points to an array of at-least + // merge_operands_options.expected_max_number_of_operands and the + // caller is responsible for allocating it. If the status + // returned is Incomplete then number_of_operands will contain + // the total number of merge operands found in DB for key. + virtual Status GetMergeOperands( + const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* merge_operands, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) = 0; + // If keys[i] does not exist in the database, then the i'th returned // status will be one for which Status::IsNotFound() is true, and // (*values)[i] will be set to some arbitrary value (often ""). Otherwise, @@ -461,6 +490,47 @@ class DB { values++; } } + + // Overloaded MultiGet API that improves performance by batching operations + // in the read path for greater efficiency. Currently, only the block based + // table format with full filters are supported. Other table formats such + // as plain table, block based table with block based filters and + // partitioned indexes will still work, but will not get any performance + // benefits. + // Parameters - + // options - ReadOptions + // column_family - ColumnFamilyHandle* that the keys belong to. All the keys + // passed to the API are restricted to a single column family + // num_keys - Number of keys to lookup + // keys - Pointer to C style array of key Slices with num_keys elements + // values - Pointer to C style array of PinnableSlices with num_keys elements + // statuses - Pointer to C style array of Status with num_keys elements + // sorted_input - If true, it means the input keys are already sorted by key + // order, so the MultiGet() API doesn't have to sort them + // again. If false, the keys will be copied and sorted + // internally by the API - the input array will not be + // modified + virtual void MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool /*sorted_input*/ = false) { + std::vector cf; + std::vector user_keys; + std::vector status; + std::vector vals; + + for (size_t i = 0; i < num_keys; ++i) { + cf.emplace_back(column_families[i]); + user_keys.emplace_back(keys[i]); + } + status = MultiGet(options, cf, user_keys, &vals); + std::copy(status.begin(), status.end(), statuses); + for (auto& value : vals) { + values->PinSelf(value); + values++; + } + } + // If the key definitely does not exist in the database, then this method // returns false, else true. If the caller wants to obtain value when the key // is found in memory, a bool for 'value_found' must be passed. 'value_found' @@ -803,7 +873,7 @@ class DB { // stats should be included, or file stats approximation or both enum SizeApproximationFlags : uint8_t { NONE = 0, - INCLUDE_MEMTABLES = 1, + INCLUDE_MEMTABLES = 1 << 0, INCLUDE_FILES = 1 << 1 }; @@ -813,14 +883,24 @@ class DB { // Note that the returned sizes measure file system space usage, so // if the user data compresses by a factor of ten, the returned // sizes will be one-tenth the size of the corresponding user data size. - // - // If include_flags defines whether the returned size should include - // the recently written data in the mem-tables (if - // the mem-table type supports it), data serialized to disk, or both. 
- // include_flags should be of type DB::SizeApproximationFlags + virtual Status GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* range, int n, + uint64_t* sizes) = 0; + + // Simpler versions of the GetApproximateSizes() method above. + // The include_flags argumenbt must of type DB::SizeApproximationFlags + // and can not be NONE. virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, const Range* range, int n, uint64_t* sizes, - uint8_t include_flags = INCLUDE_FILES) = 0; + uint8_t include_flags = INCLUDE_FILES) { + SizeApproximationOptions options; + options.include_memtabtles = + (include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != 0; + options.include_files = + (include_flags & SizeApproximationFlags::INCLUDE_FILES) != 0; + GetApproximateSizes(options, column_family, range, n, sizes); + } virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes, uint8_t include_flags = INCLUDE_FILES) { GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags); @@ -960,6 +1040,9 @@ class DB { virtual Status EnableAutoCompaction( const std::vector& column_family_handles) = 0; + virtual void DisableManualCompaction() = 0; + virtual void EnableManualCompaction() = 0; + // Number of levels used for this DB. virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0; virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); } @@ -1088,6 +1171,28 @@ class DB { // Retrieve the sorted list of all wal files with earliest file first virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0; + // Retrieve information about the current wal file + // + // Note that the log might have rolled after this call in which case + // the current_log_file would not point to the current log file. + // + // Additionally, for the sake of optimization current_log_file->StartSequence + // would always be set to 0 + virtual Status GetCurrentWalFile( + std::unique_ptr* current_log_file) = 0; + + // Retrieves the creation time of the oldest file in the DB. + // This API only works if max_open_files = -1, if it is not then + // Status returned is Status::NotSupported() + // The file creation time is set using the env provided to the DB. + // If the DB was created from a very old release then its possible that + // the SST files might not have file_creation_time property and even after + // moving to a newer release its possible that some files never got compacted + // and may not have file_creation_time property. In both the cases + // file_creation_time is considered 0 which means this API will return + // creation_time = 0 as there wouldn't be a timestamp lower than 0. + virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) = 0; + // Note: this API is not yet consistent with WritePrepared transactions. // Sets iter to an iterator that is positioned at a write-batch containing // seq_number. If the sequence number is non existent, it returns an iterator @@ -1169,7 +1274,30 @@ class DB { virtual Status IngestExternalFiles( const std::vector& args) = 0; - virtual Status VerifyChecksum() = 0; + // CreateColumnFamilyWithImport() will create a new column family with + // column_family_name and import external SST files specified in metadata into + // this column family. + // (1) External SST files can be created using SstFileWriter. + // (2) External SST files can be exported from a particular column family in + // an existing DB. 
+ // Option in import_options specifies whether the external files are copied or + // moved (default is copy). When option specifies copy, managing files at + // external_file_path is caller's responsibility. When option specifies a + // move, the call ensures that the specified files at external_file_path are + // deleted on successful return and files are not modified on any error + // return. + // On error return, column family handle returned will be nullptr. + // ColumnFamily will be present on successful return and will not be present + // on error return. ColumnFamily may be present on any crash during this call. + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, + ColumnFamilyHandle** handle) = 0; + + virtual Status VerifyChecksum(const ReadOptions& read_options) = 0; + + virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } // AddFile() is deprecated, please use IngestExternalFile() ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( @@ -1313,13 +1441,25 @@ class DB { virtual Status EndTrace() { return Status::NotSupported("EndTrace() is not implemented."); } + + // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing. + virtual Status StartBlockCacheTrace( + const TraceOptions& /*options*/, + std::unique_ptr&& /*trace_writer*/) { + return Status::NotSupported("StartBlockCacheTrace() is not implemented."); + } + + virtual Status EndBlockCacheTrace() { + return Status::NotSupported("EndBlockCacheTrace() is not implemented."); + } #endif // ROCKSDB_LITE // Needed for StackableDB virtual DB* GetRootDB() { return this; } - // Given a time window, return an iterator for accessing stats history - // User is responsible for deleting StatsHistoryIterator after use + // Given a window [start_time, end_time), setup a StatsHistoryIterator + // to access stats history. Note the start_time and end_time are epoch + // time measured in seconds, and end_time is an exclusive bound. virtual Status GetStatsHistory( uint64_t /*start_time*/, uint64_t /*end_time*/, std::unique_ptr* /*stats_iterator*/) { @@ -1342,11 +1482,6 @@ class DB { return Status::NotSupported("Supported only by secondary instance"); } #endif // !ROCKSDB_LITE - - private: - // No copying allowed - DB(const DB&); - void operator=(const DB&); }; // Destroy the contents of the specified database. diff --git a/include/rocksdb/db_stress_tool.h b/include/rocksdb/db_stress_tool.h new file mode 100644 index 00000000000..2ae54980e9a --- /dev/null +++ b/include/rocksdb/db_stress_tool.h @@ -0,0 +1,9 @@ +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
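Stepping back to the db.h additions above, a hedged sketch of the batched MultiGet overload and GetMergeOperands(); the database handle, key names, and operand bound are hypothetical:

#include "rocksdb/db.h"

void ReadExamples(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf) {
  rocksdb::ReadOptions read_opts;

  // Batched MultiGet: values and statuses are C-style arrays of num_keys
  // elements owned by the caller.
  constexpr size_t kNumKeys = 2;
  rocksdb::ColumnFamilyHandle* cfs[kNumKeys] = {cf, cf};
  rocksdb::Slice keys[kNumKeys] = {"k1", "k2"};
  rocksdb::PinnableSlice values[kNumKeys];
  rocksdb::Status statuses[kNumKeys];
  db->MultiGet(read_opts, kNumKeys, cfs, keys, values, statuses,
               false /* sorted_input */);

  // GetMergeOperands: returns Incomplete if the key has more operands than
  // expected_max_number_of_operands allows for.
  rocksdb::GetMergeOperandsOptions merge_opts;
  merge_opts.expected_max_number_of_operands = 4;
  rocksdb::PinnableSlice operands[4];
  int number_of_operands = 0;
  rocksdb::Status s = db->GetMergeOperands(read_opts, cf, "counter_key",
                                           operands, &merge_opts,
                                           &number_of_operands);
  (void)s;
}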
+#pragma once + +namespace rocksdb { +int db_stress_tool(int argc, char** argv); +} // namespace rocksdb diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 8f6bd607228..e70a49ffc43 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -41,6 +41,7 @@ namespace rocksdb { +class DynamicLibrary; class FileLock; class Logger; class RandomAccessFile; @@ -117,10 +118,10 @@ struct EnvOptions { bool fallocate_with_keep_size = true; // See DBOptions doc - size_t compaction_readahead_size; + size_t compaction_readahead_size = 0; // See DBOptions doc - size_t random_access_max_buffer_size; + size_t random_access_max_buffer_size = 0; // See DBOptions doc size_t writable_file_max_buffer_size = 1024 * 1024; @@ -140,9 +141,21 @@ class Env { }; Env() : thread_status_updater_(nullptr) {} + // No copying allowed + Env(const Env&) = delete; + void operator=(const Env&) = delete; virtual ~Env(); + static const char* Type() { return "Environment"; } + + // Loads the environment specified by the input value into the result + static Status LoadEnv(const std::string& value, Env** result); + + // Loads the environment specified by the input value into the result + static Status LoadEnv(const std::string& value, Env** result, + std::shared_ptr* guard); + // Return a default environment suitable for the current operating // system. Sophisticated users may wish to provide their own Env // implementation instead of relying on this default environment. @@ -338,6 +351,18 @@ class Env { // REQUIRES: lock has not already been unlocked. virtual Status UnlockFile(FileLock* lock) = 0; + // Opens `lib_name` as a dynamic library. + // If the 'search_path' is specified, breaks the path into its components + // based on the appropriate platform separator (";" or ";") and looks for the + // library in those directories. If 'search path is not specified, uses the + // default library path search mechanism (such as LD_LIBRARY_PATH). On + // success, stores a dynamic library in `*result`. + virtual Status LoadLibrary(const std::string& /*lib_name*/, + const std::string& /*search_path */, + std::shared_ptr* /*result*/) { + return Status::NotSupported("LoadLibrary is not implemented in this Env"); + } + // Priority for scheduling job in thread pool enum Priority { BOTTOM, LOW, HIGH, USER, TOTAL }; @@ -382,9 +407,11 @@ class Env { // same directory. virtual Status GetTestDirectory(std::string* path) = 0; - // Create and return a log file for storing informational messages. + // Create and returns a default logger (an instance of EnvLogger) for storing + // informational messages. Derived classes can overide to provide custom + // logger. virtual Status NewLogger(const std::string& fname, - std::shared_ptr* result) = 0; + std::shared_ptr* result); // Returns the number of micro-seconds since some fixed point in time. // It is often used as system time such as in GenericRateLimiter @@ -501,17 +528,14 @@ class Env { return Status::NotSupported(); } + virtual void SanitizeEnvOptions(EnvOptions* /*env_opts*/) const {} + // If you're adding methods here, remember to add them to EnvWrapper too. protected: // The pointer to an internal structure that will update the // status of each thread. ThreadStatusUpdater* thread_status_updater_; - - private: - // No copying allowed - Env(const Env&); - void operator=(const Env&); }; // The factory function to construct a ThreadStatusUpdater. Any Env @@ -570,6 +594,26 @@ class SequentialFile { // SequentialFileWrapper too. 
}; +// A read IO request structure for use in MultiRead +struct ReadRequest { + // File offset in bytes + uint64_t offset; + + // Length to read in bytes + size_t len; + + // A buffer that MultiRead() can optionally place data in. It can + // ignore this and allocate its own buffer + char* scratch; + + // Output parameter set by MultiRead() to point to the data buffer, and + // the number of valid bytes + Slice result; + + // Status of read + Status status; +}; + // A file abstraction for randomly reading the contents of a file. class RandomAccessFile { public: @@ -594,6 +638,22 @@ class RandomAccessFile { return Status::OK(); } + // Read a bunch of blocks as described by reqs. The blocks can + // optionally be read in parallel. This is a synchronous call, i.e it + // should return after all reads have completed. The reads will be + // non-overlapping. If the function return Status is not ok, status of + // individual requests will be ignored and return status will be assumed + // for all read requests. The function return status is only meant for any + // any errors that occur before even processing specific read requests + virtual Status MultiRead(ReadRequest* reqs, size_t num_reqs) { + assert(reqs != nullptr); + for (size_t i = 0; i < num_reqs; ++i) { + ReadRequest& req = reqs[i]; + req.status = Read(req.offset, req.len, &req.result, req.scratch); + } + return Status::OK(); + } + // Tries to get an unique ID for this file that will be the same each time // the file is opened (and will stay the same while the file is open). // Furthermore, it tries to make this ID at most "max_size" bytes. If such an @@ -655,6 +715,9 @@ class WritableFile { io_priority_(Env::IO_TOTAL), write_hint_(Env::WLTH_NOT_SET), strict_bytes_per_sync_(options.strict_bytes_per_sync) {} + // No copying allowed + WritableFile(const WritableFile&) = delete; + void operator=(const WritableFile&) = delete; virtual ~WritableFile(); @@ -814,9 +877,6 @@ class WritableFile { private: size_t last_preallocated_block_; size_t preallocation_block_size_; - // No copying allowed - WritableFile(const WritableFile&); - void operator=(const WritableFile&); protected: Env::IOPriority io_priority_; @@ -828,6 +888,10 @@ class WritableFile { class RandomRWFile { public: RandomRWFile() {} + // No copying allowed + RandomRWFile(const RandomRWFile&) = delete; + RandomRWFile& operator=(const RandomRWFile&) = delete; + virtual ~RandomRWFile() {} // Indicates if the class makes use of direct I/O @@ -858,10 +922,6 @@ class RandomRWFile { // If you're adding methods here, remember to add them to // RandomRWFileWrapper too. - - // No copying allowed - RandomRWFile(const RandomRWFile&) = delete; - RandomRWFile& operator=(const RandomRWFile&) = delete; }; // MemoryMappedFileBuffer object represents a memory-mapped file's raw buffer. @@ -919,6 +979,10 @@ class Logger { explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) : closed_(false), log_level_(log_level) {} + // No copying allowed + Logger(const Logger&) = delete; + void operator=(const Logger&) = delete; + virtual ~Logger(); // Close the log file. Must be called before destructor. 
If the return @@ -960,9 +1024,6 @@ class Logger { bool closed_; private: - // No copying allowed - Logger(const Logger&); - void operator=(const Logger&); InfoLogLevel log_level_; }; @@ -974,8 +1035,29 @@ class FileLock { private: // No copying allowed - FileLock(const FileLock&); - void operator=(const FileLock&); + FileLock(const FileLock&) = delete; + void operator=(const FileLock&) = delete; +}; + +class DynamicLibrary { + public: + virtual ~DynamicLibrary() {} + + // Returns the name of the dynamic library. + virtual const char* Name() const = 0; + + // Loads the symbol for sym_name from the library and updates the input + // function. Returns the loaded symbol. + template + Status LoadFunction(const std::string& sym_name, std::function* function) { + assert(nullptr != function); + void* ptr = nullptr; + Status s = LoadSymbol(sym_name, &ptr); + *function = reinterpret_cast(ptr); + return s; + } + // Loads and returns the symbol for sym_name from the library. + virtual Status LoadSymbol(const std::string& sym_name, void** func) = 0; }; extern void LogFlush(const std::shared_ptr& info_log); @@ -1168,6 +1250,12 @@ class EnvWrapper : public Env { Status UnlockFile(FileLock* l) override { return target_->UnlockFile(l); } + Status LoadLibrary(const std::string& lib_name, + const std::string& search_path, + std::shared_ptr* result) override { + return target_->LoadLibrary(lib_name, search_path, result); + } + void Schedule(void (*f)(void* arg), void* a, Priority pri, void* tag = nullptr, void (*u)(void* arg) = nullptr) override { return target_->Schedule(f, a, pri, tag, u); @@ -1277,6 +1365,9 @@ class EnvWrapper : public Env { Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override { return target_->GetFreeSpace(path, diskfree); } + void SanitizeEnvOptions(EnvOptions* env_opts) const override { + target_->SanitizeEnvOptions(env_opts); + } private: Env* target_; @@ -1315,6 +1406,9 @@ class RandomAccessFileWrapper : public RandomAccessFile { char* scratch) const override { return target_->Read(offset, n, result, scratch); } + Status MultiRead(ReadRequest* reqs, size_t num_reqs) override { + return target_->MultiRead(reqs, num_reqs); + } Status Prefetch(uint64_t offset, size_t n) override { return target_->Prefetch(offset, n); } @@ -1484,4 +1578,10 @@ Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname); // This is a factory method for TimedEnv defined in utilities/env_timed.cc. Env* NewTimedEnv(Env* base_env); +// Returns an instance of logger that can be used for storing informational +// messages. +// This is a factory method for EnvLogger declared in logging/env_logging.h +Status NewEnvLogger(const std::string& fname, Env* env, + std::shared_ptr* result); + } // namespace rocksdb diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index 950fbe616ea..ad6862d5f38 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -25,9 +25,12 @@ #include #include +#include "rocksdb/advanced_options.h" + namespace rocksdb { class Slice; +struct BlockBasedTableOptions; // A class that takes a bunch of keys, then generates filter class FilterBitsBuilder { @@ -44,12 +47,13 @@ class FilterBitsBuilder { // The ownership of actual data is set to buf virtual Slice Finish(std::unique_ptr* buf) = 0; - // Calculate num of entries fit into a space. + // Calculate num of keys that can be added and generate a filter + // <= the specified number of bytes. 
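  // For example (illustrative): a builder targeting roughly 10 bits per key
  // could return about (bytes * 8) / 10 from CalculateNumEntry(bytes).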
#if defined(_MSC_VER) #pragma warning(push) #pragma warning(disable : 4702) // unreachable code #endif - virtual int CalculateNumEntry(const uint32_t /*space*/) { + virtual int CalculateNumEntry(const uint32_t /*bytes*/) { #ifndef ROCKSDB_LITE throw std::runtime_error("CalculateNumEntry not Implemented"); #else @@ -79,6 +83,26 @@ class FilterBitsReader { } }; +// Contextual information passed to BloomFilterPolicy at filter building time. +// Used in overriding FilterPolicy::GetBuilderWithContext(). +struct FilterBuildingContext { + // This constructor is for internal use only and subject to change. + FilterBuildingContext(const BlockBasedTableOptions& table_options); + + // Options for the table being built + const BlockBasedTableOptions& table_options; + + // Name of the column family for the table (or empty string if unknown) + std::string column_family_name; + + // The compactions style in effect for the table + CompactionStyle compaction_style = kCompactionStyleLevel; + + // The table level at time of constructing the SST file, or -1 if unknown. + // (The table file could later be used at a different level.) + int level_at_creation = -1; +}; + // We add a new format of filter block called full filter block // This new interface gives you more space of customization // @@ -119,13 +143,26 @@ class FilterPolicy { // list, but it should aim to return false with a high probability. virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0; - // Get the FilterBitsBuilder, which is ONLY used for full filter block - // It contains interface to take individual key, then generate filter + // Return a new FilterBitsBuilder for full or partitioned filter blocks, or + // nullptr if using block-based filter. + // NOTE: This function is only called by GetBuilderWithContext() below for + // custom FilterPolicy implementations. Thus, it is not necessary to + // override this function if overriding GetBuilderWithContext(). virtual FilterBitsBuilder* GetFilterBitsBuilder() const { return nullptr; } - // Get the FilterBitsReader, which is ONLY used for full filter block - // It contains interface to tell if key can be in filter - // The input slice should NOT be deleted by FilterPolicy + // A newer variant of GetFilterBitsBuilder that allows a FilterPolicy + // to customize the builder for contextual constraints and hints. + // (Name changed to avoid triggering -Werror=overloaded-virtual.) + // If overriding GetFilterBitsBuilder() suffices, it is not necessary to + // override this function. + virtual FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext&) const { + return GetFilterBitsBuilder(); + } + + // Return a new FilterBitsReader for full or partitioned filter blocks, or + // nullptr if using block-based filter. + // As here, the input slice should NOT be deleted by FilterPolicy. virtual FilterBitsReader* GetFilterBitsReader( const Slice& /*contents*/) const { return nullptr; @@ -135,10 +172,14 @@ class FilterPolicy { // Return a new filter policy that uses a bloom filter with approximately // the specified number of bits per key. // -// bits_per_key: bits per key in bloom filter. A good value for bits_per_key -// is 10, which yields a filter with ~ 1% false positive rate. -// use_block_based_builder: use block based filter rather than full filter. -// If you want to builder full filter, it needs to be set to false. +// bits_per_key: average bits allocated per key in bloom filter. A good +// choice is 9.9, which yields a filter with ~ 1% false positive rate. 
+// When format_version < 5, the value will be rounded to the nearest +// integer. Recommend using no more than three decimal digits after the +// decimal point, as in 6.667. +// +// use_block_based_builder: use deprecated block based filter (true) rather +// than full or partitioned filter (false). // // Callers must delete the result after any database that is using the // result has been closed. @@ -151,5 +192,5 @@ class FilterPolicy { // FilterPolicy (like NewBloomFilterPolicy) that does not ignore // trailing spaces in keys. extern const FilterPolicy* NewBloomFilterPolicy( - int bits_per_key, bool use_block_based_builder = false); + double bits_per_key, bool use_block_based_builder = false); } // namespace rocksdb diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h index e99b434a019..162e262e328 100644 --- a/include/rocksdb/iterator.h +++ b/include/rocksdb/iterator.h @@ -28,6 +28,10 @@ namespace rocksdb { class Iterator : public Cleanable { public: Iterator() {} + // No copying allowed + Iterator(const Iterator&) = delete; + void operator=(const Iterator&) = delete; + virtual ~Iterator() {} // An iterator is either positioned at a key/value pair, or @@ -104,11 +108,6 @@ class Iterator : public Cleanable { // Get the user-key portion of the internal key at which the iteration // stopped. virtual Status GetProperty(std::string prop_name, std::string* prop); - - private: - // No copying allowed - Iterator(const Iterator&); - void operator=(const Iterator&); }; // Return an empty iterator (yields nothing). diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index 5be55cbede8..57bb1eeb0d4 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -170,6 +170,10 @@ struct FlushJobInfo { std::string cf_name; // the path to the newly created file std::string file_path; + // the file number of the newly created file + uint64_t file_number; + // the oldest blob file referenced by the newly created file + uint64_t oldest_blob_file_number; // the id of the thread that completed this flush job. uint64_t thread_id; // the job id, which is unique in the same thread. @@ -194,11 +198,18 @@ struct FlushJobInfo { FlushReason flush_reason; }; -struct CompactionJobInfo { - CompactionJobInfo() = default; - explicit CompactionJobInfo(const CompactionJobStats& _stats) - : stats(_stats) {} +struct CompactionFileInfo { + // The level of the file. + int level; + + // The file number of the file. + uint64_t file_number; + // The file number of the oldest blob file this SST file references. + uint64_t oldest_blob_file_number; +}; + +struct CompactionJobInfo { // the id of the column family where the compaction happened. uint32_t cf_id; // the name of the column family where the compaction happened. @@ -213,11 +224,25 @@ struct CompactionJobInfo { int base_input_level; // the output level of the compaction. int output_level; - // the names of the compaction input files. + + // The following variables contain information about compaction inputs + // and outputs. A file may appear in both the input and output lists + // if it was simply moved to a different level. The order of elements + // is the same across input_files and input_file_infos; similarly, it is + // the same across output_files and output_file_infos. + + // The names of the compaction input files. std::vector input_files; - // the names of the compaction output files. + // Additional information about the compaction input files. 
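A small configuration sketch for the new fractional bits_per_key (the surrounding option values are illustrative):

#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

// Use a full (non-block-based) Bloom filter with roughly a 1% false positive rate.
rocksdb::Options MakeOptionsWithBloom() {
  rocksdb::BlockBasedTableOptions table_options;
  table_options.filter_policy.reset(
      rocksdb::NewBloomFilterPolicy(9.9, /*use_block_based_builder=*/false));
  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}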
+ std::vector input_file_infos; + + // The names of the compaction output files. std::vector output_files; + + // Additional information about the compaction output files. + std::vector output_file_infos; + // Table properties for input and output tables. // The map is keyed by values from input_files and output_files. TablePropertiesCollection table_properties; @@ -459,6 +484,7 @@ class EventListener { #else class EventListener {}; +struct FlushJobInfo {}; #endif // ROCKSDB_LITE diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 328422f5703..7f18a581e97 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -120,6 +120,28 @@ class MemTableRep { return true; } + // Same as ::InsertWithHint, but allows concurrent writes + // + // If hint points to nullptr, a new hint will be allocated on the heap, otherwise + // the hint will be updated to reflect the last insert location. The hint is + // owned by the caller and it is the caller's responsibility to delete the + // hint later. + // + // Currently only the skip-list based memtable implements the interface. Other + // implementations will fall back to InsertConcurrently() by default. + virtual void InsertWithHintConcurrently(KeyHandle handle, void** /*hint*/) { + // Ignore the hint by default. + InsertConcurrently(handle); + } + + // Same as ::InsertWithHintConcurrently + // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and + // the key already exists. + virtual bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) { + InsertWithHintConcurrently(handle, hint); + return true; + } + // Like Insert(handle), but may be called concurrent with other calls // to InsertConcurrently for other handles. // diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h index d8ddcc6a097..36f47e254ed 100644 --- a/include/rocksdb/merge_operator.h +++ b/include/rocksdb/merge_operator.h @@ -46,6 +46,7 @@ class Logger; class MergeOperator { public: virtual ~MergeOperator() {} + static const char* Type() { return "MergeOperator"; } // Gives the client a way to express the read -> modify -> write semantics // key: (IN) The key that's associated with this merge operation.
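As a sketch of how a listener might consume the new per-file compaction details (the logging format is illustrative only):

#include <cinttypes>
#include <cstdio>
#include "rocksdb/listener.h"

// Logs the level and file number of every compaction input and output file.
class CompactionFileLogger : public rocksdb::EventListener {
 public:
  void OnCompactionCompleted(rocksdb::DB* /*db*/,
                             const rocksdb::CompactionJobInfo& info) override {
    for (const rocksdb::CompactionFileInfo& fi : info.input_file_infos) {
      std::printf("input:  level=%d number=%" PRIu64 "\n", fi.level,
                  fi.file_number);
    }
    for (const rocksdb::CompactionFileInfo& fi : info.output_file_infos) {
      std::printf("output: level=%d number=%" PRIu64 "\n", fi.level,
                  fi.file_number);
    }
  }
};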
diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h index a0ab41efdfb..fecee84304d 100644 --- a/include/rocksdb/metadata.h +++ b/include/rocksdb/metadata.h @@ -55,25 +55,25 @@ struct LevelMetaData { struct SstFileMetaData { SstFileMetaData() : size(0), - name(""), - db_path(""), + file_number(0), smallest_seqno(0), largest_seqno(0), - smallestkey(""), - largestkey(""), num_reads_sampled(0), being_compacted(false), num_entries(0), - num_deletions(0) {} + num_deletions(0), + oldest_blob_file_number(0) {} - SstFileMetaData(const std::string& _file_name, const std::string& _path, - size_t _size, SequenceNumber _smallest_seqno, - SequenceNumber _largest_seqno, + SstFileMetaData(const std::string& _file_name, uint64_t _file_number, + const std::string& _path, size_t _size, + SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno, const std::string& _smallestkey, const std::string& _largestkey, uint64_t _num_reads_sampled, - bool _being_compacted) + bool _being_compacted, uint64_t _oldest_blob_file_number, + uint64_t _oldest_ancester_time, uint64_t _file_creation_time) : size(_size), name(_file_name), + file_number(_file_number), db_path(_path), smallest_seqno(_smallest_seqno), largest_seqno(_largest_seqno), @@ -82,12 +82,17 @@ struct SstFileMetaData { num_reads_sampled(_num_reads_sampled), being_compacted(_being_compacted), num_entries(0), - num_deletions(0) {} + num_deletions(0), + oldest_blob_file_number(_oldest_blob_file_number), + oldest_ancester_time(_oldest_ancester_time), + file_creation_time(_file_creation_time) {} // File size in bytes. size_t size; // The name of the file. std::string name; + // The id of the file. + uint64_t file_number; // The full path where the file locates. std::string db_path; @@ -100,6 +105,18 @@ struct SstFileMetaData { uint64_t num_entries; uint64_t num_deletions; + + uint64_t oldest_blob_file_number; // The id of the oldest blob file + // referenced by the file. + // An SST file may be generated by compactions whose input files may + // in turn be generated by earlier compactions. The creation time of the + // oldest SST file that is the compaction ancester of this file. + // The timestamp is provided Env::GetCurrentTime(). + // 0 if the information is not available. + uint64_t oldest_ancester_time; + // Timestamp when the SST file is created, provided by Env::GetCurrentTime(). + // 0 if the information is not available. + uint64_t file_creation_time; }; // The full set of metadata associated with each SST file. @@ -108,4 +125,11 @@ struct LiveFileMetaData : SstFileMetaData { int level; // Level at which this file resides. LiveFileMetaData() : column_family_name(), level(0) {} }; + +// Metadata returned as output from ExportColumnFamily() and used as input to +// CreateColumnFamiliesWithImport(). +struct ExportImportFilesMetaData { + std::string db_comparator_name; // Used to safety check at import. + std::vector files; // Vector of file metadata. +}; } // namespace rocksdb diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index ab856bee8e1..624aa524573 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -269,16 +269,8 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API uint64_t max_bytes_for_level_base = 256 * 1048576; - // If non-zero, compactions will periodically refresh the snapshot list. The - // delay for the first refresh is snap_refresh_nanos nano seconds and - // exponentially increases afterwards. 
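A sketch of reading the newly exposed per-file metadata through the existing GetLiveFilesMetaData() call (the output format is illustrative):

#include <cinttypes>
#include <cstdio>
#include <vector>
#include "rocksdb/db.h"
#include "rocksdb/metadata.h"

// Print file number, oldest referenced blob file, and creation times for all
// live SST files.
void DumpLiveFileMetaData(rocksdb::DB* db) {
  std::vector<rocksdb::LiveFileMetaData> files;
  db->GetLiveFilesMetaData(&files);
  for (const auto& f : files) {
    std::printf("%s: number=%" PRIu64 " oldest_blob=%" PRIu64
                " oldest_ancester_time=%" PRIu64 " creation_time=%" PRIu64 "\n",
                f.name.c_str(), f.file_number, f.oldest_blob_file_number,
                f.oldest_ancester_time, f.file_creation_time);
  }
}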
When having many short-lived snapshots, - this option helps reducing the cpu usage of long-running compactions. The - feature is disabled when max_subcompactions is greater than one. - // - // Default: 0.5s - // - // Dynamically changeable through SetOptions() API - uint64_t snap_refresh_nanos = 500 * 1000 * 1000; // 0.5s + // Deprecated. + uint64_t snap_refresh_nanos = 0; // Disable automatic compactions. Manual compactions can still // be issued on this column family @@ -694,6 +686,18 @@ struct DBOptions { // Default: 600 unsigned int stats_persist_period_sec = 600; + // If true, automatically persist stats to a hidden column family (column + // family name: ___rocksdb_stats_history___) every + // stats_persist_period_sec seconds; otherwise, write to an in-memory + // struct. Users can query it through the `GetStatsHistory` API. + // If a user attempts to create a column family with the same name on a DB + // which has previously set persist_stats_to_disk to true, the column family + // creation will fail, but the hidden column family will survive, as well as + // the previously persisted statistics. + // When persisting stats to disk, the stat name will be limited to 100 bytes. + // Default: false + bool persist_stats_to_disk = false; + // if not zero, periodically take stats snapshots and store in memory, the // memory size for stats snapshots is capped at stats_history_buffer_size // Default: 1MB @@ -748,6 +752,8 @@ // for this mode if using block-based table. // // Default: false + // This flag has no effect on the behavior of compaction and is planned for + // removal in the future. bool new_table_reader_for_compaction_inputs = false; // If non-zero, we perform bigger reads when doing compaction. If you're @@ -893,6 +899,32 @@ // Default: false bool enable_pipelined_write = false; + // Setting unordered_write to true trades higher write throughput for + // a relaxed immutability guarantee of snapshots. This violates the + // repeatability one expects from ::Get from a snapshot, as well as + // ::MultiGet and Iterator's consistent-point-in-time view property. + // If the application cannot tolerate the relaxed guarantees, it can implement + // its own mechanisms to work around that and yet benefit from the higher + // throughput. Using TransactionDB with WRITE_PREPARED write policy and + // two_write_queues=true is one way to achieve immutable snapshots despite + // unordered_write. + // + // By default, i.e., when it is false, rocksdb does not advance the sequence + // number for new snapshots unless all the writes with lower sequence numbers + // are already finished. This provides the immutability that we expect from + // snapshots. Moreover, since Iterator and MultiGet internally depend on + // snapshots, the snapshot immutability results in Iterator and MultiGet + // offering a consistent-point-in-time view. If set to true, although + // Read-Your-Own-Write property is still provided, the snapshot immutability + // property is relaxed: the writes issued after the snapshot is obtained (with + // larger sequence numbers) will still not be visible to the reads from that + // snapshot; however, there might still be pending writes (with lower sequence + // number) that will change the state visible to the snapshot after they are + // applied to the memtable. + // + // Default: false + bool unordered_write = false; + // If true, allow multi-writers to update mem tables in parallel.
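A configuration sketch combining the two new DBOptions above; the path is illustrative, and whether the relaxed snapshot guarantees of unordered_write are acceptable depends on the application:

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

// Persist statistics snapshots to the hidden stats column family and opt in
// to unordered writes.
rocksdb::Status OpenWithStatsAndUnorderedWrite(rocksdb::DB** db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.statistics = rocksdb::CreateDBStatistics();
  options.persist_stats_to_disk = true;
  options.stats_persist_period_sec = 600;
  options.unordered_write = true;
  return rocksdb::DB::Open(options, "/tmp/rocksdb_example", db);
}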
// Only some memtable_factory-s support concurrent writes; currently it // is implemented only for SkipListFactory. Concurrent memtable writes @@ -911,6 +943,13 @@ struct DBOptions { // Default: true bool enable_write_thread_adaptive_yield = true; + // The maximum limit of number of bytes that are written in a single batch + // of WAL or memtable write. It is followed when the leader write size + // is larger than 1/8 of this limit. + // + // Default: 1 MB + uint64_t max_write_batch_group_size_bytes = 1 << 20; + // The maximum number of microseconds that a write operation will use // a yielding spin loop to coordinate with other write threads before // blocking on a mutex. (Assuming write_thread_slow_yield_usec is @@ -1047,6 +1086,24 @@ struct DBOptions { // If set to true, takes precedence over // ReadOptions::background_purge_on_iterator_cleanup. bool avoid_unnecessary_blocking_io = false; + + // Historically DB ID has always been stored in Identity File in DB folder. + // If this flag is true, the DB ID is written to Manifest file in addition + // to the Identity file. By doing this 2 problems are solved + // 1. We don't checksum the Identity file where as Manifest file is. + // 2. Since the source of truth for DB is Manifest file DB ID will sit with + // the source of truth. Previously the Identity file could be copied + // independent of Manifest and that can result in wrong DB ID. + // We recommend setting this flag to true. + // Default: false + bool write_dbid_to_manifest = false; + + // The number of bytes to prefetch when reading the log. This is mostly useful + // for reading a remotely located log, as it can save the number of + // round-trips. If 0, then the prefetching is disabled. + // + // Default: 0 + size_t log_readahead_size = 0; }; // Options to control the behavior of a database (passed to DB::Open) @@ -1229,6 +1286,14 @@ struct ReadOptions { // Default: 0 (don't filter by seqnum, return user keys) SequenceNumber iter_start_seqnum; + // Timestamp of operation. Read should return the latest data visible to the + // specified timestamp. All timestamps of the same database must be of the + // same length and format. The user is responsible for providing a customized + // compare function via Comparator to order tuples. + // The user-specified timestamp feature is still under active development, + // and the API is subject to change. + const Slice* timestamp; + ReadOptions(); ReadOptions(bool cksum, bool cache); }; @@ -1281,12 +1346,34 @@ struct WriteOptions { // Default: false bool low_pri; + // If true, this writebatch will maintain the last insert positions of each + // memtable as hints in concurrent write. It can improve write performance + // in concurrent writes if keys in one writebatch are sequential. In + // non-concurrent writes (when concurrent_memtable_writes is false) this + // option will be ignored. + // + // Default: false + bool memtable_insert_hint_per_batch; + + // Timestamp of write operation, e.g. Put. All timestamps of the same + // database must share the same length and format. The user is also + // responsible for providing a customized compare function via Comparator to + // order tuples. If the user wants to enable timestamp, then + // all write operations must be associated with timestamp because RocksDB, as + // a single-node storage engine currently has no knowledge of global time, + // thus has to rely on the application. + // The user-specified timestamp feature is still under active development, + // and the API is subject to change. 
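A brief sketch of the new manifest/WAL knobs plus the per-batch insert hint; the 2 MB readahead value is illustrative:

#include "rocksdb/options.h"

// Record the DB ID in the MANIFEST, prefetch WAL reads during recovery, and
// keep per-batch memtable insert hints for concurrent writes.
void ConfigureNewOptions(rocksdb::Options* options,
                         rocksdb::WriteOptions* write_options) {
  options->write_dbid_to_manifest = true;
  options->log_readahead_size = 2 * 1024 * 1024;
  write_options->memtable_insert_hint_per_batch = true;
}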
+ const Slice* timestamp; + WriteOptions() : sync(false), disableWAL(false), ignore_missing_column_families(false), no_slowdown(false), - low_pri(false) {} + low_pri(false), + memtable_insert_hint_per_batch(false), + timestamp(nullptr) {} }; // Options that control flush operations @@ -1372,6 +1459,8 @@ struct CompactRangeOptions { struct IngestExternalFileOptions { // Can be set to true to move the files instead of copying them. bool move_files = false; + // If set to true, ingestion falls back to copy when move fails. + bool failed_move_fall_back_to_copy = true; // If set to false, an ingested file keys could appear in existing snapshots // that where created before the file was ingested. bool snapshot_consistency = true; @@ -1406,6 +1495,13 @@ struct IngestExternalFileOptions { // Warning: setting this to true causes slowdown in file ingestion because // the external SST file has to be read. bool verify_checksums_before_ingest = false; + // When verify_checksums_before_ingest = true, RocksDB uses default + // readahead setting to scan the file while verifying checksums before + // ingestion. + // Users can override the default value using this option. + // Using a large readahead size (> 2MB) can typically improve the performance + // of forward iteration on spinning disks. + size_t verify_checksums_readahead_size = 0; }; enum TraceFilterType : uint64_t { @@ -1429,4 +1525,30 @@ struct TraceOptions { uint64_t filter = kTraceFilterNone; }; +// ImportColumnFamilyOptions is used by ImportColumnFamily() +struct ImportColumnFamilyOptions { + // Can be set to true to move the files instead of copying them. + bool move_files = false; +}; + +// Options used with DB::GetApproximateSizes() +struct SizeApproximationOptions { + // Defines whether the returned size should include the recently written + // data in the mem-tables. If set to false, include_files must be true. + bool include_memtabtles = false; + // Defines whether the returned size should include data serialized to disk. + // If set to false, include_memtabtles must be true. + bool include_files = true; + // When approximating the files total size that is used to store a keys range + // using DB::GetApproximateSizes, allow approximation with an error margin of + // up to total_files_size * files_size_error_margin. This allows to take some + // shortcuts in files size approximation, resulting in better performance, + // while guaranteeing the resulting error is within a reasonable margin. + // E.g., if the value is 0.1, then the error margin of the returned files size + // approximation will be within 10%. + // If the value is non-positive - a more precise yet more CPU intensive + // estimation is performed. + double files_size_error_margin = -1.0; +}; + } // namespace rocksdb diff --git a/include/rocksdb/sst_file_reader.h b/include/rocksdb/sst_file_reader.h index 517907dd501..522a8d9a1df 100644 --- a/include/rocksdb/sst_file_reader.h +++ b/include/rocksdb/sst_file_reader.h @@ -33,7 +33,9 @@ class SstFileReader { std::shared_ptr GetTableProperties() const; // Verifies whether there is corruption in this table. 
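A sketch of the new size-approximation API with an explicit error margin (the key range is illustrative; note that include_memtabtles matches the field name as declared):

#include <cstdint>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Approximate the size of the ["a", "z") range, allowing a 10% error margin
// in the file-size estimate in exchange for a cheaper computation.
rocksdb::Status ApproximateRangeSize(rocksdb::DB* db, uint64_t* size) {
  rocksdb::SizeApproximationOptions size_options;
  size_options.include_memtabtles = true;
  size_options.include_files = true;
  size_options.files_size_error_margin = 0.1;
  rocksdb::Range range("a", "z");
  return db->GetApproximateSizes(size_options, db->DefaultColumnFamily(),
                                 &range, 1, size);
}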
- Status VerifyChecksum(); + Status VerifyChecksum(const ReadOptions& /*read_options*/); + + Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } private: struct Rep; diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 3b2b2e048c7..b6b78ef99a3 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -324,6 +324,8 @@ enum Tickers : uint32_t { TXN_DUPLICATE_KEY_OVERHEAD, // # of times snapshot_mutex_ is acquired in the fast path. TXN_SNAPSHOT_MUTEX_OVERHEAD, + // # of times ::Get returned TryAgain due to expired snapshot seq + TXN_GET_TRY_AGAIN, // Number of keys actually found in MultiGet calls (vs number requested by // caller) @@ -447,6 +449,10 @@ struct HistogramData { double min = 0.0; }; +// StatsLevel can be used to reduce statistics overhead by skipping certain +// types of stats in the stats collection process. +// Usage: +// options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex); enum StatsLevel : uint8_t { // Disable timer stats, and skip histogram stats kExceptHistogramOrTimers, @@ -464,11 +470,19 @@ enum StatsLevel : uint8_t { kAll, }; -// Analyze the performance of a db +// Analyze the performance of a db by providing cumulative stats over time. +// Usage: +// Options options; +// options.statistics = rocksdb::CreateDBStatistics(); +// Status s = DB::Open(options, kDBPath, &db); +// ... +// options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); +// HistogramData hist; +// options.statistics->histogramData(FLUSH_TIME, &hist); class Statistics { public: virtual ~Statistics() {} - + static const char* Type() { return "Statistics"; } virtual uint64_t getTickerCount(uint32_t tickerType) const = 0; virtual void histogramData(uint32_t type, HistogramData* const data) const = 0; diff --git a/include/rocksdb/stats_history.h b/include/rocksdb/stats_history.h index 40ea51d1ff0..c6634ae68aa 100644 --- a/include/rocksdb/stats_history.h +++ b/include/rocksdb/stats_history.h @@ -11,7 +11,6 @@ #include #include -// #include "db/db_impl.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" @@ -19,6 +18,25 @@ namespace rocksdb { class DBImpl; +// StatsHistoryIterator is the main interface for users to programmatically +// access statistics snapshots that was automatically stored by RocksDB. +// Depending on options, the stats can be in memory or on disk. +// The stats snapshots are indexed by time that they were recorded, and each +// stats snapshot contains individual stat name and value at the time of +// recording. +// Example: +// std::unique_ptr stats_iter; +// Status s = db->GetStatsHistory(0 /* start_time */, +// env->NowMicros() /* end_time*/, +// &stats_iter); +// if (s.ok) { +// for (; stats_iter->Valid(); stats_iter->Next()) { +// uint64_t stats_time = stats_iter->GetStatsTime(); +// const std::map& stats_map = +// stats_iter->GetStatsMap(); +// process(stats_time, stats_map); +// } +// } class StatsHistoryIterator { public: StatsHistoryIterator() {} @@ -31,10 +49,12 @@ class StatsHistoryIterator { // REQUIRES: Valid() virtual void Next() = 0; - // Return the time stamp (in microseconds) when stats history is recorded. + // Return the time stamp (in seconds) when stats history is recorded. // REQUIRES: Valid() virtual uint64_t GetStatsTime() const = 0; + virtual int GetFormatVersion() const { return -1; } + // Return the current stats history as an std::map which specifies the // mapping from stats name to stats value . 
The underlying storage // for the returned map is valid only until the next modification of diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index 12e8070d1e8..507d04168e2 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -58,7 +58,9 @@ class Status { kBusy = 11, kExpired = 12, kTryAgain = 13, - kCompactionTooLarge = 14 + kCompactionTooLarge = 14, + kColumnFamilyDropped = 15, + kMaxCode }; Code code() const { return code_; } @@ -74,6 +76,8 @@ class Status { kMemoryLimit = 7, kSpaceLimit = 8, kPathNotFound = 9, + KMergeOperandsInsufficientCapacity = 10, + kManualCompactionPaused = 11, kMaxSubCode }; @@ -184,6 +188,15 @@ class Status { return Status(kCompactionTooLarge, msg, msg2); } + static Status ColumnFamilyDropped(SubCode msg = kNone) { + return Status(kColumnFamilyDropped, msg); + } + + static Status ColumnFamilyDropped(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kColumnFamilyDropped, msg, msg2); + } + static Status NoSpace() { return Status(kIOError, kNoSpace); } static Status NoSpace(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kIOError, kNoSpace, msg, msg2); @@ -256,6 +269,9 @@ class Status { // Returns true iff the status indicates the proposed compaction is too large bool IsCompactionTooLarge() const { return code() == kCompactionTooLarge; } + // Returns true iff the status indicates Column Family Dropped + bool IsColumnFamilyDropped() const { return code() == kColumnFamilyDropped; } + // Returns true iff the status indicates a NoSpace error // This is caused by an I/O error returning the specific "out of space" // error condition. Stricto sensu, an NoSpace error is an I/O error @@ -280,6 +296,12 @@ class Status { return (code() == kIOError) && (subcode() == kPathNotFound); } + // Returns true iff the status indicates manual compaction paused. This + // is caused by a call to PauseManualCompaction + bool IsManualCompactionPaused() const { + return (code() == kIncomplete) && (subcode() == kManualCompactionPaused); + } + // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. std::string ToString() const; diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 88fcc78ed8c..63dce41efc8 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -74,7 +74,7 @@ struct BlockBasedTableOptions { // blocks with high priority. If set to true, depending on implementation of // block cache, index and filter blocks may be less likely to be evicted // than data blocks. - bool cache_index_and_filter_blocks_with_high_priority = false; + bool cache_index_and_filter_blocks_with_high_priority = true; // if cache_index_and_filter_blocks is true and the below is true, then // filter and index blocks are stored in the cache, but a reference is @@ -93,14 +93,32 @@ struct BlockBasedTableOptions { enum IndexType : char { // A space efficient index block that is optimized for // binary-search-based index. - kBinarySearch, + kBinarySearch = 0x00, // The hash index, if enabled, will do the hash lookup when // `Options.prefix_extractor` is provided. - kHashSearch, + kHashSearch = 0x01, // A two-level index implementation. Both levels are binary search indexes. - kTwoLevelIndexSearch, + kTwoLevelIndexSearch = 0x02, + + // Like kBinarySearch, but index also contains first key of each block. + // This allows iterators to defer reading the block until it's actually + // needed. May significantly reduce read amplification of short range scans. 
+ // Without it, iterator seek usually reads one block from each level-0 file + // and from each level, which may be expensive. + // Works best in combination with: + // - IndexShorteningMode::kNoShortening, + // - custom FlushBlockPolicy to cut blocks at some meaningful boundaries, + // e.g. when prefix changes. + // Makes the index significantly bigger (2x or more), especially when keys + // are long. + // + // IO errors are not handled correctly in this mode right now: if an error + // happens when lazily reading a block in value(), value() returns empty + // slice, and you need to call Valid()/status() afterwards. + // TODO(kolmike): Fix it. + kBinarySearchWithFirstKey = 0x03, }; IndexType index_type = kBinarySearch; @@ -251,6 +269,9 @@ struct BlockBasedTableOptions { // probably use this as it would reduce the index size. // This option only affects newly written tables. When reading existing // tables, the information about version is read from the footer. + // 5 -- Can be read by RocksDB's versions since X.X.X (something after 6.4.6) + // Full and partitioned filters use a generally faster and more accurate + // Bloom filter implementation, with a different schema. uint32_t format_version = 2; // Store index blocks on disk in compressed format. Changing this option to diff --git a/include/rocksdb/utilities/backupable_db.h b/include/rocksdb/utilities/backupable_db.h index 7817c564965..afff2c2ac74 100644 --- a/include/rocksdb/utilities/backupable_db.h +++ b/include/rocksdb/utilities/backupable_db.h @@ -10,11 +10,7 @@ #pragma once #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include @@ -280,10 +276,14 @@ class BackupEngine { progress_callback); } - // deletes old backups, keeping latest num_backups_to_keep alive + // Deletes old backups, keeping latest num_backups_to_keep alive. + // See also DeleteBackup. virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0; - // deletes a specific backup + // Deletes a specific backup. If this operation (or PurgeOldBackups) + // is not completed due to crash, power failure, etc. the state + // will be cleaned up the next time you call DeleteBackup, + // PurgeOldBackups, or GarbageCollect. virtual Status DeleteBackup(BackupID backup_id) = 0; // Call this from another thread if you want to stop the backup @@ -291,8 +291,8 @@ class BackupEngine { // not wait for the backup to stop. // The backup will stop ASAP and the call to CreateNewBackup will // return Status::Incomplete(). It will not clean up after itself, but - // the state will remain consistent. The state will be cleaned up - // next time you create BackupableDB or RestoreBackupableDB. + // the state will remain consistent. The state will be cleaned up the + // next time you call CreateNewBackup or GarbageCollect. virtual void StopBackup() = 0; // Returns info about backups in backup_info @@ -327,9 +327,13 @@ class BackupEngine { // Returns Status::OK() if all checks are good virtual Status VerifyBackup(BackupID backup_id) = 0; - // Will delete all the files we don't need anymore - // It will do the full scan of the files/ directory and delete all the - // files that are not referenced. + // Will delete any files left over from incomplete creation or deletion of + // a backup. This is not normally needed as those operations also clean up + // after prior incomplete calls to the same kind of operation (create or + // delete). 
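A configuration sketch opting in to the first-key index and the format_version 5 filter schema; both only affect newly written SST files, and older RocksDB releases cannot read format_version 5 tables:

#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options MakeOptionsWithNewTableFormat() {
  rocksdb::BlockBasedTableOptions table_options;
  table_options.index_type =
      rocksdb::BlockBasedTableOptions::kBinarySearchWithFirstKey;
  table_options.format_version = 5;
  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}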
+ // NOTE: This is not designed to delete arbitrary files added to the backup + // directory outside of BackupEngine, and clean-up is always subject to + // permissions on and availability of the underlying filesystem. virtual Status GarbageCollect() = 0; }; diff --git a/include/rocksdb/utilities/checkpoint.h b/include/rocksdb/utilities/checkpoint.h index aa0a394d4d0..5f12922c454 100644 --- a/include/rocksdb/utilities/checkpoint.h +++ b/include/rocksdb/utilities/checkpoint.h @@ -9,11 +9,15 @@ #ifndef ROCKSDB_LITE #include +#include #include "rocksdb/status.h" namespace rocksdb { class DB; +class ColumnFamilyHandle; +struct LiveFileMetaData; +struct ExportImportFilesMetaData; class Checkpoint { public: @@ -36,6 +40,16 @@ class Checkpoint { virtual Status CreateCheckpoint(const std::string& checkpoint_dir, uint64_t log_size_for_flush = 0); + // Exports all live SST files of a specified Column Family onto export_dir, + // returning SST files information in metadata. + // - SST files will be created as hard links when the directory specified + // is in the same partition as the db directory, copied otherwise. + // - export_dir should not already exist and will be created by this API. + // - Always triggers a flush. + virtual Status ExportColumnFamily(ColumnFamilyHandle* handle, + const std::string& export_dir, + ExportImportFilesMetaData** metadata); + virtual ~Checkpoint() {} }; diff --git a/include/rocksdb/utilities/debug.h b/include/rocksdb/utilities/debug.h index 50645423d0a..3fc414b6edf 100644 --- a/include/rocksdb/utilities/debug.h +++ b/include/rocksdb/utilities/debug.h @@ -40,6 +40,10 @@ Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key, size_t max_num_ikeys, std::vector* key_versions); +Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key, + Slice end_key, size_t max_num_ikeys, + std::vector* key_versions); + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/ldb_cmd.h b/include/rocksdb/utilities/ldb_cmd.h index 57ab88a34eb..cf7d25fba2c 100644 --- a/include/rocksdb/utilities/ldb_cmd.h +++ b/include/rocksdb/utilities/ldb_cmd.h @@ -29,8 +29,10 @@ namespace rocksdb { class LDBCommand { public: // Command-line arguments + static const std::string ARG_ENV_URI; static const std::string ARG_DB; static const std::string ARG_PATH; + static const std::string ARG_SECONDARY_PATH; static const std::string ARG_HEX; static const std::string ARG_KEY_HEX; static const std::string ARG_VALUE_HEX; @@ -127,7 +129,12 @@ class LDBCommand { protected: LDBCommandExecuteResult exec_state_; + std::string env_uri_; std::string db_path_; + // If empty, open DB as primary. If non-empty, open the DB as secondary + // with this secondary path. When running against a database opened by + // another process, ldb wll leave the source directory completely intact. + std::string secondary_path_; std::string column_family_name_; DB* db_; DBWithTTL* db_ttl_; @@ -171,6 +178,9 @@ class LDBCommand { /** List of command-line options valid for this command */ const std::vector valid_cmd_line_options_; + /** Shared pointer to underlying environment if applicable **/ + std::shared_ptr env_guard_; + bool ParseKeyValue(const std::string& line, std::string* key, std::string* value, bool is_key_hex, bool is_value_hex); @@ -256,7 +266,8 @@ class LDBCommandRunner { public: static void PrintHelp(const LDBOptions& ldb_options, const char* exec_name); - static void RunCommand( + // Returns the status code to return. 0 is no error. 
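A sketch of exporting a column family and importing it into another DB; the export directory and column family name are illustrative, and caller ownership of the returned metadata is an assumption:

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/utilities/checkpoint.h"

rocksdb::Status ExportAndImport(rocksdb::DB* src_db,
                                rocksdb::ColumnFamilyHandle* src_cf,
                                rocksdb::DB* dst_db,
                                rocksdb::ColumnFamilyHandle** dst_cf) {
  rocksdb::Checkpoint* checkpoint = nullptr;
  rocksdb::Status s = rocksdb::Checkpoint::Create(src_db, &checkpoint);
  if (!s.ok()) {
    return s;
  }
  rocksdb::ExportImportFilesMetaData* metadata = nullptr;
  s = checkpoint->ExportColumnFamily(src_cf, "/tmp/cf_export", &metadata);
  delete checkpoint;
  if (!s.ok()) {
    return s;
  }
  rocksdb::ImportColumnFamilyOptions import_options;
  import_options.move_files = false;  // copy the exported files
  s = dst_db->CreateColumnFamilyWithImport(rocksdb::ColumnFamilyOptions(),
                                           "imported_cf", import_options,
                                           *metadata, dst_cf);
  delete metadata;  // assumed to be owned by the caller
  return s;
}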
+ static int RunCommand( int argc, char** argv, Options options, const LDBOptions& ldb_options, const std::vector* column_families); }; diff --git a/include/rocksdb/utilities/object_registry.h b/include/rocksdb/utilities/object_registry.h index 86a51b92ead..d1516079a61 100644 --- a/include/rocksdb/utilities/object_registry.h +++ b/include/rocksdb/utilities/object_registry.h @@ -11,80 +11,195 @@ #include #include #include +#include #include - -#include "rocksdb/env.h" +#include "rocksdb/status.h" namespace rocksdb { - -// Creates a new T using the factory function that was registered with a pattern -// that matches the provided "target" string according to std::regex_match. -// -// If no registered functions match, returns nullptr. If multiple functions -// match, the factory function used is unspecified. -// -// Populates res_guard with result pointer if caller is granted ownership. -template -T* NewCustomObject(const std::string& target, std::unique_ptr* res_guard); - +class Logger; // Returns a new T when called with a string. Populates the std::unique_ptr // argument if granting ownership to caller. template -using FactoryFunc = std::function*)>; - -// To register a factory function for a type T, initialize a Registrar object -// with static storage duration. For example: -// -// static Registrar hdfs_reg("hdfs://.*", &CreateHdfsEnv); -// -// Then, calling NewCustomObject("hdfs://some_path", ...) will match the -// regex provided above, so it returns the result of invoking CreateHdfsEnv. -template -class Registrar { +using FactoryFunc = + std::function*, std::string*)>; + +class ObjectLibrary { public: - explicit Registrar(std::string pattern, FactoryFunc factory); -}; + // Base class for an Entry in the Registry. + class Entry { + public: + virtual ~Entry() {} + Entry(const std::string& name) : name_(std::move(name)) {} + + // Checks to see if the target matches this entry + virtual bool matches(const std::string& target) const { + return name_ == target; + } + const std::string& Name() const { return name_; } + + private: + const std::string name_; // The name of the Entry + }; // End class Entry + + // An Entry containing a FactoryFunc for creating new Objects + template + class FactoryEntry : public Entry { + public: + FactoryEntry(const std::string& name, FactoryFunc f) + : Entry(name), pattern_(std::move(name)), factory_(std::move(f)) {} + ~FactoryEntry() override {} + bool matches(const std::string& target) const override { + return std::regex_match(target, pattern_); + } + // Creates a new T object. + T* NewFactoryObject(const std::string& target, std::unique_ptr* guard, + std::string* msg) const { + return factory_(target, guard, msg); + } -// Implementation details follow. + private: + std::regex pattern_; // The pattern for this entry + FactoryFunc factory_; + }; // End class FactoryEntry + public: + // Finds the entry matching the input name and type + const Entry* FindEntry(const std::string& type, + const std::string& name) const; + void Dump(Logger* logger) const; + + // Registers the factory with the library for the pattern. + // If the pattern matches, the factory may be used to create a new object. 
+ template + const FactoryFunc& Register(const std::string& pattern, + const FactoryFunc& factory) { + std::unique_ptr entry(new FactoryEntry(pattern, factory)); + AddEntry(T::Type(), entry); + return factory; + } + // Returns the default ObjectLibrary + static std::shared_ptr& Default(); -namespace internal { + private: + // Adds the input entry to the list for the given type + void AddEntry(const std::string& type, std::unique_ptr& entry); -template -struct RegistryEntry { - std::regex pattern; - FactoryFunc factory; + // ** FactoryFunctions for this loader, organized by type + std::unordered_map>> entries_; }; -template -struct Registry { - static Registry* Get() { - static Registry instance; - return &instance; +// The ObjectRegistry is used to register objects that can be created by a +// name/pattern at run-time where the specific implementation of the object may +// not be known in advance. +class ObjectRegistry { + public: + static std::shared_ptr NewInstance(); + + ObjectRegistry(); + + void AddLibrary(const std::shared_ptr& library) { + libraries_.emplace_back(library); } - std::vector> entries; - private: - Registry() = default; -}; + // Creates a new T using the factory function that was registered with a + // pattern that matches the provided "target" string according to + // std::regex_match. + // + // If no registered functions match, returns nullptr. If multiple functions + // match, the factory function used is unspecified. + // + // Populates res_guard with result pointer if caller is granted ownership. + template + T* NewObject(const std::string& target, std::unique_ptr* guard, + std::string* errmsg) { + guard->reset(); + const auto* basic = FindEntry(T::Type(), target); + if (basic != nullptr) { + const auto* factory = + static_cast*>(basic); + return factory->NewFactoryObject(target, guard, errmsg); + } else { + *errmsg = std::string("Could not load ") + T::Type(); + return nullptr; + } + } + + // Creates a new unique T using the input factory functions. + // Returns OK if a new unique T was successfully created + // Returns NotFound if the type/target could not be created + // Returns InvalidArgument if the factory return an unguarded object + // (meaning it cannot be managed by a unique ptr) + template + Status NewUniqueObject(const std::string& target, + std::unique_ptr* result) { + std::string errmsg; + T* ptr = NewObject(target, result, &errmsg); + if (ptr == nullptr) { + return Status::NotFound(errmsg, target); + } else if (*result) { + return Status::OK(); + } else { + return Status::InvalidArgument(std::string("Cannot make a unique ") + + T::Type() + " from unguarded one ", + target); + } + } -} // namespace internal + // Creates a new shared T using the input factory functions. 
+ // Returns OK if a new shared T was successfully created + // Returns NotFound if the type/target could not be created + // Returns InvalidArgument if the factory return an unguarded object + // (meaning it cannot be managed by a shared ptr) + template + Status NewSharedObject(const std::string& target, + std::shared_ptr* result) { + std::string errmsg; + std::unique_ptr guard; + T* ptr = NewObject(target, &guard, &errmsg); + if (ptr == nullptr) { + return Status::NotFound(errmsg, target); + } else if (guard) { + result->reset(guard.release()); + return Status::OK(); + } else { + return Status::InvalidArgument(std::string("Cannot make a shared ") + + T::Type() + " from unguarded one ", + target); + } + } -template -T* NewCustomObject(const std::string& target, std::unique_ptr* res_guard) { - res_guard->reset(); - for (const auto& entry : internal::Registry::Get()->entries) { - if (std::regex_match(target, entry.pattern)) { - return entry.factory(target, res_guard); + // Creates a new static T using the input factory functions. + // Returns OK if a new static T was successfully created + // Returns NotFound if the type/target could not be created + // Returns InvalidArgument if the factory return a guarded object + // (meaning it is managed by a unique ptr) + template + Status NewStaticObject(const std::string& target, T** result) { + std::string errmsg; + std::unique_ptr guard; + T* ptr = NewObject(target, &guard, &errmsg); + if (ptr == nullptr) { + return Status::NotFound(errmsg, target); + } else if (guard.get()) { + return Status::InvalidArgument(std::string("Cannot make a static ") + + T::Type() + " from a guarded one ", + target); + } else { + *result = ptr; + return Status::OK(); } } - return nullptr; -} -template -Registrar::Registrar(std::string pattern, FactoryFunc factory) { - internal::Registry::Get()->entries.emplace_back(internal::RegistryEntry{ - std::regex(std::move(pattern)), std::move(factory)}); -} + // Dump the contents of the registry to the logger + void Dump(Logger* logger) const; + + private: + const ObjectLibrary::Entry* FindEntry(const std::string& type, + const std::string& name) const; + // The set of libraries to search for factories for this registry. + // The libraries are searched in reverse order (back to front) when + // searching for entries. 
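A registration sketch, assuming a registry created with NewInstance() consults the default library and that MyMergeOperator is a user-defined MergeOperator subclass (not shown here):

#include <memory>
#include <string>
#include "rocksdb/merge_operator.h"
#include "rocksdb/utilities/object_registry.h"

// Register a factory for URIs of the form "mymerge://...".
static auto my_merge_factory =
    rocksdb::ObjectLibrary::Default()->Register<rocksdb::MergeOperator>(
        "mymerge://.*",
        [](const std::string& /*uri*/,
           std::unique_ptr<rocksdb::MergeOperator>* guard,
           std::string* /*errmsg*/) {
          guard->reset(new MyMergeOperator());  // hypothetical implementation
          return guard->get();
        });

// Create a shared MergeOperator by name through a registry.
rocksdb::Status MakeMergeOperator(
    std::shared_ptr<rocksdb::MergeOperator>* result) {
  std::shared_ptr<rocksdb::ObjectRegistry> registry =
      rocksdb::ObjectRegistry::NewInstance();
  return registry->NewSharedObject<rocksdb::MergeOperator>("mymerge://default",
                                                           result);
}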
+ std::vector> libraries_; +}; } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/sim_cache.h b/include/rocksdb/utilities/sim_cache.h index bc2a7bc13d9..fef9e9910e8 100644 --- a/include/rocksdb/utilities/sim_cache.h +++ b/include/rocksdb/utilities/sim_cache.h @@ -36,6 +36,10 @@ extern std::shared_ptr NewSimCache(std::shared_ptr cache, size_t sim_capacity, int num_shard_bits); +extern std::shared_ptr NewSimCache(std::shared_ptr sim_cache, + std::shared_ptr cache, + int num_shard_bits); + class SimCache : public Cache { public: SimCache() {} diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 6e98a48e591..c0cb7e31619 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -88,6 +88,17 @@ class StackableDB : public DB { return db_->Get(options, column_family, key, value); } + using DB::GetMergeOperands; + virtual Status GetMergeOperands( + const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* slice, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) override { + return db_->GetMergeOperands(options, column_family, key, slice, + get_merge_operands_options, + number_of_operands); + } + using DB::MultiGet; virtual std::vector MultiGet( const ReadOptions& options, @@ -120,8 +131,22 @@ class StackableDB : public DB { return db_->IngestExternalFiles(args); } + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, + ColumnFamilyHandle** handle) override { + return db_->CreateColumnFamilyWithImport(options, column_family_name, + import_options, metadata, handle); + } + virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } + virtual Status VerifyChecksum(const ReadOptions& options) override { + return db_->VerifyChecksum(options); + } + using DB::KeyMayExist; virtual bool KeyMayExist(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, @@ -199,10 +224,11 @@ class StackableDB : public DB { } using DB::GetApproximateSizes; - virtual void GetApproximateSizes( - ColumnFamilyHandle* column_family, const Range* r, int n, uint64_t* sizes, - uint8_t include_flags = INCLUDE_FILES) override { - return db_->GetApproximateSizes(column_family, r, n, sizes, include_flags); + virtual Status GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* r, int n, + uint64_t* sizes) override { + return db_->GetApproximateSizes(options, column_family, r, n, sizes); } using DB::GetApproximateMemTableStats; @@ -245,6 +271,13 @@ class StackableDB : public DB { return db_->EnableAutoCompaction(column_family_handles); } + virtual void EnableManualCompaction() override { + return db_->EnableManualCompaction(); + } + virtual void DisableManualCompaction() override { + return db_->DisableManualCompaction(); + } + using DB::NumberLevels; virtual int NumberLevels(ColumnFamilyHandle* column_family) override { return db_->NumberLevels(column_family); @@ -315,6 +348,16 @@ class StackableDB : public DB { db_->GetColumnFamilyMetaData(column_family, cf_meta); } + using DB::StartBlockCacheTrace; + Status StartBlockCacheTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override { + return 
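A sketch of the new GetMergeOperands() call; the expected_max_number_of_operands field name is assumed from the GetMergeOperandsOptions struct in db.h, and the sizes are illustrative:

#include <vector>
#include "rocksdb/db.h"

// Fetch the raw merge operands for `key` without merging them.
rocksdb::Status ReadOperands(rocksdb::DB* db, const rocksdb::Slice& key,
                             std::vector<rocksdb::PinnableSlice>* operands) {
  rocksdb::GetMergeOperandsOptions options;
  options.expected_max_number_of_operands = 16;
  operands->resize(options.expected_max_number_of_operands);
  int num_operands = 0;
  rocksdb::Status s = db->GetMergeOperands(
      rocksdb::ReadOptions(), db->DefaultColumnFamily(), key, operands->data(),
      &options, &num_operands);
  if (s.ok()) {
    operands->resize(num_operands);
  }
  return s;
}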
db_->StartBlockCacheTrace(options, std::move(trace_writer)); + } + + using DB::EndBlockCacheTrace; + Status EndBlockCacheTrace() override { return db_->EndBlockCacheTrace(); } + #endif // ROCKSDB_LITE virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, @@ -335,6 +378,16 @@ class StackableDB : public DB { return db_->GetSortedWalFiles(files); } + virtual Status GetCurrentWalFile( + std::unique_ptr* current_log_file) override { + return db_->GetCurrentWalFile(current_log_file); + } + + virtual Status GetCreationTimeOfOldestFile( + uint64_t* creation_time) override { + return db_->GetCreationTimeOfOldestFile(creation_time); + } + virtual Status DeleteFile(std::string name) override { return db_->DeleteFile(name); } @@ -394,6 +447,12 @@ class StackableDB : public DB { return db_->DefaultColumnFamily(); } +#ifndef ROCKSDB_LITE + Status TryCatchUpWithPrimary() override { + return db_->TryCatchUpWithPrimary(); + } +#endif // ROCKSDB_LITE + protected: DB* db_; std::shared_ptr shared_db_ptr_; diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h index a3f9f6303cb..44ce2801952 100644 --- a/include/rocksdb/utilities/transaction.h +++ b/include/rocksdb/utilities/transaction.h @@ -52,6 +52,10 @@ class TransactionNotifier { // -Support for using Transactions with DBWithTTL class Transaction { public: + // No copying allowed + Transaction(const Transaction&) = delete; + void operator=(const Transaction&) = delete; + virtual ~Transaction() {} // If a transaction has a snapshot set, the transaction will ensure that @@ -131,7 +135,7 @@ class Transaction { // Status::Busy() may be returned if the transaction could not guarantee // that there are no write conflicts. Status::TryAgain() may be returned // if the memtable history size is not large enough - // (See max_write_buffer_number_to_maintain). + // (See max_write_buffer_size_to_maintain). // // If this transaction was created by a TransactionDB(), Status::Expired() // may be returned if this transaction has lived for longer than @@ -243,7 +247,7 @@ class Transaction { // Status::Busy() if there is a write conflict, // Status::TimedOut() if a lock could not be acquired, // Status::TryAgain() if the memtable history size is not large enough - // (See max_write_buffer_number_to_maintain) + // (See max_write_buffer_size_to_maintain) // Status::MergeInProgress() if merge operations cannot be resolved. // or other errors if this key could not be read. virtual Status GetForUpdate(const ReadOptions& options, @@ -320,7 +324,7 @@ class Transaction { // Status::Busy() if there is a write conflict, // Status::TimedOut() if a lock could not be acquired, // Status::TryAgain() if the memtable history size is not large enough - // (See max_write_buffer_number_to_maintain) + // (See max_write_buffer_size_to_maintain) // or other errors on unexpected failures. 
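A sketch of starting a block cache trace through the DB; NewFileTraceWriter is assumed to come from rocksdb/trace_reader_writer.h, and the output path is illustrative:

#include <memory>
#include <utility>
#include "rocksdb/db.h"
#include "rocksdb/trace_reader_writer.h"

// Record block cache accesses to a file, e.g. for block_cache_trace_analyzer.
rocksdb::Status TraceBlockCache(rocksdb::DB* db, rocksdb::Env* env) {
  std::unique_ptr<rocksdb::TraceWriter> trace_writer;
  rocksdb::Status s = rocksdb::NewFileTraceWriter(
      env, rocksdb::EnvOptions(), "/tmp/block_cache_trace", &trace_writer);
  if (!s.ok()) {
    return s;
  }
  rocksdb::TraceOptions trace_options;  // default sampling and filter
  s = db->StartBlockCacheTrace(trace_options, std::move(trace_writer));
  // ... run the workload, then stop tracing:
  // s = db->EndBlockCacheTrace();
  return s;
}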
virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value, const bool assume_tracked = false) = 0; @@ -522,12 +526,13 @@ class Transaction { id_ = id; } + virtual uint64_t GetLastLogNumber() const { return log_number_; } + private: friend class PessimisticTransactionDB; friend class WriteUnpreparedTxnDB; - // No copying allowed - Transaction(const Transaction&); - void operator=(const Transaction&); + friend class TransactionTest_TwoPhaseLogRollingTest_Test; + friend class TransactionTest_TwoPhaseLogRollingTest2_Test; }; } // namespace rocksdb diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index 6c4346ff3e7..91a9cec2856 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -94,14 +94,32 @@ struct TransactionDBOptions { // for the special way that myrocks uses this operands. bool rollback_merge_operands = false; + // If true, the TransactionDB implementation might skip concurrency control + // unless it is overridden by TransactionOptions or + // TransactionDBWriteOptimizations. This can be used in conjuction with + // DBOptions::unordered_write when the TransactionDB is used solely for write + // ordering rather than concurrency control. + bool skip_concurrency_control = false; + + // This option is only valid for write unprepared. If a write batch exceeds + // this threshold, then the transaction will implicitly flush the currently + // pending writes into the database. A value of 0 or less means no limit. + int64_t default_write_batch_flush_threshold = 0; + private: // 128 entries size_t wp_snapshot_cache_bits = static_cast(7); // 8m entry, 64MB size size_t wp_commit_cache_bits = static_cast(23); + // For testing, whether transaction name should be auto-generated or not. This + // is useful for write unprepared which requires named transactions. + bool autogenerate_name = false; + friend class WritePreparedTxnDB; + friend class WriteUnpreparedTxn; friend class WritePreparedTransactionTestBase; + friend class TransactionTestBase; friend class MySQLStyleTransactionTest; }; @@ -155,6 +173,11 @@ struct TransactionOptions { // back/commit before new transactions start. // Default: false bool skip_concurrency_control = false; + + // See TransactionDBOptions::default_write_batch_flush_threshold for + // description. If a negative value is specified, then the default value from + // TransactionDBOptions is used. + int64_t write_batch_flush_threshold = -1; }; // The per-write optimizations that do not involve transactions. 
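A sketch of the combination suggested in the skip_concurrency_control comment above: unordered_write in DBOptions plus a write-prepared TransactionDB used only for write ordering. The path is illustrative:

#include "rocksdb/options.h"
#include "rocksdb/utilities/transaction_db.h"

rocksdb::Status OpenOrderingOnlyTxnDB(rocksdb::TransactionDB** txn_db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.unordered_write = true;
  options.two_write_queues = true;
  rocksdb::TransactionDBOptions txn_db_options;
  txn_db_options.write_policy = rocksdb::TxnDBWritePolicy::WRITE_PREPARED;
  txn_db_options.skip_concurrency_control = true;
  return rocksdb::TransactionDB::Open(options, txn_db_options,
                                      "/tmp/rocksdb_txn_example", txn_db);
}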
TransactionDB @@ -278,11 +301,9 @@ class TransactionDB : public StackableDB { // To Create an TransactionDB, call Open() // The ownership of db is transferred to the base StackableDB explicit TransactionDB(DB* db) : StackableDB(db) {} - - private: // No copying allowed - TransactionDB(const TransactionDB&); - void operator=(const TransactionDB&); + TransactionDB(const TransactionDB&) = delete; + void operator=(const TransactionDB&) = delete; }; } // namespace rocksdb diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index 34e6c46895c..a0b3bac99df 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -100,6 +100,8 @@ class WriteBatchWithIndex : public WriteBatchBase { size_t max_bytes = 0); ~WriteBatchWithIndex() override; + WriteBatchWithIndex(WriteBatchWithIndex&&); + WriteBatchWithIndex& operator=(WriteBatchWithIndex&&); using WriteBatchBase::Put; Status Put(ColumnFamilyHandle* column_family, const Slice& key, @@ -163,7 +165,8 @@ class WriteBatchWithIndex : public WriteBatchBase { // the write batch update finishes. The state may recover after Next() is // called. Iterator* NewIteratorWithBase(ColumnFamilyHandle* column_family, - Iterator* base_iterator); + Iterator* base_iterator, + const ReadOptions* opts = nullptr); // default column family Iterator* NewIteratorWithBase(Iterator* base_iterator); diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 2e8b496819c..24a0527897e 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -5,8 +5,8 @@ #pragma once #define ROCKSDB_MAJOR 6 -#define ROCKSDB_MINOR 1 -#define ROCKSDB_PATCH 1 +#define ROCKSDB_MINOR 6 +#define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. 
We'll deprecate these diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index 29b660d1987..0b42b9bd577 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -28,6 +28,7 @@ #include #include #include +#include #include "rocksdb/status.h" #include "rocksdb/write_batch_base.h" @@ -60,6 +61,7 @@ struct SavePoint { class WriteBatch : public WriteBatchBase { public: explicit WriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0); + explicit WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz); ~WriteBatch() override; using WriteBatchBase::Put; @@ -269,7 +271,7 @@ class WriteBatch : public WriteBatchBase { virtual bool Continue(); protected: - friend class WriteBatch; + friend class WriteBatchInternal; virtual bool WriteAfterCommit() const { return true; } virtual bool WriteBeforePrepare() const { return false; } }; @@ -282,7 +284,7 @@ class WriteBatch : public WriteBatchBase { size_t GetDataSize() const { return rep_.size(); } // Returns the number of updates in the batch - int Count() const; + uint32_t Count() const; // Returns true if PutCF will be called during Iterate bool HasPut() const; @@ -311,6 +313,12 @@ class WriteBatch : public WriteBatchBase { // Returns trie if MarkRollback will be called during Iterate bool HasRollback() const; + // Assign timestamp to write batch + Status AssignTimestamp(const Slice& ts); + + // Assign timestamps to write batch + Status AssignTimestamps(const std::vector& ts_list); + using WriteBatchBase::GetWriteBatch; WriteBatch* GetWriteBatch() override { return this; } @@ -361,6 +369,7 @@ class WriteBatch : public WriteBatchBase { protected: std::string rep_; // See comment in write_batch.cc for the format of rep_ + const size_t timestamp_size_; // Intentionally copyable }; diff --git a/include/rocksdb/write_buffer_manager.h b/include/rocksdb/write_buffer_manager.h index dea904c187e..a6c204a633b 100644 --- a/include/rocksdb/write_buffer_manager.h +++ b/include/rocksdb/write_buffer_manager.h @@ -26,6 +26,10 @@ class WriteBufferManager { // the memory allocated to the cache. It can be used even if _buffer_size = 0. 
explicit WriteBufferManager(size_t _buffer_size, std::shared_ptr cache = {}); + // No copying allowed + WriteBufferManager(const WriteBufferManager&) = delete; + WriteBufferManager& operator=(const WriteBufferManager&) = delete; + ~WriteBufferManager(); bool enabled() const { return buffer_size_ != 0; } @@ -94,9 +98,5 @@ class WriteBufferManager { void ReserveMemWithCache(size_t mem); void FreeMemWithCache(size_t mem); - - // No copying allowed - WriteBufferManager(const WriteBufferManager&) = delete; - WriteBufferManager& operator=(const WriteBufferManager&) = delete; }; } // namespace rocksdb diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt index 360951834a7..b1f706c161e 100644 --- a/java/CMakeLists.txt +++ b/java/CMakeLists.txt @@ -1,5 +1,9 @@ cmake_minimum_required(VERSION 3.4) +if(${CMAKE_VERSION} VERSION_LESS "3.11.4") + message("Please consider switching to CMake 3.11.4 or newer") +endif() + set(JNI_NATIVE_SOURCES rocksjni/backupablejni.cc rocksjni/backupenginejni.cc @@ -11,9 +15,9 @@ set(JNI_NATIVE_SOURCES rocksjni/compaction_filter.cc rocksjni/compaction_filter_factory.cc rocksjni/compaction_filter_factory_jnicallback.cc - rocksjni/compaction_job_info.cc - rocksjni/compaction_job_stats.cc - rocksjni/compaction_options.cc + rocksjni/compaction_job_info.cc + rocksjni/compaction_job_stats.cc + rocksjni/compaction_options.cc rocksjni/compaction_options_fifo.cc rocksjni/compaction_options_universal.cc rocksjni/compact_range_options.cc @@ -47,6 +51,8 @@ set(JNI_NATIVE_SOURCES rocksjni/snapshot.cc rocksjni/sst_file_manager.cc rocksjni/sst_file_writerjni.cc + rocksjni/sst_file_readerjni.cc + rocksjni/sst_file_reader_iterator.cc rocksjni/statistics.cc rocksjni/statisticsjni.cc rocksjni/table.cc @@ -72,125 +78,9 @@ set(JNI_NATIVE_SOURCES rocksjni/write_buffer_manager.cc ) -set(NATIVE_JAVA_CLASSES - org.rocksdb.AbstractCompactionFilter - org.rocksdb.AbstractCompactionFilterFactory - org.rocksdb.AbstractComparator - org.rocksdb.AbstractImmutableNativeReference - org.rocksdb.AbstractNativeReference - org.rocksdb.AbstractRocksIterator - org.rocksdb.AbstractSlice - org.rocksdb.AbstractTableFilter - org.rocksdb.AbstractTraceWriter - org.rocksdb.AbstractTransactionNotifier - org.rocksdb.AbstractWalFilter - org.rocksdb.BackupableDBOptions - org.rocksdb.BackupEngine - org.rocksdb.BlockBasedTableConfig - org.rocksdb.BloomFilter - org.rocksdb.CassandraCompactionFilter - org.rocksdb.CassandraValueMergeOperator - org.rocksdb.Checkpoint - org.rocksdb.ClockCache - org.rocksdb.ColumnFamilyHandle - org.rocksdb.ColumnFamilyOptions - org.rocksdb.CompactionJobInfo - org.rocksdb.CompactionJobStats - org.rocksdb.CompactionOptions - org.rocksdb.CompactionOptionsFIFO - org.rocksdb.CompactionOptionsUniversal - org.rocksdb.CompactRangeOptions - org.rocksdb.Comparator - org.rocksdb.ComparatorOptions - org.rocksdb.CompressionOptions - org.rocksdb.DBOptions - org.rocksdb.DirectComparator - org.rocksdb.DirectSlice - org.rocksdb.Env - org.rocksdb.EnvOptions - org.rocksdb.Filter - org.rocksdb.FlushOptions - org.rocksdb.HashLinkedListMemTableConfig - org.rocksdb.HashSkipListMemTableConfig - org.rocksdb.HdfsEnv - org.rocksdb.IngestExternalFileOptions - org.rocksdb.Logger - org.rocksdb.LRUCache - org.rocksdb.MemoryUtil - org.rocksdb.MemTableConfig - org.rocksdb.NativeComparatorWrapper - org.rocksdb.NativeLibraryLoader - org.rocksdb.OptimisticTransactionDB - org.rocksdb.OptimisticTransactionOptions - org.rocksdb.Options - org.rocksdb.OptionsUtil - org.rocksdb.PersistentCache - org.rocksdb.PlainTableConfig - 
org.rocksdb.RateLimiter - org.rocksdb.ReadOptions - org.rocksdb.RemoveEmptyValueCompactionFilter - org.rocksdb.RestoreOptions - org.rocksdb.RocksCallbackObject - org.rocksdb.RocksDB - org.rocksdb.RocksEnv - org.rocksdb.RocksIterator - org.rocksdb.RocksIteratorInterface - org.rocksdb.RocksMemEnv - org.rocksdb.RocksMutableObject - org.rocksdb.RocksObject - org.rocksdb.SkipListMemTableConfig - org.rocksdb.Slice - org.rocksdb.Snapshot - org.rocksdb.SstFileManager - org.rocksdb.SstFileWriter - org.rocksdb.Statistics - org.rocksdb.StringAppendOperator - org.rocksdb.TableFormatConfig - org.rocksdb.ThreadStatus - org.rocksdb.TimedEnv - org.rocksdb.Transaction - org.rocksdb.TransactionDB - org.rocksdb.TransactionDBOptions - org.rocksdb.TransactionLogIterator - org.rocksdb.TransactionOptions - org.rocksdb.TtlDB - org.rocksdb.UInt64AddOperator - org.rocksdb.VectorMemTableConfig - org.rocksdb.WBWIRocksIterator - org.rocksdb.WriteBatch - org.rocksdb.WriteBatch.Handler - org.rocksdb.WriteBatchInterface - org.rocksdb.WriteBatchWithIndex - org.rocksdb.WriteOptions - org.rocksdb.NativeComparatorWrapperTest - org.rocksdb.RocksDBExceptionTest - org.rocksdb.SnapshotTest - org.rocksdb.WriteBatchTest - org.rocksdb.WriteBatchTestInternalHelper - org.rocksdb.WriteBufferManager -) - -include(FindJava) -include(UseJava) -include(FindJNI) - -include_directories(${JNI_INCLUDE_DIRS}) -include_directories(${PROJECT_SOURCE_DIR}/java) - -set(JAVA_TEST_LIBDIR ${PROJECT_SOURCE_DIR}/java/test-libs) -set(JAVA_TMP_JAR ${JAVA_TEST_LIBDIR}/tmp.jar) -set(JAVA_JUNIT_JAR ${JAVA_TEST_LIBDIR}/junit-4.12.jar) -set(JAVA_HAMCR_JAR ${JAVA_TEST_LIBDIR}/hamcrest-core-1.3.jar) -set(JAVA_MOCKITO_JAR ${JAVA_TEST_LIBDIR}/mockito-all-1.10.19.jar) -set(JAVA_CGLIB_JAR ${JAVA_TEST_LIBDIR}/cglib-2.2.2.jar) -set(JAVA_ASSERTJ_JAR ${JAVA_TEST_LIBDIR}/assertj-core-1.7.1.jar) -set(JAVA_TESTCLASSPATH ${JAVA_JUNIT_JAR} ${JAVA_HAMCR_JAR} ${JAVA_MOCKITO_JAR} ${JAVA_CGLIB_JAR} ${JAVA_ASSERTJ_JAR}) - -add_jar( - rocksdbjni_classes - SOURCES - src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java +set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/AbstractCompactionFilter.java + src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java src/main/java/org/rocksdb/AbstractComparator.java src/main/java/org/rocksdb/AbstractImmutableNativeReference.java src/main/java/org/rocksdb/AbstractMutableOptions.java @@ -306,6 +196,8 @@ add_jar( src/main/java/org/rocksdb/SstFileManager.java src/main/java/org/rocksdb/SstFileMetaData.java src/main/java/org/rocksdb/SstFileWriter.java + src/main/java/org/rocksdb/SstFileReader.java + src/main/java/org/rocksdb/SstFileReaderIterator.java src/main/java/org/rocksdb/StateType.java src/main/java/org/rocksdb/StatisticsCollectorCallback.java src/main/java/org/rocksdb/StatisticsCollector.java @@ -338,8 +230,8 @@ add_jar( src/main/java/org/rocksdb/WalProcessingOption.java src/main/java/org/rocksdb/WALRecoveryMode.java src/main/java/org/rocksdb/WBWIRocksIterator.java - src/main/java/org/rocksdb/WriteBatchInterface.java src/main/java/org/rocksdb/WriteBatch.java + src/main/java/org/rocksdb/WriteBatchInterface.java src/main/java/org/rocksdb/WriteBatchWithIndex.java src/main/java/org/rocksdb/WriteOptions.java src/main/java/org/rocksdb/WriteBufferManager.java @@ -348,6 +240,10 @@ add_jar( src/main/java/org/rocksdb/util/Environment.java src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java src/main/java/org/rocksdb/util/SizeUnit.java + src/main/java/org/rocksdb/UInt64AddOperator.java +) + +set(JAVA_TEST_CLASSES 
src/test/java/org/rocksdb/BackupEngineTest.java src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java src/test/java/org/rocksdb/NativeComparatorWrapperTest.java @@ -355,13 +251,59 @@ add_jar( src/test/java/org/rocksdb/RocksDBExceptionTest.java src/test/java/org/rocksdb/RocksMemoryResource.java src/test/java/org/rocksdb/SnapshotTest.java - src/main/java/org/rocksdb/UInt64AddOperator.java src/test/java/org/rocksdb/WriteBatchTest.java src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java src/test/java/org/rocksdb/util/WriteBatchGetter.java - INCLUDE_JARS ${JAVA_TESTCLASSPATH} ) +include(FindJava) +include(UseJava) +find_package(JNI) + +include_directories(${JNI_INCLUDE_DIRS}) +include_directories(${PROJECT_SOURCE_DIR}/java) + +set(JAVA_TEST_LIBDIR ${PROJECT_SOURCE_DIR}/java/test-libs) +set(JAVA_TMP_JAR ${JAVA_TEST_LIBDIR}/tmp.jar) +set(JAVA_JUNIT_JAR ${JAVA_TEST_LIBDIR}/junit-4.12.jar) +set(JAVA_HAMCR_JAR ${JAVA_TEST_LIBDIR}/hamcrest-core-1.3.jar) +set(JAVA_MOCKITO_JAR ${JAVA_TEST_LIBDIR}/mockito-all-1.10.19.jar) +set(JAVA_CGLIB_JAR ${JAVA_TEST_LIBDIR}/cglib-2.2.2.jar) +set(JAVA_ASSERTJ_JAR ${JAVA_TEST_LIBDIR}/assertj-core-1.7.1.jar) +set(JAVA_TESTCLASSPATH ${JAVA_JUNIT_JAR} ${JAVA_HAMCR_JAR} ${JAVA_MOCKITO_JAR} ${JAVA_CGLIB_JAR} ${JAVA_ASSERTJ_JAR}) + +set(JNI_OUTPUT_DIR ${PROJECT_SOURCE_DIR}/java/include) +file(MAKE_DIRECTORY ${JNI_OUTPUT_DIR}) + +if(${Java_VERSION_MAJOR} VERSION_GREATER_EQUAL "10" AND ${CMAKE_VERSION} VERSION_LESS "3.11.4") + # Java 10 and newer don't have javah, but the alternative GENERATE_NATIVE_HEADERS requires CMake 3.11.4 or newer + message(FATAL_ERROR "Detected Java 10 or newer (${Java_VERSION_STRING}), to build with CMake please upgrade CMake to 3.11.4 or newer") + +elseif(${CMAKE_VERSION} VERSION_LESS "3.11.4" OR (${Java_VERSION_MINOR} STREQUAL "7" AND ${Java_VERSION_MAJOR} STREQUAL "1")) + # Old CMake or Java 1.7 prepare the JAR... + message("Preparing Jar for Java 7") + add_jar( + rocksdbjni_classes + SOURCES + ${JAVA_MAIN_CLASSES} + ${JAVA_TEST_CLASSES} + INCLUDE_JARS ${JAVA_TESTCLASSPATH} + ) + +else () + # Java 1.8 or newer prepare the JAR... 
+ message("Preparing Jar for JDK ${Java_VERSION_STRING}") + add_jar( + rocksdbjni_classes + SOURCES + ${JAVA_MAIN_CLASSES} + ${JAVA_TEST_CLASSES} + INCLUDE_JARS ${JAVA_TESTCLASSPATH} + GENERATE_NATIVE_HEADERS rocksdbjni_headers DESTINATION ${JNI_OUTPUT_DIR} + ) + +endif() + if(NOT EXISTS ${PROJECT_SOURCE_DIR}/java/classes) file(MAKE_DIRECTORY ${PROJECT_SOURCE_DIR}/java/classes) endif() @@ -424,15 +366,116 @@ if(NOT EXISTS ${JAVA_ASSERTJ_JAR}) file(RENAME ${JAVA_TMP_JAR} ${JAVA_ASSERTJ_JAR}) endif() -set(JNI_OUTPUT_DIR ${PROJECT_SOURCE_DIR}/java/include) +if(${CMAKE_VERSION} VERSION_LESS "3.11.4" OR (${Java_VERSION_MINOR} STREQUAL "7" AND ${Java_VERSION_MAJOR} STREQUAL "1")) + # Old CMake or Java 1.7 ONLY generate JNI headers, Java 1.8+ JNI is handled in add_jar step above + message("Preparing JNI headers for Java 7") + set(NATIVE_JAVA_CLASSES + org.rocksdb.AbstractCompactionFilter + org.rocksdb.AbstractCompactionFilterFactory + org.rocksdb.AbstractComparator + org.rocksdb.AbstractImmutableNativeReference + org.rocksdb.AbstractNativeReference + org.rocksdb.AbstractRocksIterator + org.rocksdb.AbstractSlice + org.rocksdb.AbstractTableFilter + org.rocksdb.AbstractTraceWriter + org.rocksdb.AbstractTransactionNotifier + org.rocksdb.AbstractWalFilter + org.rocksdb.BackupableDBOptions + org.rocksdb.BackupEngine + org.rocksdb.BlockBasedTableConfig + org.rocksdb.BloomFilter + org.rocksdb.CassandraCompactionFilter + org.rocksdb.CassandraValueMergeOperator + org.rocksdb.Checkpoint + org.rocksdb.ClockCache + org.rocksdb.ColumnFamilyHandle + org.rocksdb.ColumnFamilyOptions + org.rocksdb.CompactionJobInfo + org.rocksdb.CompactionJobStats + org.rocksdb.CompactionOptions + org.rocksdb.CompactionOptionsFIFO + org.rocksdb.CompactionOptionsUniversal + org.rocksdb.CompactRangeOptions + org.rocksdb.Comparator + org.rocksdb.ComparatorOptions + org.rocksdb.CompressionOptions + org.rocksdb.DBOptions + org.rocksdb.DirectComparator + org.rocksdb.DirectSlice + org.rocksdb.Env + org.rocksdb.EnvOptions + org.rocksdb.Filter + org.rocksdb.FlushOptions + org.rocksdb.HashLinkedListMemTableConfig + org.rocksdb.HashSkipListMemTableConfig + org.rocksdb.HdfsEnv + org.rocksdb.IngestExternalFileOptions + org.rocksdb.Logger + org.rocksdb.LRUCache + org.rocksdb.MemoryUtil + org.rocksdb.MemTableConfig + org.rocksdb.NativeComparatorWrapper + org.rocksdb.NativeLibraryLoader + org.rocksdb.OptimisticTransactionDB + org.rocksdb.OptimisticTransactionOptions + org.rocksdb.Options + org.rocksdb.OptionsUtil + org.rocksdb.PersistentCache + org.rocksdb.PlainTableConfig + org.rocksdb.RateLimiter + org.rocksdb.ReadOptions + org.rocksdb.RemoveEmptyValueCompactionFilter + org.rocksdb.RestoreOptions + org.rocksdb.RocksCallbackObject + org.rocksdb.RocksDB + org.rocksdb.RocksEnv + org.rocksdb.RocksIterator + org.rocksdb.RocksIteratorInterface + org.rocksdb.RocksMemEnv + org.rocksdb.RocksMutableObject + org.rocksdb.RocksObject + org.rocksdb.SkipListMemTableConfig + org.rocksdb.Slice + org.rocksdb.Snapshot + org.rocksdb.SstFileManager + org.rocksdb.SstFileWriter + org.rocksdb.SstFileReader + org.rocksdb.SstFileReaderIterator + org.rocksdb.Statistics + org.rocksdb.StringAppendOperator + org.rocksdb.TableFormatConfig + org.rocksdb.ThreadStatus + org.rocksdb.TimedEnv + org.rocksdb.Transaction + org.rocksdb.TransactionDB + org.rocksdb.TransactionDBOptions + org.rocksdb.TransactionLogIterator + org.rocksdb.TransactionOptions + org.rocksdb.TtlDB + org.rocksdb.UInt64AddOperator + org.rocksdb.VectorMemTableConfig + org.rocksdb.WBWIRocksIterator + 
org.rocksdb.WriteBatch + org.rocksdb.WriteBatch.Handler + org.rocksdb.WriteBatchInterface + org.rocksdb.WriteBatchWithIndex + org.rocksdb.WriteOptions + org.rocksdb.NativeComparatorWrapperTest + org.rocksdb.RocksDBExceptionTest + org.rocksdb.SnapshotTest + org.rocksdb.WriteBatchTest + org.rocksdb.WriteBatchTestInternalHelper + org.rocksdb.WriteBufferManager + ) -file(MAKE_DIRECTORY ${JNI_OUTPUT_DIR}) -create_javah( - TARGET rocksdbjni_headers - CLASSES ${NATIVE_JAVA_CLASSES} - CLASSPATH rocksdbjni_classes ${JAVA_TESTCLASSPATH} - OUTPUT_DIR ${JNI_OUTPUT_DIR} -) + create_javah( + TARGET rocksdbjni_headers + CLASSES ${NATIVE_JAVA_CLASSES} + CLASSPATH rocksdbjni_classes ${JAVA_TESTCLASSPATH} + OUTPUT_DIR ${JNI_OUTPUT_DIR} + ) +endif() if(NOT MSVC) set_property(TARGET ${ROCKSDB_STATIC_LIB} PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/java/Makefile b/java/Makefile index efc9d2b4e11..f8642c2d612 100644 --- a/java/Makefile +++ b/java/Makefile @@ -60,6 +60,8 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractCompactionFilter\ org.rocksdb.Slice\ org.rocksdb.SstFileManager\ org.rocksdb.SstFileWriter\ + org.rocksdb.SstFileReader\ + org.rocksdb.SstFileReaderIterator\ org.rocksdb.Statistics\ org.rocksdb.ThreadStatus\ org.rocksdb.TimedEnv\ @@ -156,6 +158,7 @@ JAVA_TESTS = org.rocksdb.BackupableDBOptionsTest\ org.rocksdb.SnapshotTest\ org.rocksdb.SstFileManagerTest\ org.rocksdb.SstFileWriterTest\ + org.rocksdb.SstFileReaderTest\ org.rocksdb.TableFilterTest\ org.rocksdb.TimedEnvTest\ org.rocksdb.TransactionTest\ @@ -229,12 +232,20 @@ javalib: java java_test javadocs java: $(AM_V_GEN)mkdir -p $(MAIN_CLASSES) +ifeq ($(shell java -version 2>&1 | grep 1.7.0 > /dev/null; printf $$?), 0) $(AM_V_at)javac $(JAVAC_ARGS) -d $(MAIN_CLASSES)\ $(MAIN_SRC)/org/rocksdb/util/*.java\ $(MAIN_SRC)/org/rocksdb/*.java +else + $(AM_V_at)javac $(JAVAC_ARGS) -h $(NATIVE_INCLUDE) -d $(MAIN_CLASSES)\ + $(MAIN_SRC)/org/rocksdb/util/*.java\ + $(MAIN_SRC)/org/rocksdb/*.java +endif $(AM_V_at)@cp ../HISTORY.md ./HISTORY-CPP.md $(AM_V_at)@rm -f ./HISTORY-CPP.md +ifeq ($(shell java -version 2>&1 | grep 1.7.0 > /dev/null; printf $$?), 0) $(AM_V_at)javah -cp $(MAIN_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES) +endif sample: java $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) @@ -276,11 +287,18 @@ resolve_test_deps: java_test: java resolve_test_deps $(AM_V_GEN)mkdir -p $(TEST_CLASSES) +ifeq ($(shell java -version 2>&1|grep 1.7.0 >/dev/null; printf $$?),0) $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -d $(TEST_CLASSES)\ $(TEST_SRC)/org/rocksdb/test/*.java\ $(TEST_SRC)/org/rocksdb/util/*.java\ $(TEST_SRC)/org/rocksdb/*.java $(AM_V_at)javah -cp $(MAIN_CLASSES):$(TEST_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_TEST_CLASSES) +else + $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -h $(NATIVE_INCLUDE) -d $(TEST_CLASSES)\ + $(TEST_SRC)/org/rocksdb/test/*.java\ + $(TEST_SRC)/org/rocksdb/util/*.java\ + $(TEST_SRC)/org/rocksdb/*.java +endif test: java java_test run_test diff --git a/java/RELEASE.md b/java/RELEASE.md index cb9aaf987b4..dda19455f3f 100644 --- a/java/RELEASE.md +++ b/java/RELEASE.md @@ -1,31 +1,36 @@ ## Cross-building -RocksDB can be built as a single self contained cross-platform JAR. The cross-platform jar can be usd on any 64-bit OSX system, 32-bit Linux system, or 64-bit Linux system. +RocksDB can be built as a single self contained cross-platform JAR. The cross-platform jar can be used on any 64-bit OSX system, 32-bit Linux system, or 64-bit Linux system. 
Building a cross-platform JAR requires: - * [Vagrant](https://www.vagrantup.com/) - * [Virtualbox](https://www.virtualbox.org/) + * [Docker](https://www.docker.com/docker-community) * A Mac OSX machine that can compile RocksDB. * Java 7 set as JAVA_HOME. Once you have these items, run this make command from RocksDB's root source directory: - make jclean clean rocksdbjavastaticrelease + make jclean clean rocksdbjavastaticreleasedocker -This command will build RocksDB natively on OSX, and will then spin up two Vagrant Virtualbox Ubuntu images to build RocksDB for both 32-bit and 64-bit Linux. +This command will build RocksDB natively on OSX, and will then spin up docker containers to build RocksDB for 32-bit and 64-bit Linux with glibc, and 32-bit and 64-bit Linux with musl libc. You can find all native binaries and JARs in the java/target directory upon completion: librocksdbjni-linux32.so librocksdbjni-linux64.so + librocksdbjni-linux64-musl.so + librocksdbjni-linux32-musl.so librocksdbjni-osx.jnilib - rocksdbjni-3.5.0-javadoc.jar - rocksdbjni-3.5.0-linux32.jar - rocksdbjni-3.5.0-linux64.jar - rocksdbjni-3.5.0-osx.jar - rocksdbjni-3.5.0-sources.jar - rocksdbjni-3.5.0.jar + rocksdbjni-x.y.z-javadoc.jar + rocksdbjni-x.y.z-linux32.jar + rocksdbjni-x.y.z-linux64.jar + rocksdbjni-x.y.z-linux64-musl.jar + rocksdbjni-x.y.z-linux32-musl.jar + rocksdbjni-x.y.z-osx.jar + rocksdbjni-x.y.z-sources.jar + rocksdbjni-x.y.z.jar + +Where x.y.z is the built version number of RocksDB. ## Maven publication diff --git a/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java b/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java index 67f6a5cc055..ff36c74a4c8 100644 --- a/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java +++ b/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java @@ -646,8 +646,8 @@ private void run() throws RocksDBException { currentTaskId++, randSeed_, num_, num_, writeOpt, 1)); break; case "fillbatch": - tasks.add(new WriteRandomTask( - currentTaskId++, randSeed_, num_ / 1000, num_, writeOpt, 1000)); + tasks.add( + new WriteSequentialTask(currentTaskId++, randSeed_, num_, num_, writeOpt, 1000)); break; case "fillrandom": tasks.add(new WriteRandomTask( @@ -901,27 +901,23 @@ public static void main(String[] args) throws Exception { } private enum Flag { - benchmarks( - Arrays.asList( - "fillseq", - "readrandom", - "fillrandom"), - "Comma-separated list of operations to run in the specified order\n" + - "\tActual benchmarks:\n" + - "\t\tfillseq -- write N values in sequential key order in async mode.\n" + - "\t\tfillrandom -- write N values in random key order in async mode.\n" + - "\t\tfillbatch -- write N/1000 batch where each batch has 1000 values\n" + - "\t\t in random key order in sync mode.\n" + - "\t\tfillsync -- write N/100 values in random key order in sync mode.\n" + - "\t\tfill100K -- write N/1000 100K values in random order in async mode.\n" + - "\t\treadseq -- read N times sequentially.\n" + - "\t\treadrandom -- read N times in random order.\n" + - "\t\treadhot -- read N times in random order from 1% section of DB.\n" + - "\t\treadwhilewriting -- measure the read performance of multiple readers\n" + - "\t\t with a bg single writer. 
The write rate of the bg\n" + - "\t\t is capped by --writes_per_second.\n" + - "\tMeta Operations:\n" + - "\t\tdelete -- delete DB") { + benchmarks(Arrays.asList("fillseq", "readrandom", "fillrandom"), + "Comma-separated list of operations to run in the specified order\n" + + "\tActual benchmarks:\n" + + "\t\tfillseq -- write N values in sequential key order in async mode.\n" + + "\t\tfillrandom -- write N values in random key order in async mode.\n" + + "\t\tfillbatch -- write N/1000 batch where each batch has 1000 values\n" + + "\t\t in sequential key order in sync mode.\n" + + "\t\tfillsync -- write N/100 values in random key order in sync mode.\n" + + "\t\tfill100K -- write N/1000 100K values in random order in async mode.\n" + + "\t\treadseq -- read N times sequentially.\n" + + "\t\treadrandom -- read N times in random order.\n" + + "\t\treadhot -- read N times in random order from 1% section of DB.\n" + + "\t\treadwhilewriting -- measure the read performance of multiple readers\n" + + "\t\t with a bg single writer. The write rate of the bg\n" + + "\t\t is capped by --writes_per_second.\n" + + "\tMeta Operations:\n" + + "\t\tdelete -- delete DB") { @Override public Object parseValue(String value) { return new ArrayList(Arrays.asList(value.split(","))); } diff --git a/java/crossbuild/Vagrantfile b/java/crossbuild/Vagrantfile index 4a321774888..0ee50de2ce1 100644 --- a/java/crossbuild/Vagrantfile +++ b/java/crossbuild/Vagrantfile @@ -7,11 +7,29 @@ VAGRANTFILE_API_VERSION = "2" Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| config.vm.define "linux32" do |linux32| - linux32.vm.box = "hansode/centos-6.7-i386" + linux32.vm.box = "bento/centos-6.10-i386" + linux32.vm.provision :shell, path: "build-linux-centos.sh" end config.vm.define "linux64" do |linux64| - linux64.vm.box = "hansode/centos-6.7-x86_64" + linux64.vm.box = "bento/centos-6.10" + linux64.vm.provision :shell, path: "build-linux-centos.sh" + end + + config.vm.define "linux32-musl" do |musl32| + musl32.vm.box = "alpine/alpine32" + musl32.vm.box_version = "3.6.0" + musl32.vm.provision :shell, path: "build-linux-alpine.sh" + end + + config.vm.define "linux64-musl" do |musl64| + musl64.vm.box = "generic/alpine36" + + ## Should use the alpine/alpine64 box, but this issue needs to be fixed first - https://github.com/hashicorp/vagrant/issues/11218 + # musl64.vm.box = "alpine/alpine64" + # musl64.vm.box_version = "3.6.0" + + musl64.vm.provision :shell, path: "build-linux-alpine.sh" end config.vm.provider "virtualbox" do |v| @@ -20,7 +38,13 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| v.customize ["modifyvm", :id, "--nictype1", "virtio" ] end - config.vm.provision :shell, path: "build-linux-centos.sh" + if Vagrant.has_plugin?("vagrant-cachier") + config.cache.scope = :box + end + if Vagrant.has_plugin?("vagrant-vbguest") + config.vbguest.no_install = true + end + config.vm.synced_folder "../target", "/rocksdb-build" config.vm.synced_folder "../..", "/rocksdb", type: "rsync" config.vm.boot_timeout = 1200 diff --git a/java/crossbuild/build-linux-alpine.sh b/java/crossbuild/build-linux-alpine.sh new file mode 100755 index 00000000000..561d34141ea --- /dev/null +++ b/java/crossbuild/build-linux-alpine.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+ +set -e + +# update Alpine with latest versions +echo '@edge http://nl.alpinelinux.org/alpine/edge/main' >> /etc/apk/repositories +echo '@community http://nl.alpinelinux.org/alpine/edge/community' >> /etc/apk/repositories +apk update +apk upgrade + +# install CA certificates +apk add ca-certificates + +# install build tools +apk add \ + build-base \ + coreutils \ + file \ + git \ + perl \ + automake \ + autoconf \ + cmake + +# install tool dependencies for building RocksDB static library +apk add \ + curl \ + bash \ + wget \ + tar \ + openssl + +# install RocksDB dependencies +apk add \ + snappy snappy-dev \ + zlib zlib-dev \ + bzip2 bzip2-dev \ + lz4 lz4-dev \ + zstd zstd-dev \ + linux-headers \ + jemalloc jemalloc-dev + +# install OpenJDK7 +apk add openjdk7 \ + && apk add java-cacerts \ + && rm /usr/lib/jvm/java-1.7-openjdk/jre/lib/security/cacerts \ + && ln -s /etc/ssl/certs/java/cacerts /usr/lib/jvm/java-1.7-openjdk/jre/lib/security/cacerts + +# cleanup +rm -rf /var/cache/apk/* + +# puts javac in the PATH +export JAVA_HOME=/usr/lib/jvm/java-1.7-openjdk +export PATH=/usr/lib/jvm/java-1.7-openjdk/bin:$PATH + +# gflags from source +cd /tmp &&\ + git clone -b v2.0 --single-branch https://github.com/gflags/gflags.git &&\ + cd gflags &&\ + ./configure --prefix=/usr && make && make install &&\ + rm -rf /tmp/* + + +# build rocksdb +cd /rocksdb +make jclean clean +PORTABLE=1 make -j8 rocksdbjavastatic +cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build +cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build diff --git a/java/crossbuild/build-linux-centos.sh b/java/crossbuild/build-linux-centos.sh index a9b5e0a9270..176e3456ce9 100755 --- a/java/crossbuild/build-linux-centos.sh +++ b/java/crossbuild/build-linux-centos.sh @@ -10,7 +10,11 @@ sudo rm -f /etc/yum/vars/releasever sudo yum -y install epel-release # install all required packages for rocksdb that are available through yum -sudo yum -y install openssl java-1.7.0-openjdk-devel zlib-devel bzip2-devel lz4-devel snappy-devel libzstd-devel jemalloc-devel +sudo yum -y install openssl java-1.7.0-openjdk-devel zlib-devel bzip2-devel lz4-devel snappy-devel libzstd-devel jemalloc-devel cmake3 + +# set up cmake3 as cmake binary +sudo alternatives --install /usr/local/bin/cmake cmake /usr/bin/cmake 10 --slave /usr/local/bin/ctest ctest /usr/bin/ctest --slave /usr/local/bin/cpack cpack /usr/bin/cpack --slave /usr/local/bin/ccmake ccmake /usr/bin/ccmake +sudo alternatives --install /usr/local/bin/cmake cmake /usr/bin/cmake3 20 --slave /usr/local/bin/ctest ctest /usr/bin/ctest3 --slave /usr/local/bin/cpack cpack /usr/bin/cpack3 --slave /usr/local/bin/ccmake ccmake /usr/bin/ccmake3 # install gcc/g++ 4.8.2 from tru/devtools-2 sudo wget -O /etc/yum.repos.d/devtools-2.repo https://people.centos.org/tru/devtools-2/devtools-2.repo @@ -24,9 +28,11 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib # set java home so we can build rocksdb jars export JAVA_HOME=/usr/lib/jvm/java-1.7.0 +export PATH=$JAVA_HOME:/usr/local/bin:$PATH + # build rocksdb cd /rocksdb -scl enable devtoolset-2 'make jclean clean' +scl enable devtoolset-2 'make clean-not-downloaded' scl enable devtoolset-2 'PORTABLE=1 make -j8 rocksdbjavastatic' cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build diff --git a/java/crossbuild/docker-build-linux-alpine.sh b/java/crossbuild/docker-build-linux-alpine.sh new file mode 100755 index 00000000000..e605c7716bc --- /dev/null +++ b/java/crossbuild/docker-build-linux-alpine.sh @@ 
-0,0 +1,18 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +set -e +#set -x + +# just in-case this is run outside Docker +mkdir -p /rocksdb-local-build + +rm -rf /rocksdb-local-build/* +cp -r /rocksdb-host/* /rocksdb-local-build +cd /rocksdb-local-build + +make clean-not-downloaded +PORTABLE=1 make rocksdbjavastatic + +cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar /rocksdb-java-target + diff --git a/java/crossbuild/docker-build-linux-centos.sh b/java/crossbuild/docker-build-linux-centos.sh index aedcb8af788..c4217785f2e 100755 --- a/java/crossbuild/docker-build-linux-centos.sh +++ b/java/crossbuild/docker-build-linux-centos.sh @@ -4,26 +4,31 @@ set -e #set -x -rm -rf /rocksdb-local -cp -r /rocksdb-host /rocksdb-local -cd /rocksdb-local +# just in-case this is run outside Docker +mkdir -p /rocksdb-local-build -# Use scl devtoolset if available (i.e. CentOS <7) +rm -rf /rocksdb-local-build/* +cp -r /rocksdb-host/* /rocksdb-local-build +cd /rocksdb-local-build + +# Use scl devtoolset if available if hash scl 2>/dev/null; then if scl --list | grep -q 'devtoolset-7'; then - scl enable devtoolset-7 'make jclean clean' - scl enable devtoolset-7 'PORTABLE=1 make -j6 rocksdbjavastatic' + # CentOS 7+ + scl enable devtoolset-7 'make clean-not-downloaded' + scl enable devtoolset-7 'PORTABLE=1 make -j2 rocksdbjavastatic' elif scl --list | grep -q 'devtoolset-2'; then - scl enable devtoolset-2 'make jclean clean' - scl enable devtoolset-2 'PORTABLE=1 make -j6 rocksdbjavastatic' + # CentOS 5 or 6 + scl enable devtoolset-2 'make clean-not-downloaded' + scl enable devtoolset-2 'PORTABLE=1 make -j2 rocksdbjavastatic' else echo "Could not find devtoolset" exit 1; fi else - make jclean clean - PORTABLE=1 make -j6 rocksdbjavastatic + make clean-not-downloaded + PORTABLE=1 make -j2 rocksdbjavastatic fi -cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar /rocksdb-host/java/target +cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar /rocksdb-java-target diff --git a/java/rocksjni.pom b/java/rocksjni.pom index 94f07551c36..5defdca7d46 100644 --- a/java/rocksjni.pom +++ b/java/rocksjni.pom @@ -10,7 +10,7 @@ rocksdbjni - - RocksDB fat jar that contains .so files for linux32 and linux64, jnilib files + RocksDB fat jar that contains .so files for linux32 and linux64 (glibc and musl-libc), jnilib files for Mac OSX, and a .dll for Windows x64. 
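For illustration only: the docker-build-linux-alpine.sh and docker-build-linux-centos.sh helpers above are written to run inside a container with the RocksDB source mounted read-only at /rocksdb-host and the output directory mounted at /rocksdb-java-target. A hypothetical manual invocation might look like the sketch below; the image name is a placeholder and is assumed to already contain the required build toolchain and JDK (in practice these containers are driven by the rocksdbjavastaticreleasedocker Make target mentioned in java/RELEASE.md above).

    # Hypothetical invocation, not defined by this change: run the Alpine (musl) build script
    # in a container, with the source tree read-only and artifacts written to java/target.
    docker run --rm \
      -v "$PWD":/rocksdb-host:ro \
      -v "$PWD/java/target":/rocksdb-java-target \
      rocksdb-java-build-image \
      /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh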
diff --git a/java/rocksjni/filter.cc b/java/rocksjni/filter.cc index 5e9c63643de..c4c275fb529 100644 --- a/java/rocksjni/filter.cc +++ b/java/rocksjni/filter.cc @@ -19,10 +19,10 @@ /* * Class: org_rocksdb_BloomFilter * Method: createBloomFilter - * Signature: (IZ)J + * Signature: (DZ)J */ jlong Java_org_rocksdb_BloomFilter_createNewBloomFilter( - JNIEnv* /*env*/, jclass /*jcls*/, jint bits_per_key, + JNIEnv* /*env*/, jclass /*jcls*/, jdouble bits_per_key, jboolean use_block_base_builder) { auto* sptr_filter = new std::shared_ptr( rocksdb::NewBloomFilterPolicy(bits_per_key, use_block_base_builder)); diff --git a/java/rocksjni/loggerjnicallback.cc b/java/rocksjni/loggerjnicallback.cc index 61571e98712..a731fdac96e 100644 --- a/java/rocksjni/loggerjnicallback.cc +++ b/java/rocksjni/loggerjnicallback.cc @@ -131,7 +131,6 @@ void LoggerJniCallback::Logv(const InfoLogLevel log_level, const char* format, } assert(format != nullptr); - assert(ap != nullptr); const std::unique_ptr msg = format_str(format, ap); // pass msg to java callback handler diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 12f44b5eb09..33d42646fad 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -1547,6 +1547,27 @@ jboolean Java_org_rocksdb_Options_enablePipelinedWrite( return static_cast(opt->enable_pipelined_write); } +/* + * Class: org_rocksdb_Options + * Method: setUnorderedWrite + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setUnorderedWrite( + JNIEnv*, jobject, jlong jhandle, jboolean unordered_write) { + reinterpret_cast(jhandle) + ->unordered_write = static_cast(unordered_write); +} + +/* + * Class: org_rocksdb_Options + * Method: unorderedWrite + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_unorderedWrite( + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle)->unordered_write; +} + /* * Class: org_rocksdb_Options * Method: setAllowConcurrentMemtableWrite @@ -5717,6 +5738,29 @@ jboolean Java_org_rocksdb_DBOptions_enablePipelinedWrite( return static_cast(opt->enable_pipelined_write); } +/* + * Class: org_rocksdb_DBOptions + * Method: setUnorderedWrite + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setUnorderedWrite( + JNIEnv*, jobject, jlong jhandle, jboolean junordered_write) { + auto* opt = reinterpret_cast(jhandle); + opt->unordered_write = junordered_write == JNI_TRUE; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: unorderedWrite + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_unorderedWrite( + JNIEnv*, jobject, jlong jhandle) { +auto* opt = reinterpret_cast(jhandle); +return static_cast(opt->unordered_write); +} + + /* * Class: org_rocksdb_DBOptions * Method: setEnableThreadTracking diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 193804ac318..e9dc3fb82b1 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -467,6 +467,8 @@ class StatusJni : public RocksDBNativeClass { return 0xC; case rocksdb::Status::Code::kTryAgain: return 0xD; + case rocksdb::Status::Code::kColumnFamilyDropped: + return 0xE; default: return 0x7F; // undefined } @@ -584,6 +586,12 @@ class StatusJni : public RocksDBNativeClass { new rocksdb::Status(rocksdb::Status::TryAgain( rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); break; + case 0xE: + // ColumnFamilyDropped + status = std::unique_ptr( + new rocksdb::Status(rocksdb::Status::ColumnFamilyDropped( + rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); + break; case 0x7F: default: return nullptr; @@ -4612,6 +4620,8 @@ class 
TickerTypeJni { return -0x0B; case rocksdb::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD: return -0x0C; + case rocksdb::Tickers::TXN_GET_TRY_AGAIN: + return -0x0D; case rocksdb::Tickers::TICKER_ENUM_MAX: // 0x5F for backwards compatibility on current minor version. return 0x5F; @@ -4904,6 +4914,8 @@ class TickerTypeJni { return rocksdb::Tickers::TXN_DUPLICATE_KEY_OVERHEAD; case -0x0C: return rocksdb::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD; + case -0x0D: + return rocksdb::Tickers::TXN_GET_TRY_AGAIN; case 0x5F: // 0x5F for backwards compatibility on current minor version. return rocksdb::Tickers::TICKER_ENUM_MAX; @@ -5894,8 +5906,10 @@ class IndexTypeJni { return 0x0; case rocksdb::BlockBasedTableOptions::IndexType::kHashSearch: return 0x1; - case rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch: + case rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch: return 0x2; + case rocksdb::BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey: + return 0x3; default: return 0x7F; // undefined } @@ -5912,6 +5926,9 @@ class IndexTypeJni { return rocksdb::BlockBasedTableOptions::IndexType::kHashSearch; case 0x2: return rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + case 0x3: + return rocksdb::BlockBasedTableOptions::IndexType:: + kBinarySearchWithFirstKey; default: // undefined/default return rocksdb::BlockBasedTableOptions::IndexType::kBinarySearch; diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 53224232c83..58c06fae8ad 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -18,6 +18,7 @@ #include "include/org_rocksdb_RocksDB.h" #include "rocksdb/cache.h" +#include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/types.h" @@ -3044,3 +3045,73 @@ void Java_org_rocksdb_RocksDB_destroyDB( rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } } + +bool get_slice_helper(JNIEnv* env, jobjectArray ranges, jsize index, + std::unique_ptr& slice, + std::vector>& ranges_to_free) { + jobject jArray = env->GetObjectArrayElement(ranges, index); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + return false; + } + + if (jArray == nullptr) { + return true; + } + + jbyteArray jba = reinterpret_cast(jArray); + jsize len_ba = env->GetArrayLength(jba); + ranges_to_free.push_back(std::unique_ptr(new jbyte[len_ba])); + env->GetByteArrayRegion(jba, 0, len_ba, ranges_to_free.back().get()); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jArray); + return false; + } + env->DeleteLocalRef(jArray); + slice.reset(new rocksdb::Slice( + reinterpret_cast(ranges_to_free.back().get()), len_ba)); + return true; +} +/* + * Class: org_rocksdb_RocksDB + * Method: deleteFilesInRanges + * Signature: (JJLjava/util/List;Z)V + */ +JNIEXPORT void JNICALL Java_org_rocksdb_RocksDB_deleteFilesInRanges( + JNIEnv* env, jobject /*jdb*/, jlong jdb_handle, jlong jcf_handle, + jobjectArray ranges, jboolean include_end) { + jsize length = env->GetArrayLength(ranges); + + std::vector rangesVector; + std::vector> slices; + std::vector> ranges_to_free; + for (jsize i = 0; (i + 1) < length; i += 2) { + slices.push_back(std::unique_ptr()); + if (!get_slice_helper(env, ranges, i, slices.back(), ranges_to_free)) { + // exception thrown + return; + } + + slices.push_back(std::unique_ptr()); + if (!get_slice_helper(env, ranges, i + 1, slices.back(), ranges_to_free)) { + // exception thrown + return; + } + + 
rangesVector.push_back(rocksdb::RangePtr(slices[slices.size() - 2].get(), + slices[slices.size() - 1].get())); + } + + auto* db = reinterpret_cast(jdb_handle); + auto* column_family = + reinterpret_cast(jcf_handle); + + rocksdb::Status s = rocksdb::DeleteFilesInRanges( + db, column_family == nullptr ? db->DefaultColumnFamily() : column_family, + rangesVector.data(), rangesVector.size(), include_end); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} diff --git a/java/rocksjni/sst_file_reader_iterator.cc b/java/rocksjni/sst_file_reader_iterator.cc new file mode 100644 index 00000000000..4cbbf04bdc6 --- /dev/null +++ b/java/rocksjni/sst_file_reader_iterator.cc @@ -0,0 +1,191 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ and enables +// calling c++ rocksdb::Iterator methods from Java side. + +#include +#include +#include + +#include "include/org_rocksdb_SstFileReaderIterator.h" +#include "rocksdb/iterator.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReaderIterator_disposeInternal(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { + auto* it = reinterpret_cast(handle); + assert(it != nullptr); + delete it; +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: isValid0 + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_SstFileReaderIterator_isValid0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { + return reinterpret_cast(handle)->Valid(); +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: seekToFirst0 + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReaderIterator_seekToFirst0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { + reinterpret_cast(handle)->SeekToFirst(); +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: seekToLast0 + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReaderIterator_seekToLast0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { + reinterpret_cast(handle)->SeekToLast(); +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: next0 + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReaderIterator_next0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { + reinterpret_cast(handle)->Next(); +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: prev0 + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReaderIterator_prev0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { + reinterpret_cast(handle)->Prev(); +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: seek0 + * Signature: (J[BI)V + */ +void Java_org_rocksdb_SstFileReaderIterator_seek0(JNIEnv* env, jobject /*jobj*/, + jlong handle, + jbyteArray jtarget, + jint jtarget_len) { + jbyte* target = env->GetByteArrayElements(jtarget, nullptr); + if (target == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + + rocksdb::Slice target_slice(reinterpret_cast(target), jtarget_len); + + auto* it = reinterpret_cast(handle); + it->Seek(target_slice); + + env->ReleaseByteArrayElements(jtarget, target, JNI_ABORT); +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: seekForPrev0 + * Signature: (J[BI)V + */ +void 
Java_org_rocksdb_SstFileReaderIterator_seekForPrev0(JNIEnv* env, + jobject /*jobj*/, + jlong handle, + jbyteArray jtarget, + jint jtarget_len) { + jbyte* target = env->GetByteArrayElements(jtarget, nullptr); + if (target == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + + rocksdb::Slice target_slice(reinterpret_cast(target), jtarget_len); + + auto* it = reinterpret_cast(handle); + it->SeekForPrev(target_slice); + + env->ReleaseByteArrayElements(jtarget, target, JNI_ABORT); +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: status0 + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReaderIterator_status0(JNIEnv* env, + jobject /*jobj*/, + jlong handle) { + auto* it = reinterpret_cast(handle); + rocksdb::Status s = it->status(); + + if (s.ok()) { + return; + } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: key0 + * Signature: (J)[B + */ +jbyteArray Java_org_rocksdb_SstFileReaderIterator_key0(JNIEnv* env, + jobject /*jobj*/, + jlong handle) { + auto* it = reinterpret_cast(handle); + rocksdb::Slice key_slice = it->key(); + + jbyteArray jkey = env->NewByteArray(static_cast(key_slice.size())); + if (jkey == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + env->SetByteArrayRegion( + jkey, 0, static_cast(key_slice.size()), + const_cast(reinterpret_cast(key_slice.data()))); + return jkey; +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: value0 + * Signature: (J)[B + */ +jbyteArray Java_org_rocksdb_SstFileReaderIterator_value0(JNIEnv* env, jobject /*jobj*/, + jlong handle) { + auto* it = reinterpret_cast(handle); + rocksdb::Slice value_slice = it->value(); + + jbyteArray jkeyValue = + env->NewByteArray(static_cast(value_slice.size())); + if(jkeyValue == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + env->SetByteArrayRegion(jkeyValue, 0, static_cast(value_slice.size()), + const_cast(reinterpret_cast(value_slice.data()))); + return jkeyValue; +} \ No newline at end of file diff --git a/java/rocksjni/sst_file_readerjni.cc b/java/rocksjni/sst_file_readerjni.cc new file mode 100644 index 00000000000..c8348c2e256 --- /dev/null +++ b/java/rocksjni/sst_file_readerjni.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ and enables +// calling C++ rocksdb::SstFileReader methods +// from Java side. 
+ +#include +#include + +#include "include/org_rocksdb_SstFileReader.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/sst_file_reader.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_SstFileReader + * Method: newSstFileReader + * Signature: (J)J + */ +jlong Java_org_rocksdb_SstFileReader_newSstFileReader(JNIEnv * /*env*/, + jclass /*jcls*/, + jlong joptions) { + auto *options = reinterpret_cast(joptions); + rocksdb::SstFileReader *sst_file_reader = + new rocksdb::SstFileReader(*options); + return reinterpret_cast(sst_file_reader); +} + +/* + * Class: org_rocksdb_SstFileReader + * Method: open + * Signature: (JLjava/lang/String;)V + */ +void Java_org_rocksdb_SstFileReader_open(JNIEnv *env, jobject /*jobj*/, + jlong jhandle, jstring jfile_path) { + const char *file_path = env->GetStringUTFChars(jfile_path, nullptr); + if (file_path == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + rocksdb::Status s = + reinterpret_cast(jhandle)->Open(file_path); + env->ReleaseStringUTFChars(jfile_path, file_path); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_SstFileReader + * Method: newIterator + * Signature: (JJ)J + */ +jlong Java_org_rocksdb_SstFileReader_newIterator(JNIEnv * /*env*/, + jobject /*jobj*/, + jlong jhandle, + jlong jread_options_handle) { + auto *sst_file_reader = reinterpret_cast(jhandle); + auto *read_options = + reinterpret_cast(jread_options_handle); + return reinterpret_cast(sst_file_reader->NewIterator(*read_options)); +} + +/* + * Class: org_rocksdb_SstFileReader + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReader_disposeInternal(JNIEnv * /*env*/, + jobject /*jobj*/, + jlong jhandle) { + delete reinterpret_cast(jhandle); +} + +/* + * Class: org_rocksdb_SstFileReader + * Method: verifyChecksum + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReader_verifyChecksum(JNIEnv *env, + jobject /*jobj*/, + jlong jhandle) { + auto *sst_file_reader = reinterpret_cast(jhandle); + auto s = sst_file_reader->VerifyChecksum(); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_SstFileReader + * Method: getTableProperties + * Signature: (J)J + */ +jobject Java_org_rocksdb_SstFileReader_getTableProperties(JNIEnv *env, + jobject /*jobj*/, + jlong jhandle) { + auto *sst_file_reader = reinterpret_cast(jhandle); + std::shared_ptr tp = + sst_file_reader->GetTableProperties(); + jobject jtable_properties = + rocksdb::TablePropertiesJni::fromCppTableProperties(env, *(tp.get())); + return jtable_properties; +} diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index 1ccc550ab62..a4504d917ab 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -85,7 +85,7 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( std::shared_ptr *pCache = reinterpret_cast *>(jblock_cache_handle); options.block_cache = *pCache; - } else if (jblock_cache_size > 0) { + } else if (jblock_cache_size >= 0) { if (jblock_cache_num_shard_bits > 0) { options.block_cache = rocksdb::NewLRUCache( static_cast(jblock_cache_size), @@ -94,6 +94,9 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.block_cache = rocksdb::NewLRUCache( static_cast(jblock_cache_size)); } + } else { + options.no_block_cache = true; + options.block_cache = nullptr; } } if (jpersistent_cache_handle > 0) { diff --git a/java/rocksjni/write_batch.cc 
b/java/rocksjni/write_batch.cc index f1b77446c02..c6d0b9072ae 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -11,6 +11,7 @@ #include "db/write_batch_internal.h" #include "include/org_rocksdb_WriteBatch.h" #include "include/org_rocksdb_WriteBatch_Handler.h" +#include "logging/logging.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" @@ -20,7 +21,6 @@ #include "rocksjni/portal.h" #include "rocksjni/writebatchhandlerjnicallback.h" #include "table/scoped_arena_iterator.h" -#include "util/logging.h" /* * Class: org_rocksdb_WriteBatch diff --git a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc index 266fb4abf74..4b51b12781b 100644 --- a/java/rocksjni/write_batch_test.cc +++ b/java/rocksjni/write_batch_test.cc @@ -22,8 +22,8 @@ #include "rocksdb/write_buffer_manager.h" #include "rocksjni/portal.h" #include "table/scoped_arena_iterator.h" +#include "test_util/testharness.h" #include "util/string_util.h" -#include "util/testharness.h" /* * Class: org_rocksdb_WriteBatchTest @@ -52,9 +52,9 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(JNIEnv* env, mem->Ref(); std::string state; rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem); - rocksdb::Status s = - rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default, nullptr); - int count = 0; + rocksdb::Status s = rocksdb::WriteBatchInternal::InsertInto( + b, &cf_mems_default, nullptr, nullptr); + unsigned int count = 0; rocksdb::Arena arena; rocksdb::ScopedArenaIterator iter( mem->NewIterator(rocksdb::ReadOptions(), &arena)); diff --git a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java index ac8550f3ef7..91e3b2fa2b0 100644 --- a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java @@ -13,9 +13,8 @@ * * Taken from include/rocksdb/advanced_options.h */ -public interface AdvancedColumnFamilyOptionsInterface - { - +public interface AdvancedColumnFamilyOptionsInterface< + T extends AdvancedColumnFamilyOptionsInterface> { /** * The minimum number of write buffers that will be merged together * before writing to storage. If set to 1, then diff --git a/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java index 3ec46712389..03a7b098352 100644 --- a/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java @@ -11,9 +11,8 @@ * Taken from include/rocksdb/advanced_options.h * and MutableCFOptions in util/cf_options.h */ -public interface AdvancedMutableColumnFamilyOptionsInterface - { - +public interface AdvancedMutableColumnFamilyOptionsInterface< + T extends AdvancedMutableColumnFamilyOptionsInterface> { /** * The maximum number of write buffers that are built up in memory. 
* The default is 2, so that when 1 write buffer is being flushed to diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java index 4c88a0224c6..bf5c0c1a921 100644 --- a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java @@ -725,7 +725,7 @@ public long blockCacheSize() { /** * Set the size of the cache in bytes that will be used by RocksDB. - * If cacheSize is non-positive, then cache will not be used. + * If cacheSize is negative, then cache will not be used. * DEFAULT: 8M * * @param blockCacheSize block cache size in bytes diff --git a/java/src/main/java/org/rocksdb/BloomFilter.java b/java/src/main/java/org/rocksdb/BloomFilter.java index 316c3ad838b..0a119878a46 100644 --- a/java/src/main/java/org/rocksdb/BloomFilter.java +++ b/java/src/main/java/org/rocksdb/BloomFilter.java @@ -20,7 +20,7 @@ */ public class BloomFilter extends Filter { - private static final int DEFAULT_BITS_PER_KEY = 10; + private static final double DEFAULT_BITS_PER_KEY = 10.0; private static final boolean DEFAULT_MODE = true; /** @@ -39,7 +39,7 @@ public BloomFilter() { * *

* bits_per_key: bits per key in bloom filter. A good value for bits_per_key - is 10, which yields a filter with ~ 1% false positive rate. + is 9.9, which yields a filter with ~ 1% false positive rate. * *
* Callers must delete the result after any database that is using the @@ -47,7 +47,7 @@ public BloomFilter() { * * @param bitsPerKey number of bits to use */ - public BloomFilter(final int bitsPerKey) { + public BloomFilter(final double bitsPerKey) { this(bitsPerKey, DEFAULT_MODE); } @@ -70,10 +70,10 @@ public BloomFilter(final int bitsPerKey) { * @param bitsPerKey number of bits to use * @param useBlockBasedMode use block based mode or full filter mode */ - public BloomFilter(final int bitsPerKey, final boolean useBlockBasedMode) { + public BloomFilter(final double bitsPerKey, final boolean useBlockBasedMode) { super(createNewBloomFilter(bitsPerKey, useBlockBasedMode)); } - private native static long createNewBloomFilter(final int bitsKeyKey, + private native static long createNewBloomFilter(final double bitsKeyKey, final boolean useBlockBasedMode); } diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java index f88a21af2b0..6d8dfd161c6 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java @@ -5,10 +5,8 @@ package org.rocksdb; -public interface ColumnFamilyOptionsInterface - - extends AdvancedColumnFamilyOptionsInterface { - +public interface ColumnFamilyOptionsInterface> + extends AdvancedColumnFamilyOptionsInterface { /** * Use this if your DB is very small (like under 1GB) and you don't want to * spend lots of memory for memtables. diff --git a/java/src/main/java/org/rocksdb/DBOptions.java b/java/src/main/java/org/rocksdb/DBOptions.java index e2c4c02b32e..fc413c76a68 100644 --- a/java/src/main/java/org/rocksdb/DBOptions.java +++ b/java/src/main/java/org/rocksdb/DBOptions.java @@ -872,6 +872,18 @@ public boolean enablePipelinedWrite() { return enablePipelinedWrite(nativeHandle_); } + @Override + public DBOptions setUnorderedWrite(final boolean unorderedWrite) { + setUnorderedWrite(nativeHandle_, unorderedWrite); + return this; + } + + @Override + public boolean unorderedWrite() { + return unorderedWrite(nativeHandle_); + } + + @Override public DBOptions setAllowConcurrentMemtableWrite( final boolean allowConcurrentMemtableWrite) { @@ -1266,6 +1278,9 @@ private native void setEnableThreadTracking(long handle, private native void setEnablePipelinedWrite(final long handle, final boolean enablePipelinedWrite); private native boolean enablePipelinedWrite(final long handle); + private native void setUnorderedWrite(final long handle, + final boolean unorderedWrite); + private native boolean unorderedWrite(final long handle); private native void setAllowConcurrentMemtableWrite(long handle, boolean allowConcurrentMemtableWrite); private native boolean allowConcurrentMemtableWrite(long handle); diff --git a/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/java/src/main/java/org/rocksdb/DBOptionsInterface.java index af9aa179bf4..a26449c5d38 100644 --- a/java/src/main/java/org/rocksdb/DBOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/DBOptionsInterface.java @@ -8,8 +8,7 @@ import java.util.Collection; import java.util.List; -public interface DBOptionsInterface { - +public interface DBOptionsInterface> { /** * Use this if your DB is very small (like under 1GB) and you don't want to * spend lots of memory for memtables. 
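For illustration only: a minimal, hypothetical sketch of using the unordered-write option whose setter is added to DBOptions above (and to Options further down in this patch), based on the standard RocksDB Java API; the database path is a placeholder. The detailed semantics and trade-offs are described in the interface javadoc in the next hunk.

import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;

public class UnorderedWriteExample {
  static {
    RocksDB.loadLibrary();
  }

  public static void main(final String[] args) throws RocksDBException {
    // Trade snapshot immutability for higher write throughput (see the javadoc below).
    try (final Options options = new Options()
             .setCreateIfMissing(true)
             .setUnorderedWrite(true);
         final RocksDB db = RocksDB.open(options, "/tmp/unordered-write-demo")) {
      db.put("key".getBytes(), "value".getBytes());
    }
  }
}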
@@ -1089,6 +1088,44 @@ T setNewTableReaderForCompactionInputs( */ boolean enablePipelinedWrite(); + /** + * Setting {@link #unorderedWrite()} to true trades the immutability guarantee of + * snapshots for higher write throughput. This violates the + * repeatability one expects from ::Get from a snapshot, as well as + * ::MultiGet and Iterator's consistent-point-in-time view property. + * If the application cannot tolerate the relaxed guarantees, it can implement + * its own mechanisms to work around that and yet benefit from the higher + * throughput. Using TransactionDB with WRITE_PREPARED write policy and + * {@link #twoWriteQueues()} true is one way to achieve immutable snapshots despite + * unordered_write. + * + * By default, i.e., when it is false, rocksdb does not advance the sequence + * number for new snapshots unless all the writes with lower sequence numbers + * are already finished. This provides the immutability that we expect from + * snapshots. Moreover, since Iterator and MultiGet internally depend on + * snapshots, the snapshot immutability results in Iterator and MultiGet + * offering a consistent-point-in-time view. If set to true, although + * Read-Your-Own-Write property is still provided, the snapshot immutability + * property is relaxed: the writes issued after the snapshot is obtained (with + * larger sequence numbers) will still not be visible to reads from that + * snapshot; however, there might still be pending writes (with lower sequence + * numbers) that will change the state visible to the snapshot after they are + * applied to the memtable. + * + * @param unorderedWrite true to enable unordered writes + * + * @return the reference to the current options. + */ + T setUnorderedWrite(final boolean unorderedWrite); + + /** + * Returns true if unordered writes are enabled. + * See {@link #setUnorderedWrite(boolean)}. + * + * @return true if unordered writes are enabled, false otherwise. + */ + boolean unorderedWrite(); + /** * If true, allow multi-writers to update mem tables in parallel. * Only some memtable factorys support concurrent writes; currently it diff --git a/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java index c2efcc54b6b..be3a5d483ba 100644 --- a/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java @@ -5,10 +5,9 @@ package org.rocksdb; -public interface MutableColumnFamilyOptionsInterface - - extends AdvancedMutableColumnFamilyOptionsInterface { - +public interface MutableColumnFamilyOptionsInterface< + T extends MutableColumnFamilyOptionsInterface> + extends AdvancedMutableColumnFamilyOptionsInterface { /** * Amount of data to build up in memory (backed by an unsorted log * on disk) before converting to a sorted on-disk file. @@ -21,7 +20,7 @@ public interface MutableColumnFamilyOptionsInterface * Also, a larger write buffer will result in a longer recovery time * the next time the database is opened. * - * Default: 4MB + * Default: 64MB * @param writeBufferSize the size of write buffer. * @return the instance of the current object.
* @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms diff --git a/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java b/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java index 1715d69d093..50347d38d53 100644 --- a/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java @@ -1,8 +1,7 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. package org.rocksdb; -public interface MutableDBOptionsInterface { - +public interface MutableDBOptionsInterface> { /** * Specifies the maximum number of concurrent background jobs (both flushes * and compactions combined). diff --git a/java/src/main/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java index 5831b1e298e..9fce1eda24e 100644 --- a/java/src/main/java/org/rocksdb/Options.java +++ b/java/src/main/java/org/rocksdb/Options.java @@ -16,7 +16,7 @@ * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()). * * If {@link #dispose()} function is not called, then it will be GC'd - * automaticallyand native resources will be released as part of the process. + * automatically and native resources will be released as part of the process. */ public class Options extends RocksObject implements DBOptionsInterface, @@ -919,6 +919,17 @@ public boolean enablePipelinedWrite() { return enablePipelinedWrite(nativeHandle_); } + @Override + public Options setUnorderedWrite(final boolean unorderedWrite) { + setUnorderedWrite(nativeHandle_, unorderedWrite); + return this; + } + + @Override + public boolean unorderedWrite() { + return unorderedWrite(nativeHandle_); + } + @Override public Options setAllowConcurrentMemtableWrite( final boolean allowConcurrentMemtableWrite) { @@ -1886,6 +1897,9 @@ private native void setEnableThreadTracking(long handle, private native void setEnablePipelinedWrite(final long handle, final boolean pipelinedWrite); private native boolean enablePipelinedWrite(final long handle); + private native void setUnorderedWrite(final long handle, + final boolean unorderedWrite); + private native boolean unorderedWrite(final long handle); private native void setAllowConcurrentMemtableWrite(long handle, boolean allowConcurrentMemtableWrite); private native boolean allowConcurrentMemtableWrite(long handle); diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java index b93a51e28a4..0920886c40f 100644 --- a/java/src/main/java/org/rocksdb/RocksDB.java +++ b/java/src/main/java/org/rocksdb/RocksDB.java @@ -3834,6 +3834,32 @@ public void endTrace() throws RocksDBException { endTrace(nativeHandle_); } + /* + * Delete files in multiple ranges at once + * Delete files in a lot of ranges one at a time can be slow, use this API for + * better performance in that case. + * @param columnFamily - The column family for operation (null for default) + * @param includeEnd - Whether ranges should include end + * @param ranges - pairs of ranges (from1, to1, from2, to2, ...) + * @throws RocksDBException thrown if error happens in underlying + * native library. 
+ */ + public void deleteFilesInRanges(final ColumnFamilyHandle columnFamily, final List<byte[]> ranges, + final boolean includeEnd) throws RocksDBException { + if (ranges.size() == 0) { + return; + } + if ((ranges.size() % 2) != 0) { + throw new IllegalArgumentException("Ranges size needs to be multiple of 2 " + + "(from1, to1, from2, to2, ...), but is " + ranges.size()); + } + + final byte[][] rangesArray = ranges.toArray(new byte[ranges.size()][]); + + deleteFilesInRanges(nativeHandle_, columnFamily == null ? 0 : columnFamily.nativeHandle_, + rangesArray, includeEnd); + }
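A short sketch of how the new RocksDB#deleteFilesInRanges() method defined above might be invoked; the database path and the key boundaries are made-up placeholders, and passing null for the column family targets the default column family:

import java.util.Arrays;
import java.util.List;
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;

public class DeleteFilesInRangesExample {
  public static void main(final String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    try (final Options options = new Options().setCreateIfMissing(true);
         final RocksDB db = RocksDB.open(options, "/tmp/delete-files-in-ranges-example")) {
      // Ranges are flattened pairs: (from1, to1, from2, to2, ...).
      final List<byte[]> ranges = Arrays.asList("key000".getBytes(), "key499".getBytes());
      db.deleteFilesInRanges(null, ranges, true /* includeEnd */);
    }
  }
}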
+ /** + * Static method to destroy the contents of the specified database. + * Be very careful using this method. @@ -4171,7 +4197,8 @@ private native void promoteL0(final long handle, private native void startTrace(final long handle, final long maxTraceFileSize, final long traceWriterHandle) throws RocksDBException; private native void endTrace(final long handle) throws RocksDBException; - + private native void deleteFilesInRanges(long handle, long cfHandle, final byte[][] ranges, + boolean include_end) throws RocksDBException; private native static void destroyDB(final String path, final long optionsHandle) throws RocksDBException; diff --git a/java/src/main/java/org/rocksdb/SstFileReader.java b/java/src/main/java/org/rocksdb/SstFileReader.java new file mode 100644 index 00000000000..53f96e3cc5e --- /dev/null +++ b/java/src/main/java/org/rocksdb/SstFileReader.java @@ -0,0 +1,78 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public class SstFileReader extends RocksObject { + static { + RocksDB.loadLibrary(); + } + + public SstFileReader(final Options options) { + super(newSstFileReader(options.nativeHandle_)); + } + + /** + * Returns an iterator that will iterate over all the key/value pairs in the + * SST file. + * + * Caller is responsible for closing the returned Iterator. + * + * @param readOptions Read options. + * + * @return instance of iterator object. + */ + public SstFileReaderIterator newIterator(final ReadOptions readOptions) { + assert (isOwningHandle()); + long iter = newIterator(nativeHandle_, readOptions.nativeHandle_); + return new SstFileReaderIterator(this, iter); + } + + /** + * Prepare SstFileReader to read a file. + * + * @param filePath the location of the file + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void open(final String filePath) throws RocksDBException { + open(nativeHandle_, filePath); + } + + /** + * Verify the checksum of the file. + * + * @throws RocksDBException if the checksum is not valid + */ + public void verifyChecksum() throws RocksDBException { + verifyChecksum(nativeHandle_); + } + + /** + * Get the properties of the table. + * + * @return the properties + */ + public TableProperties getTableProperties() throws RocksDBException { + return getTableProperties(nativeHandle_); + } + + @Override protected final native void disposeInternal(final long handle); + private native long newIterator(final long handle, final long readOptionsHandle); + + private native void open(final long handle, final String filePath) throws RocksDBException; + + private native static long newSstFileReader(final long optionsHandle); + private native void verifyChecksum(final long handle) throws RocksDBException; + private native TableProperties getTableProperties(final long handle) throws RocksDBException; +}
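The new SstFileReader/SstFileReaderIterator classes added by this patch can be exercised roughly as follows; this is an illustrative sketch, and the .sst path is a placeholder for a file previously produced, for example, by SstFileWriter:

import org.rocksdb.Options;
import org.rocksdb.ReadOptions;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;
import org.rocksdb.SstFileReader;
import org.rocksdb.SstFileReaderIterator;

public class SstFileReaderExample {
  public static void main(final String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    try (final Options options = new Options();
         final SstFileReader reader = new SstFileReader(options)) {
      reader.open("/tmp/example.sst"); // an existing SST file
      reader.verifyChecksum();
      System.out.println("entries: " + reader.getTableProperties().getNumEntries());

      try (final ReadOptions readOptions = new ReadOptions();
           final SstFileReaderIterator it = reader.newIterator(readOptions)) {
        for (it.seekToFirst(); it.isValid(); it.next()) {
          System.out.println(new String(it.key()) + " => " + new String(it.value()));
        }
      }
    }
  }
}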

diff --git a/java/src/main/java/org/rocksdb/SstFileReaderIterator.java b/java/src/main/java/org/rocksdb/SstFileReaderIterator.java new file mode 100644 index 00000000000..d01b7a39031 --- /dev/null +++ b/java/src/main/java/org/rocksdb/SstFileReaderIterator.java @@ -0,0 +1,65 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * <p>An iterator that yields a sequence of key/value pairs from a source. + * Multiple implementations are provided by this library. + * In particular, iterators are provided + * to access the contents of a Table or a DB.</p> + * + * <p>Multiple threads can invoke const methods on an RocksIterator without + * external synchronization, but if any of the threads may call a + * non-const method, all threads accessing the same RocksIterator must use + * external synchronization.</p> + * + * @see RocksObject + */ +public class SstFileReaderIterator extends AbstractRocksIterator<SstFileReader> { + protected SstFileReaderIterator(SstFileReader reader, long nativeHandle) { + super(reader, nativeHandle); + } + + /** + * <p>Return the key for the current entry. The underlying storage for + * the returned slice is valid only until the next modification of + * the iterator.</p> + * + * <p>REQUIRES: {@link #isValid()}</p> + * + * @return key for the current entry. + */ + public byte[] key() { + assert (isOwningHandle()); + return key0(nativeHandle_); + } + + /** + * <p>Return the value for the current entry. The underlying storage for + * the returned slice is valid only until the next modification of + * the iterator.</p> + * + * <p>REQUIRES: !AtEnd() && !AtStart()</p>
+ * @return value for the current entry. + */ + public byte[] value() { + assert (isOwningHandle()); + return value0(nativeHandle_); + } + + @Override protected final native void disposeInternal(final long handle); + @Override final native boolean isValid0(long handle); + @Override final native void seekToFirst0(long handle); + @Override final native void seekToLast0(long handle); + @Override final native void next0(long handle); + @Override final native void prev0(long handle); + @Override final native void seek0(long handle, byte[] target, int targetLen); + @Override final native void seekForPrev0(long handle, byte[] target, int targetLen); + @Override final native void status0(long handle) throws RocksDBException; + + private native byte[] key0(long handle); + private native byte[] value0(long handle); +} diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java index 551e366dc53..40a642bd666 100644 --- a/java/src/main/java/org/rocksdb/TickerType.java +++ b/java/src/main/java/org/rocksdb/TickerType.java @@ -717,6 +717,11 @@ public enum TickerType { */ TXN_SNAPSHOT_MUTEX_OVERHEAD((byte) -0x0C), + /** + * # of times ::Get returned TryAgain due to expired snapshot seq + */ + TXN_GET_TRY_AGAIN((byte) -0x0D), + TICKER_ENUM_MAX((byte) 0x5F); private final byte value; diff --git a/java/src/main/java/org/rocksdb/util/Environment.java b/java/src/main/java/org/rocksdb/util/Environment.java index c019266483f..b5de34b756f 100644 --- a/java/src/main/java/org/rocksdb/util/Environment.java +++ b/java/src/main/java/org/rocksdb/util/Environment.java @@ -1,9 +1,26 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. package org.rocksdb.util; +import java.io.File; +import java.io.IOException; + public class Environment { private static String OS = System.getProperty("os.name").toLowerCase(); private static String ARCH = System.getProperty("os.arch").toLowerCase(); + private static boolean MUSL_LIBC; + + static { + try { + final Process p = new ProcessBuilder("/usr/bin/env", "sh", "-c", "ldd /usr/bin/env | grep -q musl").start(); + MUSL_LIBC = p.waitFor() == 0; + } catch (final IOException | InterruptedException e) { + MUSL_LIBC = false; + } + } + + public static boolean isAarch64() { + return ARCH.contains("aarch64"); + } public static boolean isPowerPC() { return ARCH.contains("ppc"); @@ -34,6 +51,10 @@ public static boolean isUnix() { OS.contains("nux"); } + public static boolean isMuslLibc() { + return MUSL_LIBC; + } + public static boolean isSolaris() { return OS.contains("sunos"); } @@ -57,15 +78,37 @@ public static String getSharedLibraryFileName(final String name) { return appendLibOsSuffix("lib" + getSharedLibraryName(name), true); } + /** + * Get the name of the libc implementation + * + * @return the name of the implementation, + * or null if the default for that platform (e.g. glibc on Linux). + */ + public static /* @Nullable */ String getLibcName() { + if (isMuslLibc()) { + return "musl"; + } else { + return null; + } + } + + private static String getLibcPostfix() { + final String libcName = getLibcName(); + if (libcName == null) { + return ""; + } + return "-" + libcName; + } + public static String getJniLibraryName(final String name) { if (isUnix()) { final String arch = is64Bit() ? 
"64" : "32"; - if(isPowerPC()) { - return String.format("%sjni-linux-%s", name, ARCH); - } else if(isS390x()) { + if (isPowerPC() || isAarch64()) { + return String.format("%sjni-linux-%s%s", name, ARCH, getLibcPostfix()); + } else if (isS390x()) { return String.format("%sjni-linux%s", name, ARCH); } else { - return String.format("%sjni-linux%s", name, arch); + return String.format("%sjni-linux%s%s", name, arch, getLibcPostfix()); } } else if (isMac()) { return String.format("%sjni-osx", name); diff --git a/java/src/test/java/org/rocksdb/DBOptionsTest.java b/java/src/test/java/org/rocksdb/DBOptionsTest.java index e6ebc46cd24..1731b6c270d 100644 --- a/java/src/test/java/org/rocksdb/DBOptionsTest.java +++ b/java/src/test/java/org/rocksdb/DBOptionsTest.java @@ -543,6 +543,15 @@ public void enablePipelinedWrite() { } } + @Test + public void unordredWrite() { + try(final DBOptions opt = new DBOptions()) { + assertThat(opt.unorderedWrite()).isFalse(); + opt.setUnorderedWrite(true); + assertThat(opt.unorderedWrite()).isTrue(); + } + } + @Test public void allowConcurrentMemtableWrite() { try (final DBOptions opt = new DBOptions()) { diff --git a/java/src/test/java/org/rocksdb/OptionsTest.java b/java/src/test/java/org/rocksdb/OptionsTest.java index e27a33d7df0..04d362fb1d2 100644 --- a/java/src/test/java/org/rocksdb/OptionsTest.java +++ b/java/src/test/java/org/rocksdb/OptionsTest.java @@ -762,6 +762,15 @@ public void enablePipelinedWrite() { } } + @Test + public void unordredWrite() { + try(final Options opt = new Options()) { + assertThat(opt.unorderedWrite()).isFalse(); + opt.setUnorderedWrite(true); + assertThat(opt.unorderedWrite()).isTrue(); + } + } + @Test public void allowConcurrentMemtableWrite() { try (final Options opt = new Options()) { diff --git a/java/src/test/java/org/rocksdb/RocksDBTest.java b/java/src/test/java/org/rocksdb/RocksDBTest.java index a7d7fee14f2..8af4dcaaa46 100644 --- a/java/src/test/java/org/rocksdb/RocksDBTest.java +++ b/java/src/test/java/org/rocksdb/RocksDBTest.java @@ -869,6 +869,62 @@ public void compactRangeToLevel() } } + @Test + public void deleteFilesInRange() throws RocksDBException, InterruptedException { + final int KEY_SIZE = 20; + final int VALUE_SIZE = 1000; + final int FILE_SIZE = 64000; + final int NUM_FILES = 10; + + final int KEY_INTERVAL = 10000; + /* + * Intention of these options is to end up reliably with 10 files + * we will be deleting using deleteFilesInRange. + * It is writing roughly number of keys that will fit in 10 files (target size) + * It is writing interleaved so that files from memory on L0 will overlap + * Then compaction cleans everything and we should end up with 10 files + */ + try (final Options opt = new Options() + .setCreateIfMissing(true) + .setCompressionType(CompressionType.NO_COMPRESSION) + .setTargetFileSizeBase(FILE_SIZE) + .setWriteBufferSize(FILE_SIZE / 2) + .setDisableAutoCompactions(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + int records = FILE_SIZE / (KEY_SIZE + VALUE_SIZE); + + // fill database with key/value pairs + byte[] value = new byte[VALUE_SIZE]; + int key_init = 0; + for (int o = 0; o < NUM_FILES; ++o) { + int int_key = key_init++; + for (int i = 0; i < records; ++i) { + int_key += KEY_INTERVAL; + rand.nextBytes(value); + + db.put(String.format("%020d", int_key).getBytes(), value); + } + } + db.flush(new FlushOptions().setWaitForFlush(true)); + db.compactRange(); + // Make sure we do create one more L0 files. 
+ assertThat(db.getProperty("rocksdb.num-files-at-level0")).isEqualTo("0"); + + // Should be 10, but we are OK with asserting +- 2 + int files = Integer.parseInt(db.getProperty("rocksdb.num-files-at-level1")); + assertThat(files).isBetween(8, 12); + + // Delete lower 60% (roughly). Result should be 5, but we are OK with asserting +- 2 + // Important is that we know something was deleted (JNI call did something) + // Exact assertions are done in C++ unit tests + db.deleteFilesInRanges(null, + Arrays.asList(null, String.format("%020d", records * KEY_INTERVAL * 6 / 10).getBytes()), + false); + files = Integer.parseInt(db.getProperty("rocksdb.num-files-at-level1")); + assertThat(files).isBetween(3, 7); + } + } + @Test public void compactRangeToLevelColumnFamily() throws RocksDBException { diff --git a/java/src/test/java/org/rocksdb/SstFileReaderTest.java b/java/src/test/java/org/rocksdb/SstFileReaderTest.java new file mode 100644 index 00000000000..c0e3a73d88b --- /dev/null +++ b/java/src/test/java/org/rocksdb/SstFileReaderTest.java @@ -0,0 +1,133 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.util.BytewiseComparator; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +public class SstFileReaderTest { + private static final String SST_FILE_NAME = "test.sst"; + + class KeyValueWithOp { + KeyValueWithOp(String key, String value, OpType opType) { + this.key = key; + this.value = value; + this.opType = opType; + } + + String getKey() { + return key; + } + + String getValue() { + return value; + } + + OpType getOpType() { + return opType; + } + + private String key; + private String value; + private OpType opType; + } + + @Rule public TemporaryFolder parentFolder = new TemporaryFolder(); + + enum OpType { PUT, PUT_BYTES, MERGE, MERGE_BYTES, DELETE, DELETE_BYTES } + + private File newSstFile(final List keyValues) + throws IOException, RocksDBException { + final EnvOptions envOptions = new EnvOptions(); + final StringAppendOperator stringAppendOperator = new StringAppendOperator(); + final Options options = new Options().setMergeOperator(stringAppendOperator); + SstFileWriter sstFileWriter; + sstFileWriter = new SstFileWriter(envOptions, options); + + final File sstFile = parentFolder.newFile(SST_FILE_NAME); + try { + sstFileWriter.open(sstFile.getAbsolutePath()); + for (KeyValueWithOp keyValue : keyValues) { + Slice keySlice = new Slice(keyValue.getKey()); + Slice valueSlice = new Slice(keyValue.getValue()); + byte[] keyBytes = keyValue.getKey().getBytes(); + byte[] valueBytes = keyValue.getValue().getBytes(); + switch (keyValue.getOpType()) { + case PUT: + sstFileWriter.put(keySlice, valueSlice); + break; + case PUT_BYTES: + sstFileWriter.put(keyBytes, valueBytes); + break; + case MERGE: + sstFileWriter.merge(keySlice, valueSlice); + break; + case MERGE_BYTES: + sstFileWriter.merge(keyBytes, valueBytes); + break; + case DELETE: + sstFileWriter.delete(keySlice); + break; + case DELETE_BYTES: + sstFileWriter.delete(keyBytes); + break; + default: + 
fail("Unsupported op type"); + } + keySlice.close(); + valueSlice.close(); + } + sstFileWriter.finish(); + } finally { + assertThat(sstFileWriter).isNotNull(); + sstFileWriter.close(); + options.close(); + envOptions.close(); + } + return sstFile; + } + + @Test + public void readSstFile() throws RocksDBException, IOException { + final List keyValues = new ArrayList<>(); + keyValues.add(new KeyValueWithOp("key1", "value1", OpType.PUT)); + + final File sstFile = newSstFile(keyValues); + try (final StringAppendOperator stringAppendOperator = new StringAppendOperator(); + final Options options = + new Options().setCreateIfMissing(true).setMergeOperator(stringAppendOperator); + final SstFileReader reader = new SstFileReader(options)) { + // Open the sst file and iterator + reader.open(sstFile.getAbsolutePath()); + final ReadOptions readOptions = new ReadOptions(); + final SstFileReaderIterator iterator = reader.newIterator(readOptions); + + // Use the iterator to read sst file + iterator.seekToFirst(); + + // Verify Checksum + reader.verifyChecksum(); + + // Verify Table Properties + assertEquals(reader.getTableProperties().getNumEntries(), 1); + + // Check key and value + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + assertThat(iterator.value()).isEqualTo("value1".getBytes()); + } + } +} diff --git a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java index 28ee04768e9..ab0ff2027a0 100644 --- a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java +++ b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java @@ -16,14 +16,17 @@ public class EnvironmentTest { private final static String ARCH_FIELD_NAME = "ARCH"; private final static String OS_FIELD_NAME = "OS"; + private final static String MUSL_LIBC_FIELD_NAME = "MUSL_LIBC"; private static String INITIAL_OS; private static String INITIAL_ARCH; + private static boolean INITIAL_MUSL_LIBC; @BeforeClass public static void saveState() { INITIAL_ARCH = getEnvironmentClassField(ARCH_FIELD_NAME); INITIAL_OS = getEnvironmentClassField(OS_FIELD_NAME); + INITIAL_MUSL_LIBC = getEnvironmentClassField(MUSL_LIBC_FIELD_NAME); } @Test @@ -53,6 +56,7 @@ public void mac64() { @Test public void nix32() { // Linux + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); setEnvironmentClassFields("Linux", "32"); assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). @@ -61,7 +65,17 @@ public void nix32() { isEqualTo("librocksdbjni-linux32.so"); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); + // Linux musl-libc (Alpine) + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true); + assertThat(Environment.isWindows()).isFalse(); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getJniLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni-linux32-musl.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni.so"); // UNIX + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); setEnvironmentClassFields("Unix", "32"); assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). @@ -84,6 +98,7 @@ public void aix32() { @Test public void nix64() { + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); setEnvironmentClassFields("Linux", "x64"); assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). 
@@ -92,7 +107,17 @@ public void nix64() { isEqualTo("librocksdbjni-linux64.so"); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); + // Linux musl-libc (Alpine) + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true); + assertThat(Environment.isWindows()).isFalse(); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getJniLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni-linux64-musl.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni.so"); // UNIX + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); setEnvironmentClassFields("Unix", "x64"); assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). @@ -130,6 +155,62 @@ public void win64() { isEqualTo("librocksdbjni.dll"); } + @Test + public void ppc64le() { + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); + setEnvironmentClassFields("Linux", "ppc64le"); + assertThat(Environment.isUnix()).isTrue(); + assertThat(Environment.isPowerPC()).isTrue(); + assertThat(Environment.is64Bit()).isTrue(); + assertThat(Environment.getJniLibraryExtension()).isEqualTo(".so"); + assertThat(Environment.getSharedLibraryName("rocksdb")).isEqualTo("rocksdbjni"); + assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-ppc64le"); + assertThat(Environment.getJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-linux-ppc64le.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); + // Linux musl-libc (Alpine) + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true); + setEnvironmentClassFields("Linux", "ppc64le"); + assertThat(Environment.isUnix()).isTrue(); + assertThat(Environment.isPowerPC()).isTrue(); + assertThat(Environment.is64Bit()).isTrue(); + assertThat(Environment.getJniLibraryExtension()).isEqualTo(".so"); + assertThat(Environment.getSharedLibraryName("rocksdb")).isEqualTo("rocksdbjni"); + assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-ppc64le-musl"); + assertThat(Environment.getJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-linux-ppc64le-musl.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); + } + + @Test + public void aarch64() { + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); + setEnvironmentClassFields("Linux", "aarch64"); + assertThat(Environment.isUnix()).isTrue(); + assertThat(Environment.isAarch64()).isTrue(); + assertThat(Environment.is64Bit()).isTrue(); + assertThat(Environment.getJniLibraryExtension()).isEqualTo(".so"); + assertThat(Environment.getSharedLibraryName("rocksdb")).isEqualTo("rocksdbjni"); + assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-aarch64"); + assertThat(Environment.getJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-linux-aarch64.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); + // Linux musl-libc (Alpine) + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true); + setEnvironmentClassFields("Linux", "aarch64"); + assertThat(Environment.isUnix()).isTrue(); + assertThat(Environment.isAarch64()).isTrue(); + assertThat(Environment.is64Bit()).isTrue(); + assertThat(Environment.getJniLibraryExtension()).isEqualTo(".so"); + assertThat(Environment.getSharedLibraryName("rocksdb")).isEqualTo("rocksdbjni"); + 
assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-aarch64-musl"); + assertThat(Environment.getJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-linux-aarch64-musl.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); + } + private void setEnvironmentClassFields(String osName, String osArch) { setEnvironmentClassField(OS_FIELD_NAME, osName); @@ -140,9 +221,10 @@ private void setEnvironmentClassFields(String osName, public static void restoreState() { setEnvironmentClassField(OS_FIELD_NAME, INITIAL_OS); setEnvironmentClassField(ARCH_FIELD_NAME, INITIAL_ARCH); + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, INITIAL_MUSL_LIBC); } - private static String getEnvironmentClassField(String fieldName) { + private static T getEnvironmentClassField(String fieldName) { final Field field; try { field = Environment.class.getDeclaredField(fieldName); @@ -150,13 +232,13 @@ private static String getEnvironmentClassField(String fieldName) { final Field modifiersField = Field.class.getDeclaredField("modifiers"); modifiersField.setAccessible(true); modifiersField.setInt(field, field.getModifiers() & ~Modifier.FINAL); - return (String)field.get(null); + return (T)field.get(null); } catch (NoSuchFieldException | IllegalAccessException e) { throw new RuntimeException(e); } } - private static void setEnvironmentClassField(String fieldName, String value) { + private static void setEnvironmentClassField(String fieldName, Object value) { final Field field; try { field = Environment.class.getDeclaredField(fieldName); diff --git a/util/auto_roll_logger.cc b/logging/auto_roll_logger.cc similarity index 59% rename from util/auto_roll_logger.cc rename to logging/auto_roll_logger.cc index ae6061aed43..73a02a89957 100644 --- a/util/auto_roll_logger.cc +++ b/logging/auto_roll_logger.cc @@ -3,13 +3,54 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
// -#include "util/auto_roll_logger.h" +#include "logging/auto_roll_logger.h" + +#include +#include "file/filename.h" +#include "logging/logging.h" #include "util/mutexlock.h" namespace rocksdb { #ifndef ROCKSDB_LITE // -- AutoRollLogger + +AutoRollLogger::AutoRollLogger(Env* env, const std::string& dbname, + const std::string& db_log_dir, + size_t log_max_size, + size_t log_file_time_to_roll, + size_t keep_log_file_num, + const InfoLogLevel log_level) + : Logger(log_level), + dbname_(dbname), + db_log_dir_(db_log_dir), + env_(env), + status_(Status::OK()), + kMaxLogFileSize(log_max_size), + kLogFileTimeToRoll(log_file_time_to_roll), + kKeepLogFileNum(keep_log_file_num), + cached_now(static_cast(env_->NowMicros() * 1e-6)), + ctime_(cached_now), + cached_now_access_count(0), + call_NowMicros_every_N_records_(100), + mutex_() { + Status s = env->GetAbsolutePath(dbname, &db_absolute_path_); + if (s.IsNotSupported()) { + db_absolute_path_ = dbname; + } else { + status_ = s; + } + log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); + if (env_->FileExists(log_fname_).ok()) { + RollLogFile(); + } + GetExistingFiles(); + ResetLogger(); + if (status_.ok()) { + status_ = TrimOldLogFiles(); + } +} + Status AutoRollLogger::ResetLogger() { TEST_SYNC_POINT("AutoRollLogger::ResetLogger:BeforeNewLogger"); status_ = env_->NewLogger(log_fname_, &logger_); @@ -44,6 +85,58 @@ void AutoRollLogger::RollLogFile() { now++; } while (env_->FileExists(old_fname).ok()); env_->RenameFile(log_fname_, old_fname); + old_log_files_.push(old_fname); +} + +void AutoRollLogger::GetExistingFiles() { + { + // Empty the queue to avoid duplicated entries in the queue. + std::queue empty; + std::swap(old_log_files_, empty); + } + + std::string parent_dir; + std::vector info_log_files; + Status s = + GetInfoLogFiles(env_, db_log_dir_, dbname_, &parent_dir, &info_log_files); + if (status_.ok()) { + status_ = s; + } + // We need to sort the file before enqueing it so that when we + // delete file from the front, it is the oldest file. + std::sort(info_log_files.begin(), info_log_files.end()); + + for (const std::string& f : info_log_files) { + old_log_files_.push(parent_dir + "/" + f); + } +} + +Status AutoRollLogger::TrimOldLogFiles() { + // Here we directly list info files and delete them through Env. + // The deletion isn't going through DB, so there are shortcomes: + // 1. the deletion is not rate limited by SstFileManager + // 2. there is a chance that an I/O will be issued here + // Since it's going to be complicated to pass DB object down to + // here, we take a simple approach to keep the code easier to + // maintain. + + // old_log_files_.empty() is helpful for the corner case that + // kKeepLogFileNum == 0. We can instead check kKeepLogFileNum != 0 but + // it's essentially the same thing, and checking empty before accessing + // the queue feels safer. + while (!old_log_files_.empty() && old_log_files_.size() >= kKeepLogFileNum) { + Status s = env_->DeleteFile(old_log_files_.front()); + // Remove the file from the tracking anyway. It's possible that + // DB cleaned up the old log file, or people cleaned it up manually. + old_log_files_.pop(); + // To make the file really go away, we should sync parent directory. + // Since there isn't any consistency issue involved here, skipping + // this part to avoid one I/O here. 
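The AutoRollLogger changes in this file make keep_log_file_num an enforced bound on the number of rolled info log files. As a hedged sketch of the user-facing effect, the option can be driven from RocksJava via the pre-existing setters shown below; the size and count values and the database path are arbitrary illustrations:

import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;

public class InfoLogRetentionExample {
  public static void main(final String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    try (final Options options = new Options()
             .setCreateIfMissing(true)
             .setMaxLogFileSize(10 * 1024 * 1024) // roll the info LOG at ~10 MB
             .setKeepLogFileNum(5);               // keep at most 5 rolled LOG files
         final RocksDB db = RocksDB.open(options, "/tmp/info-log-retention-example")) {
      // Rolled LOG files beyond the configured limit are trimmed by AutoRollLogger.
    }
  }
}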
+ if (!s.ok()) { + return s; + } + } + return Status::OK(); } std::string AutoRollLogger::ValistToString(const char* format, @@ -62,6 +155,11 @@ std::string AutoRollLogger::ValistToString(const char* format, void AutoRollLogger::LogInternal(const char* format, ...) { mutex_.AssertHeld(); + + if (!logger_) { + return; + } + va_list args; va_start(args, format); logger_->Logv(format, args); @@ -70,6 +168,9 @@ void AutoRollLogger::LogInternal(const char* format, ...) { void AutoRollLogger::Logv(const char* format, va_list ap) { assert(GetStatus().ok()); + if (!logger_) { + return; + } std::shared_ptr logger; { @@ -78,12 +179,19 @@ void AutoRollLogger::Logv(const char* format, va_list ap) { (kMaxLogFileSize > 0 && logger_->GetLogFileSize() >= kMaxLogFileSize)) { RollLogFile(); Status s = ResetLogger(); + Status s2 = TrimOldLogFiles(); + if (!s.ok()) { // can't really log the error if creating a new LOG file failed return; } WriteHeaderInfo(); + + if (!s2.ok()) { + ROCKS_LOG_WARN(logger.get(), "Fail to trim old info log file: %s", + s2.ToString().c_str()); + } } // pin down the current logger_ instance before releasing the mutex. @@ -107,6 +215,10 @@ void AutoRollLogger::WriteHeaderInfo() { } void AutoRollLogger::LogHeader(const char* format, va_list args) { + if (!logger_) { + return; + } + // header message are to be retained in memory. Since we cannot make any // assumptions about the data contained in va_list, we will retain them as // strings @@ -153,7 +265,8 @@ Status CreateLoggerFromOptions(const std::string& dbname, if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) { AutoRollLogger* result = new AutoRollLogger( env, dbname, options.db_log_dir, options.max_log_file_size, - options.log_file_time_to_roll, options.info_log_level); + options.log_file_time_to_roll, options.keep_log_file_num, + options.info_log_level); Status s = result->GetStatus(); if (!s.ok()) { delete result; diff --git a/util/auto_roll_logger.h b/logging/auto_roll_logger.h similarity index 83% rename from util/auto_roll_logger.h rename to logging/auto_roll_logger.h index 64fce4d63e7..45cbc2697a1 100644 --- a/util/auto_roll_logger.h +++ b/logging/auto_roll_logger.h @@ -8,13 +8,14 @@ #pragma once #include +#include #include +#include "file/filename.h" #include "port/port.h" #include "port/util_logger.h" -#include "util/filename.h" +#include "test_util/sync_point.h" #include "util/mutexlock.h" -#include "util/sync_point.h" namespace rocksdb { @@ -24,25 +25,8 @@ class AutoRollLogger : public Logger { public: AutoRollLogger(Env* env, const std::string& dbname, const std::string& db_log_dir, size_t log_max_size, - size_t log_file_time_to_roll, - const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) - : Logger(log_level), - dbname_(dbname), - db_log_dir_(db_log_dir), - env_(env), - status_(Status::OK()), - kMaxLogFileSize(log_max_size), - kLogFileTimeToRoll(log_file_time_to_roll), - cached_now(static_cast(env_->NowMicros() * 1e-6)), - ctime_(cached_now), - cached_now_access_count(0), - call_NowMicros_every_N_records_(100), - mutex_() { - env->GetAbsolutePath(dbname, &db_absolute_path_); - log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); - RollLogFile(); - ResetLogger(); - } + size_t log_file_time_to_roll, size_t keep_log_file_num, + const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL); using Logger::Logv; void Logv(const char* format, va_list ap) override; @@ -57,6 +41,10 @@ class AutoRollLogger : public Logger { } size_t GetLogFileSize() const override { + if (!logger_) { + 
return 0; + } + std::shared_ptr logger; { MutexLock l(&mutex_); @@ -110,6 +98,11 @@ class AutoRollLogger : public Logger { bool LogExpired(); Status ResetLogger(); void RollLogFile(); + // Read all names of old log files into old_log_files_ + // If there is any error, put the error code in status_ + void GetExistingFiles(); + // Delete old log files if it excceeds the limit. + Status TrimOldLogFiles(); // Log message to logger without rolling void LogInternal(const char* format, ...); // Serialize the va_list to a string @@ -126,8 +119,14 @@ class AutoRollLogger : public Logger { Status status_; const size_t kMaxLogFileSize; const size_t kLogFileTimeToRoll; + const size_t kKeepLogFileNum; // header information std::list headers_; + // List of all existing info log files. Used for enforcing number of + // info log files. + // Full path is stored here. It consumes signifianctly more memory + // than only storing file name. Can optimize if it causes a problem. + std::queue old_log_files_; // to avoid frequent env->NowMicros() calls, we cached the current time uint64_t cached_now; uint64_t ctime_; diff --git a/util/auto_roll_logger_test.cc b/logging/auto_roll_logger_test.cc similarity index 75% rename from util/auto_roll_logger_test.cc rename to logging/auto_roll_logger_test.cc index ab9e0595808..dd279d62a25 100644 --- a/util/auto_roll_logger_test.cc +++ b/logging/auto_roll_logger_test.cc @@ -6,7 +6,7 @@ #ifndef ROCKSDB_LITE -#include "util/auto_roll_logger.h" +#include "logging/auto_roll_logger.h" #include #include #include @@ -17,11 +17,12 @@ #include #include #include +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/db.h" -#include "util/logging.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { namespace { @@ -41,6 +42,21 @@ class NoSleepEnv : public EnvWrapper { }; } // namespace +// In this test we only want to Log some simple log message with +// no format. LogMessage() provides such a simple interface and +// avoids the [format-security] warning which occurs when you +// call ROCKS_LOG_INFO(logger, log_message) directly. +namespace { +void LogMessage(Logger* logger, const char* message) { + ROCKS_LOG_INFO(logger, "%s", message); +} + +void LogMessage(const InfoLogLevel log_level, Logger* logger, + const char* message) { + Log(log_level, logger, "%s", message); +} +} // namespace + class AutoRollLoggerTest : public testing::Test { public: static void InitTestDb() { @@ -62,6 +78,41 @@ class AutoRollLoggerTest : public testing::Test { const std::string& log_message); void RollLogFileByTimeTest(Env*, AutoRollLogger* logger, size_t time, const std::string& log_message); + // return list of files under kTestDir that contains "LOG" + std::vector GetLogFiles() { + std::vector ret; + std::vector files; + Status s = default_env->GetChildren(kTestDir, &files); + // Should call ASSERT_OK() here but it doesn't compile. It's not + // worth the time figuring out why. + EXPECT_TRUE(s.ok()); + for (const auto& f : files) { + if (f.find("LOG") != std::string::npos) { + ret.push_back(f); + } + } + return ret; + } + + // Delete all log files under kTestDir + void CleanupLogFiles() { + for (const std::string& f : GetLogFiles()) { + ASSERT_OK(default_env->DeleteFile(kTestDir + "/" + f)); + } + } + + void RollNTimesBySize(Logger* auto_roll_logger, size_t file_num, + size_t max_log_file_size) { + // Roll the log 4 times, and it will trim to 3 files. 
+ std::string dummy_large_string; + dummy_large_string.assign(max_log_file_size, '='); + auto_roll_logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + for (size_t i = 0; i < file_num + 1; i++) { + // Log enough bytes to trigger at least one roll. + LogMessage(auto_roll_logger, dummy_large_string.c_str()); + LogMessage(auto_roll_logger, ""); + } + } static const std::string kSampleMessage; static const std::string kTestDir; @@ -77,21 +128,6 @@ const std::string AutoRollLoggerTest::kLogFile( test::PerThreadDBPath("db_log_test") + "/LOG"); Env* AutoRollLoggerTest::default_env = Env::Default(); -// In this test we only want to Log some simple log message with -// no format. LogMessage() provides such a simple interface and -// avoids the [format-security] warning which occurs when you -// call ROCKS_LOG_INFO(logger, log_message) directly. -namespace { -void LogMessage(Logger* logger, const char* message) { - ROCKS_LOG_INFO(logger, "%s", message); -} - -void LogMessage(const InfoLogLevel log_level, Logger* logger, - const char* message) { - Log(log_level, logger, "%s", message); -} -} // namespace - void AutoRollLoggerTest::RollLogFileBySizeTest(AutoRollLogger* logger, size_t log_max_size, const std::string& log_message) { @@ -159,8 +195,10 @@ void AutoRollLoggerTest::RollLogFileByTimeTest(Env* env, AutoRollLogger* logger, TEST_F(AutoRollLoggerTest, RollLogFileBySize) { InitTestDb(); size_t log_max_size = 1024 * 5; + size_t keep_log_file_num = 10; - AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0); + AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0, + keep_log_file_num); RollLogFileBySizeTest(&logger, log_max_size, kSampleMessage + ":RollLogFileBySize"); @@ -171,11 +209,12 @@ TEST_F(AutoRollLoggerTest, RollLogFileByTime) { size_t time = 2; size_t log_size = 1024 * 5; + size_t keep_log_file_num = 10; InitTestDb(); // -- Test the existence of file during the server restart. ASSERT_EQ(Status::NotFound(), default_env->FileExists(kLogFile)); - AutoRollLogger logger(&nse, kTestDir, "", log_size, time); + AutoRollLogger logger(&nse, kTestDir, "", log_size, time, keep_log_file_num); ASSERT_OK(default_env->FileExists(kLogFile)); RollLogFileByTimeTest(&nse, &logger, time, @@ -192,28 +231,30 @@ TEST_F(AutoRollLoggerTest, OpenLogFilesMultipleTimesWithOptionLog_max_size) { // treated as "singed". size_t kZero = 0; size_t log_size = 1024; + size_t keep_log_file_num = 10; - AutoRollLogger* logger = new AutoRollLogger( - Env::Default(), kTestDir, "", log_size, 0); + AutoRollLogger* logger = new AutoRollLogger(Env::Default(), kTestDir, "", + log_size, 0, keep_log_file_num); LogMessage(logger, kSampleMessage.c_str()); ASSERT_GT(logger->GetLogFileSize(), kZero); delete logger; // reopens the log file and an empty log file will be created. 
- logger = new AutoRollLogger( - Env::Default(), kTestDir, "", log_size, 0); + logger = new AutoRollLogger(Env::Default(), kTestDir, "", log_size, 0, 10); ASSERT_EQ(logger->GetLogFileSize(), kZero); delete logger; } TEST_F(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) { size_t time = 2, log_max_size = 1024 * 5; + size_t keep_log_file_num = 10; InitTestDb(); NoSleepEnv nse(Env::Default()); - AutoRollLogger logger(&nse, kTestDir, "", log_max_size, time); + AutoRollLogger logger(&nse, kTestDir, "", log_max_size, time, + keep_log_file_num); // Test the ability to roll by size RollLogFileBySizeTest(&logger, log_max_size, @@ -269,6 +310,107 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { kSampleMessage + ":CreateLoggerFromOptions - both"); RollLogFileByTimeTest(&nse, auto_roll_logger, options.log_file_time_to_roll, kSampleMessage + ":CreateLoggerFromOptions - both"); + + // Set keep_log_file_num + { + const size_t kFileNum = 3; + InitTestDb(); + options.max_log_file_size = 512; + options.log_file_time_to_roll = 2; + options.keep_log_file_num = kFileNum; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + auto_roll_logger = dynamic_cast(logger.get()); + + // Roll the log 4 times, and it will trim to 3 files. + std::string dummy_large_string; + dummy_large_string.assign(options.max_log_file_size, '='); + auto_roll_logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + for (size_t i = 0; i < kFileNum + 1; i++) { + // Log enough bytes to trigger at least one roll. + LogMessage(auto_roll_logger, dummy_large_string.c_str()); + LogMessage(auto_roll_logger, ""); + } + + std::vector files = GetLogFiles(); + ASSERT_EQ(kFileNum, files.size()); + + CleanupLogFiles(); + } + + // Set keep_log_file_num and dbname is different from + // db_log_dir. + { + const size_t kFileNum = 3; + InitTestDb(); + options.max_log_file_size = 512; + options.log_file_time_to_roll = 2; + options.keep_log_file_num = kFileNum; + options.db_log_dir = kTestDir; + ASSERT_OK(CreateLoggerFromOptions("/dummy/db/name", options, &logger)); + auto_roll_logger = dynamic_cast(logger.get()); + + // Roll the log 4 times, and it will trim to 3 files. + std::string dummy_large_string; + dummy_large_string.assign(options.max_log_file_size, '='); + auto_roll_logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + for (size_t i = 0; i < kFileNum + 1; i++) { + // Log enough bytes to trigger at least one roll. + LogMessage(auto_roll_logger, dummy_large_string.c_str()); + LogMessage(auto_roll_logger, ""); + } + + std::vector files = GetLogFiles(); + ASSERT_EQ(kFileNum, files.size()); + for (const auto& f : files) { + ASSERT_TRUE(f.find("dummy") != std::string::npos); + } + + // Cleaning up those files. + CleanupLogFiles(); + } +} + +TEST_F(AutoRollLoggerTest, AutoDeleting) { + for (int attempt = 0; attempt < 2; attempt++) { + // In the first attemp, db_log_dir is not set, while in the + // second it is set. + std::string dbname = (attempt == 0) ? kTestDir : "/test/dummy/dir"; + std::string db_log_dir = (attempt == 0) ? 
"" : kTestDir; + + InitTestDb(); + const size_t kMaxFileSize = 512; + { + size_t log_num = 8; + AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, + log_num); + RollNTimesBySize(&logger, log_num, kMaxFileSize); + + ASSERT_EQ(log_num, GetLogFiles().size()); + } + // Shrink number of files + { + size_t log_num = 5; + AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, + log_num); + ASSERT_EQ(log_num, GetLogFiles().size()); + + RollNTimesBySize(&logger, 3, kMaxFileSize); + ASSERT_EQ(log_num, GetLogFiles().size()); + } + + // Increase number of files again. + { + size_t log_num = 7; + AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, + log_num); + ASSERT_EQ(6, GetLogFiles().size()); + + RollNTimesBySize(&logger, 3, kMaxFileSize); + ASSERT_EQ(log_num, GetLogFiles().size()); + } + + CleanupLogFiles(); + } } TEST_F(AutoRollLoggerTest, LogFlushWhileRolling) { @@ -303,7 +445,7 @@ TEST_F(AutoRollLoggerTest, LogFlushWhileRolling) { {"AutoRollLogger::Flush:PinnedLogger", "PosixLogger::Flush:Begin2"}}); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - flush_thread = port::Thread ([&]() { auto_roll_logger->Flush(); }); + flush_thread = port::Thread([&]() { auto_roll_logger->Flush(); }); TEST_SYNC_POINT( "AutoRollLoggerTest::LogFlushWhileRolling:PreRollAndPostThreadInit"); RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size, @@ -322,7 +464,7 @@ TEST_F(AutoRollLoggerTest, InfoLogLevel) { // an extra-scope to force the AutoRollLogger to flush the log file when it // becomes out of scope. { - AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0); + AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0, 10); for (int log_level = InfoLogLevel::HEADER_LEVEL; log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { logger.SetInfoLogLevel((InfoLogLevel)log_level); @@ -360,7 +502,7 @@ TEST_F(AutoRollLoggerTest, Close) { size_t log_size = 8192; size_t log_lines = 0; - AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0); + AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0, 10); for (int log_level = InfoLogLevel::HEADER_LEVEL; log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { logger.SetInfoLogLevel((InfoLogLevel)log_level); @@ -416,25 +558,6 @@ static std::vector GetOldFileNames(const std::string& path) { return ret; } -// Return the number of lines where a given pattern was found in the file -static size_t GetLinesCount(const std::string& fname, - const std::string& pattern) { - std::stringstream ssbuf; - std::string line; - size_t count = 0; - - std::ifstream inFile(fname.c_str()); - ssbuf << inFile.rdbuf(); - - while (getline(ssbuf, line)) { - if (line.find(pattern) != std::string::npos) { - count++; - } - } - - return count; -} - TEST_F(AutoRollLoggerTest, LogHeaderTest) { static const size_t MAX_HEADERS = 10; static const size_t LOG_MAX_SIZE = 1024 * 5; @@ -446,8 +569,9 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) { InitTestDb(); - AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/ "", - LOG_MAX_SIZE, /*log_file_time_to_roll=*/ 0); + AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/"", + LOG_MAX_SIZE, /*log_file_time_to_roll=*/0, + /*keep_log_file_num=*/10); if (test_num == 0) { // Log some headers explicitly using Header() @@ -485,7 +609,7 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) { // verify that the files rolled over ASSERT_NE(oldfname, newfname); // verify that the old log contains all the header logs - 
ASSERT_EQ(GetLinesCount(oldfname, HEADER_STR), MAX_HEADERS); + ASSERT_EQ(test::GetLinesCount(oldfname, HEADER_STR), MAX_HEADERS); } } } @@ -511,6 +635,15 @@ TEST_F(AutoRollLoggerTest, LogFileExistence) { delete db; } +TEST_F(AutoRollLoggerTest, FileCreateFailure) { + Options options; + options.max_log_file_size = 100 * 1024 * 1024; + options.db_log_dir = "/a/dir/does/not/exist/at/all"; + + std::shared_ptr logger; + ASSERT_NOK(CreateLoggerFromOptions("", options, &logger)); + ASSERT_TRUE(!logger); +} } // namespace rocksdb int main(int argc, char** argv) { diff --git a/logging/env_logger.h b/logging/env_logger.h new file mode 100644 index 00000000000..7e8212dd2ec --- /dev/null +++ b/logging/env_logger.h @@ -0,0 +1,165 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Logger implementation that uses custom Env object for logging. + +#pragma once + +#include +#include +#include +#include "port/sys_time.h" + +#include "file/writable_file_writer.h" +#include "monitoring/iostats_context_imp.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "test_util/sync_point.h" +#include "util/mutexlock.h" + +namespace rocksdb { + +class EnvLogger : public Logger { + public: + EnvLogger(std::unique_ptr&& writable_file, + const std::string& fname, const EnvOptions& options, Env* env, + InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) + : Logger(log_level), + file_(std::move(writable_file), fname, options, env), + last_flush_micros_(0), + env_(env), + flush_pending_(false) {} + + ~EnvLogger() { + if (!closed_) { + closed_ = true; + CloseHelper(); + } + } + + private: + void FlushLocked() { + mutex_.AssertHeld(); + if (flush_pending_) { + flush_pending_ = false; + file_.Flush(); + } + last_flush_micros_ = env_->NowMicros(); + } + + void Flush() override { + TEST_SYNC_POINT("EnvLogger::Flush:Begin1"); + TEST_SYNC_POINT("EnvLogger::Flush:Begin2"); + + MutexLock l(&mutex_); + FlushLocked(); + } + + Status CloseImpl() override { return CloseHelper(); } + + Status CloseHelper() { + mutex_.Lock(); + const auto close_status = file_.Close(); + mutex_.Unlock(); + + if (close_status.ok()) { + return close_status; + } + return Status::IOError("Close of log file failed with error:" + + (close_status.getState() + ? std::string(close_status.getState()) + : std::string())); + } + + using Logger::Logv; + void Logv(const char* format, va_list ap) override { + IOSTATS_TIMER_GUARD(logger_nanos); + + const uint64_t thread_id = env_->GetThreadID(); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. 
+ char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 65536; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + struct timeval now_tv; + gettimeofday(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_r(&seconds, &t); + p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, + t.tm_min, t.tm_sec, static_cast(now_tv.tv_usec), + static_cast(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + mutex_.Lock(); + // We will ignore any error returned by Append(). + file_.Append(Slice(base, p - base)); + flush_pending_ = true; + const uint64_t now_micros = env_->NowMicros(); + if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { + FlushLocked(); + } + mutex_.Unlock(); + if (base != buffer) { + delete[] base; + } + break; + } + } + + size_t GetLogFileSize() const override { + MutexLock l(&mutex_); + return file_.GetFileSize(); + } + + private: + WritableFileWriter file_; + mutable port::Mutex mutex_; // Mutex to protect the shared variables below. + const static uint64_t flush_every_seconds_ = 5; + std::atomic_uint_fast64_t last_flush_micros_; + Env* env_; + std::atomic flush_pending_; +}; + +} // namespace rocksdb diff --git a/logging/env_logger_test.cc b/logging/env_logger_test.cc new file mode 100644 index 00000000000..6e1af2d5590 --- /dev/null +++ b/logging/env_logger_test.cc @@ -0,0 +1,162 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "logging/env_logger.h" +#include "env/mock_env.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace rocksdb { + +namespace { +// In this test we only want to Log some simple log message with +// no format. +void LogMessage(std::shared_ptr logger, const std::string& message) { + Log(logger, "%s", message.c_str()); +} + +// Helper method to write the message num_times in the given logger. 
+void WriteLogs(std::shared_ptr logger, const std::string& message, + int num_times) { + for (int ii = 0; ii < num_times; ++ii) { + LogMessage(logger, message); + } +} + +} // namespace + +class EnvLoggerTest : public testing::Test { + public: + Env* env_; + + EnvLoggerTest() : env_(Env::Default()) {} + + ~EnvLoggerTest() = default; + + std::shared_ptr CreateLogger() { + std::shared_ptr result; + assert(NewEnvLogger(kLogFile, env_, &result).ok()); + assert(result); + result->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + return result; + } + + void DeleteLogFile() { ASSERT_OK(env_->DeleteFile(kLogFile)); } + + static const std::string kSampleMessage; + static const std::string kTestDir; + static const std::string kLogFile; +}; + +const std::string EnvLoggerTest::kSampleMessage = + "this is the message to be written to the log file!!"; +const std::string EnvLoggerTest::kLogFile = test::PerThreadDBPath("log_file"); + +TEST_F(EnvLoggerTest, EmptyLogFile) { + auto logger = CreateLogger(); + ASSERT_EQ(logger->Close(), Status::OK()); + + // Check the size of the log file. + uint64_t file_size; + ASSERT_EQ(env_->GetFileSize(kLogFile, &file_size), Status::OK()); + ASSERT_EQ(file_size, 0); + DeleteLogFile(); +} + +TEST_F(EnvLoggerTest, LogMultipleLines) { + auto logger = CreateLogger(); + + // Write multiple lines. + const int kNumIter = 10; + WriteLogs(logger, kSampleMessage, kNumIter); + + // Flush the logs. + logger->Flush(); + ASSERT_EQ(logger->Close(), Status::OK()); + + // Validate whether the log file has 'kNumIter' number of lines. + ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter); + DeleteLogFile(); +} + +TEST_F(EnvLoggerTest, Overwrite) { + { + auto logger = CreateLogger(); + + // Write multiple lines. + const int kNumIter = 10; + WriteLogs(logger, kSampleMessage, kNumIter); + + ASSERT_EQ(logger->Close(), Status::OK()); + + // Validate whether the log file has 'kNumIter' number of lines. + ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter); + } + + // Now reopen the file again. + { + auto logger = CreateLogger(); + + // File should be empty. + uint64_t file_size; + ASSERT_EQ(env_->GetFileSize(kLogFile, &file_size), Status::OK()); + ASSERT_EQ(file_size, 0); + ASSERT_EQ(logger->GetLogFileSize(), 0); + ASSERT_EQ(logger->Close(), Status::OK()); + } + DeleteLogFile(); +} + +TEST_F(EnvLoggerTest, Close) { + auto logger = CreateLogger(); + + // Write multiple lines. + const int kNumIter = 10; + WriteLogs(logger, kSampleMessage, kNumIter); + + ASSERT_EQ(logger->Close(), Status::OK()); + + // Validate whether the log file has 'kNumIter' number of lines. + ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter); + DeleteLogFile(); +} + +TEST_F(EnvLoggerTest, ConcurrentLogging) { + auto logger = CreateLogger(); + + const int kNumIter = 20; + std::function cb = [&]() { + WriteLogs(logger, kSampleMessage, kNumIter); + logger->Flush(); + }; + + // Write to the logs from multiple threads. + std::vector threads; + const int kNumThreads = 5; + // Create threads. + for (int ii = 0; ii < kNumThreads; ++ii) { + threads.push_back(port::Thread(cb)); + } + + // Wait for them to complete. + for (auto& th : threads) { + th.join(); + } + + ASSERT_EQ(logger->Close(), Status::OK()); + + // Verfiy the log file. 
+ ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), + kNumIter * kNumThreads); + DeleteLogFile(); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/util/event_logger.cc b/logging/event_logger.cc similarity index 91% rename from util/event_logger.cc rename to logging/event_logger.cc index b488984f350..4ae9d2d66c1 100644 --- a/util/event_logger.cc +++ b/logging/event_logger.cc @@ -3,18 +3,14 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include "util/event_logger.h" +#include "logging/event_logger.h" -#include #include +#include #include #include -#include "util/logging.h" +#include "logging/logging.h" #include "util/string_util.h" namespace rocksdb { diff --git a/util/event_logger.h b/logging/event_logger.h similarity index 99% rename from util/event_logger.h rename to logging/event_logger.h index d88a6a4fe68..c3a7c30c601 100644 --- a/util/event_logger.h +++ b/logging/event_logger.h @@ -10,8 +10,8 @@ #include #include +#include "logging/log_buffer.h" #include "rocksdb/env.h" -#include "util/log_buffer.h" namespace rocksdb { diff --git a/util/event_logger_test.cc b/logging/event_logger_test.cc similarity index 94% rename from util/event_logger_test.cc rename to logging/event_logger_test.cc index 4bcf30ff5eb..cc635d42fbf 100644 --- a/util/event_logger_test.cc +++ b/logging/event_logger_test.cc @@ -5,8 +5,8 @@ #include -#include "util/event_logger.h" -#include "util/testharness.h" +#include "logging/event_logger.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/util/log_buffer.cc b/logging/log_buffer.cc similarity index 98% rename from util/log_buffer.cc rename to logging/log_buffer.cc index d09e0cb002f..74db11c66e3 100644 --- a/util/log_buffer.cc +++ b/logging/log_buffer.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/log_buffer.h" +#include "logging/log_buffer.h" #include "port/sys_time.h" #include "port/port.h" diff --git a/util/log_buffer.h b/logging/log_buffer.h similarity index 98% rename from util/log_buffer.h rename to logging/log_buffer.h index e356b93a746..16fb243117d 100644 --- a/util/log_buffer.h +++ b/logging/log_buffer.h @@ -5,11 +5,11 @@ #pragma once +#include +#include "memory/arena.h" +#include "port/sys_time.h" #include "rocksdb/env.h" -#include "util/arena.h" #include "util/autovector.h" -#include "port/sys_time.h" -#include namespace rocksdb { diff --git a/util/logging.h b/logging/logging.h similarity index 98% rename from util/logging.h rename to logging/logging.h index a4ef31bd6b5..cad90a309f1 100644 --- a/util/logging.h +++ b/logging/logging.h @@ -19,7 +19,7 @@ inline const char* RocksLogShorterFileName(const char* file) { - // 15 is the length of "util/logging.h". + // 15 is the length of "logging/logging.h". // If the name of this file changed, please change this number, too. return file + (sizeof(__FILE__) > 15 ? 
sizeof(__FILE__) - 15 : 0); } diff --git a/env/posix_logger.h b/logging/posix_logger.h similarity index 99% rename from env/posix_logger.h rename to logging/posix_logger.h index 401df6a3ffb..8406a6d8acc 100644 --- a/env/posix_logger.h +++ b/logging/posix_logger.h @@ -27,7 +27,7 @@ #include "env/io_posix.h" #include "monitoring/iostats_context_imp.h" #include "rocksdb/env.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/util/allocator.h b/memory/allocator.h similarity index 94% rename from util/allocator.h rename to memory/allocator.h index 505d6ba2bbf..619cd66a5fd 100644 --- a/util/allocator.h +++ b/memory/allocator.h @@ -33,6 +33,10 @@ class Allocator { class AllocTracker { public: explicit AllocTracker(WriteBufferManager* write_buffer_manager); + // No copying allowed + AllocTracker(const AllocTracker&) = delete; + void operator=(const AllocTracker&) = delete; + ~AllocTracker(); void Allocate(size_t bytes); // Call when we're finished allocating memory so we can free it from @@ -48,10 +52,6 @@ class AllocTracker { std::atomic bytes_allocated_; bool done_allocating_; bool freed_; - - // No copying allowed - AllocTracker(const AllocTracker&); - void operator=(const AllocTracker&); }; } // namespace rocksdb diff --git a/util/arena.cc b/memory/arena.cc similarity index 97% rename from util/arena.cc rename to memory/arena.cc index d7799eb266a..70c8039015b 100644 --- a/util/arena.cc +++ b/memory/arena.cc @@ -7,22 +7,16 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/arena.h" -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef OS_FREEBSD -#include -#else -#include -#endif -#endif +#include "memory/arena.h" #ifndef OS_WIN #include #endif #include +#include "logging/logging.h" +#include "port/malloc.h" #include "port/port.h" #include "rocksdb/env.h" -#include "util/logging.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/util/arena.h b/memory/arena.h similarity index 99% rename from util/arena.h rename to memory/arena.h index dc64154c857..fd97f57e1e5 100644 --- a/util/arena.h +++ b/memory/arena.h @@ -15,12 +15,12 @@ #ifndef OS_WIN #include #endif -#include -#include -#include #include #include -#include "util/allocator.h" +#include +#include +#include +#include "memory/allocator.h" #include "util/mutexlock.h" namespace rocksdb { diff --git a/util/arena_test.cc b/memory/arena_test.cc similarity index 99% rename from util/arena_test.cc rename to memory/arena_test.cc index 9dfc28ab2ea..18296d307d0 100644 --- a/util/arena_test.cc +++ b/memory/arena_test.cc @@ -7,9 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/arena.h" +#include "memory/arena.h" +#include "test_util/testharness.h" #include "util/random.h" -#include "util/testharness.h" namespace rocksdb { diff --git a/util/concurrent_arena.cc b/memory/concurrent_arena.cc similarity index 97% rename from util/concurrent_arena.cc rename to memory/concurrent_arena.cc index cef77d7e75f..722eb3b60bd 100644 --- a/util/concurrent_arena.cc +++ b/memory/concurrent_arena.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
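Note on the allocator.h hunk above (and the matching InlineSkipList and SkipList hunks further below): the "no copying allowed" members move from private, never-defined declarations to publicly deleted special members. A minimal sketch of the two idioms, using hypothetical class names:

// Old pattern: private, never-defined copy members; misuse fails only at
// link time (or when a member or friend tries to copy).
class OldStyleNoCopy {
 public:
  OldStyleNoCopy() {}
 private:
  OldStyleNoCopy(const OldStyleNoCopy&);
  void operator=(const OldStyleNoCopy&);
};

// Pattern this diff switches to: publicly deleted special members; misuse
// fails at compile time with a direct diagnostic.
class NewStyleNoCopy {
 public:
  NewStyleNoCopy() = default;
  NewStyleNoCopy(const NewStyleNoCopy&) = delete;
  void operator=(const NewStyleNoCopy&) = delete;
};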
-#include "util/concurrent_arena.h" +#include "memory/concurrent_arena.h" #include #include "port/port.h" #include "util/random.h" diff --git a/util/concurrent_arena.h b/memory/concurrent_arena.h similarity index 99% rename from util/concurrent_arena.h rename to memory/concurrent_arena.h index a6191100fd0..6b41ab02470 100644 --- a/util/concurrent_arena.h +++ b/memory/concurrent_arena.h @@ -11,9 +11,9 @@ #include #include #include +#include "memory/allocator.h" +#include "memory/arena.h" #include "port/likely.h" -#include "util/allocator.h" -#include "util/arena.h" #include "util/core_local.h" #include "util/mutexlock.h" #include "util/thread_local.h" diff --git a/util/jemalloc_nodump_allocator.cc b/memory/jemalloc_nodump_allocator.cc similarity index 99% rename from util/jemalloc_nodump_allocator.cc rename to memory/jemalloc_nodump_allocator.cc index cdd08e932e3..1f58351bef6 100644 --- a/util/jemalloc_nodump_allocator.cc +++ b/memory/jemalloc_nodump_allocator.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/jemalloc_nodump_allocator.h" +#include "memory/jemalloc_nodump_allocator.h" #include #include diff --git a/util/jemalloc_nodump_allocator.h b/memory/jemalloc_nodump_allocator.h similarity index 98% rename from util/jemalloc_nodump_allocator.h rename to memory/jemalloc_nodump_allocator.h index e93c1223778..f997a3b8120 100644 --- a/util/jemalloc_nodump_allocator.h +++ b/memory/jemalloc_nodump_allocator.h @@ -11,7 +11,6 @@ #include "port/jemalloc_helper.h" #include "port/port.h" #include "rocksdb/memory_allocator.h" -#include "util/core_local.h" #include "util/thread_local.h" #if defined(ROCKSDB_JEMALLOC) && defined(ROCKSDB_PLATFORM_POSIX) diff --git a/util/memory_allocator.h b/memory/memory_allocator.h similarity index 100% rename from util/memory_allocator.h rename to memory/memory_allocator.h diff --git a/util/memory_usage.h b/memory/memory_usage.h similarity index 100% rename from util/memory_usage.h rename to memory/memory_usage.h diff --git a/memtable/alloc_tracker.cc b/memtable/alloc_tracker.cc index a1fa4938c52..ddd40aa059f 100644 --- a/memtable/alloc_tracker.cc +++ b/memtable/alloc_tracker.cc @@ -8,9 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
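These renames are part of the broader header reorganization in this patch: helpers move out of util/ into memory/, logging/, and test_util/. As a rough illustration only (a hypothetical downstream file, not part of this diff), code that used the old paths updates its includes like so:

#include "memory/arena.h"          // was "util/arena.h"
#include "memory/allocator.h"      // was "util/allocator.h"
#include "logging/logging.h"       // was "util/logging.h"
#include "test_util/sync_point.h"  // was "util/sync_point.h"
#include "test_util/testharness.h" // was "util/testharness.h"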
#include +#include "memory/allocator.h" +#include "memory/arena.h" #include "rocksdb/write_buffer_manager.h" -#include "util/allocator.h" -#include "util/arena.h" namespace rocksdb { diff --git a/memtable/hash_linklist_rep.cc b/memtable/hash_linklist_rep.cc index 878d2338356..5c906165e0a 100644 --- a/memtable/hash_linklist_rep.cc +++ b/memtable/hash_linklist_rep.cc @@ -10,13 +10,13 @@ #include #include #include "db/memtable.h" +#include "memory/arena.h" #include "memtable/skiplist.h" #include "monitoring/histogram.h" #include "port/port.h" #include "rocksdb/memtablerep.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "util/arena.h" #include "util/hash.h" namespace rocksdb { @@ -218,8 +218,7 @@ class HashLinkListRep : public MemTableRep { } size_t GetHash(const Slice& slice) const { - return NPHash64(slice.data(), static_cast(slice.size()), 0) % - bucket_size_; + return fastrange64(GetSliceNPHash64(slice), bucket_size_); } Pointer* GetBucket(size_t i) const { diff --git a/memtable/hash_skiplist_rep.cc b/memtable/hash_skiplist_rep.cc index d02919cd4ef..5c74657cd31 100644 --- a/memtable/hash_skiplist_rep.cc +++ b/memtable/hash_skiplist_rep.cc @@ -9,14 +9,14 @@ #include +#include "db/memtable.h" +#include "memory/arena.h" +#include "memtable/skiplist.h" +#include "port/port.h" #include "rocksdb/memtablerep.h" -#include "util/arena.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "port/port.h" #include "util/murmurhash.h" -#include "db/memtable.h" -#include "memtable/skiplist.h" namespace rocksdb { namespace { diff --git a/memtable/inlineskiplist.h b/memtable/inlineskiplist.h index 1ef8f2b6dbc..91ab3d75460 100644 --- a/memtable/inlineskiplist.h +++ b/memtable/inlineskiplist.h @@ -46,10 +46,10 @@ #include #include #include +#include "memory/allocator.h" #include "port/likely.h" #include "port/port.h" #include "rocksdb/slice.h" -#include "util/allocator.h" #include "util/coding.h" #include "util/random.h" @@ -74,6 +74,9 @@ class InlineSkipList { explicit InlineSkipList(Comparator cmp, Allocator* allocator, int32_t max_height = 12, int32_t branching_factor = 4); + // No copying allowed + InlineSkipList(const InlineSkipList&) = delete; + InlineSkipList& operator=(const InlineSkipList&) = delete; // Allocates a key and a skip-list node, returning a pointer to the key // portion of the node. This method is thread-safe if the allocator @@ -83,6 +86,9 @@ class InlineSkipList { // Allocate a splice using allocator. Splice* AllocateSplice(); + // Allocate a splice on heap. + Splice* AllocateSpliceOnHeap(); + // Inserts a key allocated by AllocateKey, after the actual key value // has been filled in. // @@ -102,6 +108,12 @@ class InlineSkipList { // REQUIRES: no concurrent calls to any of inserts. bool InsertWithHint(const char* key, void** hint); + // Like InsertConcurrently, but with a hint + // + // REQUIRES: nothing that compares equal to key is currently in the list. + // REQUIRES: no concurrent calls that use same hint + bool InsertWithHintConcurrently(const char* key, void** hint); + // Like Insert, but external synchronization is not required. bool InsertConcurrently(const char* key); @@ -254,10 +266,6 @@ class InlineSkipList { // lowest_level (inclusive). 
void RecomputeSpliceLevels(const DecodedKey& key, Splice* splice, int recompute_level); - - // No copying allowed - InlineSkipList(const InlineSkipList&); - InlineSkipList& operator=(const InlineSkipList&); }; // Implementation details follow @@ -643,6 +651,18 @@ InlineSkipList::AllocateSplice() { return splice; } +template +typename InlineSkipList::Splice* +InlineSkipList::AllocateSpliceOnHeap() { + size_t array_size = sizeof(Node*) * (kMaxHeight_ + 1); + char* raw = new char[sizeof(Splice) + array_size * 2]; + Splice* splice = reinterpret_cast(raw); + splice->height_ = 0; + splice->prev_ = reinterpret_cast(raw + sizeof(Splice)); + splice->next_ = reinterpret_cast(raw + sizeof(Splice) + array_size); + return splice; +} + template bool InlineSkipList::Insert(const char* key) { return Insert(key, seq_splice_, false); @@ -669,6 +689,18 @@ bool InlineSkipList::InsertWithHint(const char* key, void** hint) { return Insert(key, splice, true); } +template +bool InlineSkipList::InsertWithHintConcurrently(const char* key, + void** hint) { + assert(hint != nullptr); + Splice* splice = reinterpret_cast(*hint); + if (splice == nullptr) { + splice = AllocateSpliceOnHeap(); + *hint = reinterpret_cast(splice); + } + return Insert(key, splice, true); +} + template template void InlineSkipList::FindSpliceForLevel(const DecodedKey& key, diff --git a/memtable/inlineskiplist_test.cc b/memtable/inlineskiplist_test.cc index b416ef7c557..a3ae4149877 100644 --- a/memtable/inlineskiplist_test.cc +++ b/memtable/inlineskiplist_test.cc @@ -10,11 +10,11 @@ #include "memtable/inlineskiplist.h" #include #include +#include "memory/concurrent_arena.h" #include "rocksdb/env.h" -#include "util/concurrent_arena.h" +#include "test_util/testharness.h" #include "util/hash.h" #include "util/random.h" -#include "util/testharness.h" namespace rocksdb { @@ -412,12 +412,18 @@ class ConcurrentTest { } // REQUIRES: No concurrent calls for the same k - void ConcurrentWriteStep(uint32_t k) { + void ConcurrentWriteStep(uint32_t k, bool use_hint = false) { const int g = current_.Get(k) + 1; const Key new_key = MakeKey(k, g); char* buf = list_.AllocateKey(sizeof(Key)); memcpy(buf, &new_key, sizeof(Key)); - list_.InsertConcurrently(buf); + if (use_hint) { + void* hint = nullptr; + list_.InsertWithHintConcurrently(buf, &hint); + delete[] reinterpret_cast(hint); + } else { + list_.InsertConcurrently(buf); + } ASSERT_EQ(g, current_.Get(k) + 1); current_.Set(k, g); } @@ -508,6 +514,7 @@ TEST_F(InlineSkipTest, ConcurrentInsertWithoutThreads) { class TestState { public: ConcurrentTest t_; + bool use_hint_; int seed_; std::atomic quit_flag_; std::atomic next_writer_; @@ -575,7 +582,7 @@ static void ConcurrentReader(void* arg) { static void ConcurrentWriter(void* arg) { TestState* state = reinterpret_cast(arg); uint32_t k = state->next_writer_++ % ConcurrentTest::K; - state->t_.ConcurrentWriteStep(k); + state->t_.ConcurrentWriteStep(k, state->use_hint_); state->AdjustPendingWriters(-1); } @@ -600,7 +607,8 @@ static void RunConcurrentRead(int run) { } } -static void RunConcurrentInsert(int run, int write_parallelism = 4) { +static void RunConcurrentInsert(int run, bool use_hint = false, + int write_parallelism = 4) { Env::Default()->SetBackgroundThreads(1 + write_parallelism, Env::Priority::LOW); const int seed = test::RandomSeed() + (run * 100); @@ -612,6 +620,7 @@ static void RunConcurrentInsert(int run, int write_parallelism = 4) { fprintf(stderr, "Run %d of %d\n", i, N); } TestState state(seed + 1); + state.use_hint_ = use_hint; 
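The new InlineSkipList::InsertWithHintConcurrently introduced above allocates its splice on the heap (AllocateSpliceOnHeap uses new char[]) instead of the arena, so the caller owns the hint and frees it, as the test does. A minimal single-writer sketch under that assumption; the template parameters and key source are hypothetical stand-ins for the test fixture's types:

#include <cstring>
#include <vector>

// One writer thread inserting a batch of fixed-size keys, reusing one hint.
// REQUIRES (per the header comment above): no concurrent calls share a hint.
template <typename InlineSkipListType, typename Key>
void InsertBatchWithHint(InlineSkipListType& list, const std::vector<Key>& keys) {
  void* hint = nullptr;
  for (const Key& k : keys) {
    char* buf = list.AllocateKey(sizeof(Key));
    memcpy(buf, &k, sizeof(Key));
    list.InsertWithHintConcurrently(buf, &hint);  // reuses the same heap splice
  }
  // The splice was allocated with new char[], so the caller releases it.
  delete[] reinterpret_cast<char*>(hint);
}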
Env::Default()->Schedule(ConcurrentReader, &state); state.Wait(TestState::RUNNING); for (int k = 0; k < kSize; k += write_parallelism) { @@ -635,6 +644,15 @@ TEST_F(InlineSkipTest, ConcurrentRead5) { RunConcurrentRead(5); } TEST_F(InlineSkipTest, ConcurrentInsert1) { RunConcurrentInsert(1); } TEST_F(InlineSkipTest, ConcurrentInsert2) { RunConcurrentInsert(2); } TEST_F(InlineSkipTest, ConcurrentInsert3) { RunConcurrentInsert(3); } +TEST_F(InlineSkipTest, ConcurrentInsertWithHint1) { + RunConcurrentInsert(1, true); +} +TEST_F(InlineSkipTest, ConcurrentInsertWithHint2) { + RunConcurrentInsert(2, true); +} +TEST_F(InlineSkipTest, ConcurrentInsertWithHint3) { + RunConcurrentInsert(3, true); +} #endif // ROCKSDB_VALGRIND_RUN } // namespace rocksdb diff --git a/memtable/memtablerep_bench.cc b/memtable/memtablerep_bench.cc index 51ff11a015c..1e2b5bdd1e5 100644 --- a/memtable/memtablerep_bench.cc +++ b/memtable/memtablerep_bench.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #ifndef GFLAGS #include int main() { @@ -28,6 +24,7 @@ int main() { #include "db/dbformat.h" #include "db/memtable.h" +#include "memory/arena.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/comparator.h" @@ -35,11 +32,10 @@ int main() { #include "rocksdb/options.h" #include "rocksdb/slice_transform.h" #include "rocksdb/write_buffer_manager.h" -#include "util/arena.h" +#include "test_util/testutil.h" #include "util/gflags_compat.h" #include "util/mutexlock.h" #include "util/stop_watch.h" -#include "util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::RegisterFlagValidator; diff --git a/memtable/skiplist.h b/memtable/skiplist.h index 47a89034eb9..5edfc10b7cb 100644 --- a/memtable/skiplist.h +++ b/memtable/skiplist.h @@ -32,10 +32,10 @@ #pragma once #include -#include #include +#include +#include "memory/allocator.h" #include "port/port.h" -#include "util/allocator.h" #include "util/random.h" namespace rocksdb { @@ -51,6 +51,9 @@ class SkipList { // allocator must remain allocated for the lifetime of the skiplist object. explicit SkipList(Comparator cmp, Allocator* allocator, int32_t max_height = 12, int32_t branching_factor = 4); + // No copying allowed + SkipList(const SkipList&) = delete; + void operator=(const SkipList&) = delete; // Insert key into the list. // REQUIRES: nothing that compares equal to key is currently in the list. @@ -158,10 +161,6 @@ class SkipList { // Return the last node in the list. // Return head_ if list is empty. 
Node* FindLast() const; - - // No copying allowed - SkipList(const SkipList&); - void operator=(const SkipList&); }; // Implementation details follow diff --git a/memtable/skiplist_test.cc b/memtable/skiplist_test.cc index 50c3588bb86..33cc19b2d38 100644 --- a/memtable/skiplist_test.cc +++ b/memtable/skiplist_test.cc @@ -9,11 +9,11 @@ #include "memtable/skiplist.h" #include +#include "memory/arena.h" #include "rocksdb/env.h" -#include "util/arena.h" +#include "test_util/testharness.h" #include "util/hash.h" #include "util/random.h" -#include "util/testharness.h" namespace rocksdb { diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc index 32870b127d2..55d3cd7a658 100644 --- a/memtable/skiplistrep.cc +++ b/memtable/skiplistrep.cc @@ -3,10 +3,10 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#include "memtable/inlineskiplist.h" #include "db/memtable.h" +#include "memory/arena.h" +#include "memtable/inlineskiplist.h" #include "rocksdb/memtablerep.h" -#include "util/arena.h" namespace rocksdb { namespace { @@ -50,6 +50,15 @@ class SkipListRep : public MemTableRep { return skip_list_.InsertWithHint(static_cast(handle), hint); } + void InsertWithHintConcurrently(KeyHandle handle, void** hint) override { + skip_list_.InsertWithHintConcurrently(static_cast(handle), hint); + } + + bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) override { + return skip_list_.InsertWithHintConcurrently(static_cast(handle), + hint); + } + void InsertConcurrently(KeyHandle handle) override { skip_list_.InsertConcurrently(static_cast(handle)); } diff --git a/memtable/vectorrep.cc b/memtable/vectorrep.cc index 827ab8a5d2b..e7acc94ad67 100644 --- a/memtable/vectorrep.cc +++ b/memtable/vectorrep.cc @@ -12,8 +12,8 @@ #include #include -#include "util/arena.h" #include "db/memtable.h" +#include "memory/arena.h" #include "memtable/stl_wrappers.h" #include "port/port.h" #include "util/mutexlock.h" diff --git a/memtable/write_buffer_manager_test.cc b/memtable/write_buffer_manager_test.cc index 3c89c8095e1..23de06a623a 100644 --- a/memtable/write_buffer_manager_test.cc +++ b/memtable/write_buffer_manager_test.cc @@ -8,7 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/write_buffer_manager.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { @@ -51,8 +51,12 @@ TEST_F(WriteBufferManagerTest, ShouldFlush) { } TEST_F(WriteBufferManagerTest, CacheCost) { + LRUCacheOptions co; // 1GB cache - std::shared_ptr cache = NewLRUCache(1024 * 1024 * 1024, 4); + co.capacity = 1024 * 1024 * 1024; + co.num_shard_bits = 4; + co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr cache = NewLRUCache(co); // A write buffer manager of size 50MB std::unique_ptr wbf( new WriteBufferManager(50 * 1024 * 1024, cache)); diff --git a/monitoring/histogram.cc b/monitoring/histogram.cc index 4bc7139d304..4449ade6408 100644 --- a/monitoring/histogram.cc +++ b/monitoring/histogram.cc @@ -7,16 +7,12 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
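The write_buffer_manager_test change above switches from the NewLRUCache(capacity, num_shard_bits) overload to the LRUCacheOptions form so the test can pin metadata_charge_policy to kDontChargeCacheMetadata and keep its cache-cost accounting stable. A small sketch of the same construction (the values are simply the test's, not a recommendation):

#include <memory>
#include "rocksdb/cache.h"

std::shared_ptr<rocksdb::Cache> MakeTestCache() {
  rocksdb::LRUCacheOptions co;
  co.capacity = 1024 * 1024 * 1024;  // 1GB
  co.num_shard_bits = 4;
  co.metadata_charge_policy = rocksdb::kDontChargeCacheMetadata;
  return rocksdb::NewLRUCache(co);
}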
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "monitoring/histogram.h" -#include -#include #include #include +#include +#include #include "port/port.h" #include "util/cast_util.h" diff --git a/monitoring/histogram_test.cc b/monitoring/histogram_test.cc index df58822fc21..ed9a7bd32ff 100644 --- a/monitoring/histogram_test.cc +++ b/monitoring/histogram_test.cc @@ -7,7 +7,7 @@ #include "monitoring/histogram.h" #include "monitoring/histogram_windowing.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/in_memory_stats_history.cc b/monitoring/in_memory_stats_history.cc similarity index 82% rename from db/in_memory_stats_history.cc rename to monitoring/in_memory_stats_history.cc index 39355cfbe0a..22ecde0ab6c 100644 --- a/db/in_memory_stats_history.cc +++ b/monitoring/in_memory_stats_history.cc @@ -6,8 +6,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" -#include "db/in_memory_stats_history.h" +#include "monitoring/in_memory_stats_history.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { @@ -17,6 +17,10 @@ bool InMemoryStatsHistoryIterator::Valid() const { return valid_; } Status InMemoryStatsHistoryIterator::status() const { return status_; } +// Because of garbage collection, the next stats snapshot may or may not be +// right after the current one. When reading from DBImpl::stats_history_, this +// call will be protected by DB Mutex so it will not return partial or +// corrupted results. void InMemoryStatsHistoryIterator::Next() { // increment start_time by 1 to avoid infinite loop AdvanceIteratorByTime(GetStatsTime() + 1, end_time_); diff --git a/db/in_memory_stats_history.h b/monitoring/in_memory_stats_history.h similarity index 63% rename from db/in_memory_stats_history.h rename to monitoring/in_memory_stats_history.h index 4b52e23fffa..37b50ca06de 100644 --- a/db/in_memory_stats_history.h +++ b/monitoring/in_memory_stats_history.h @@ -12,8 +12,20 @@ namespace rocksdb { +// InMemoryStatsHistoryIterator can be used to access stats history that was +// stored by an in-memory two level std::map(DBImpl::stats_history_). It keeps +// a copy of the stats snapshot (in stats_map_) that is currently being pointed +// to, which allows the iterator to access the stats snapshot even when +// the background garbage collecting thread purges it from the source of truth +// (`DBImpl::stats_history_`). In that case, the iterator will continue to be +// valid until a call to `Next()` returns no result and invalidates it. In +// some extreme cases, the iterator may also return fragmented segments of +// stats snapshots due to long gaps between `Next()` calls and interleaved +// garbage collection. 
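The comment block above describes how the in-memory stats history iterator behaves around garbage collection; the tests added later in this patch drive it through DB::GetStatsHistory. A minimal consumer sketch (error handling kept to the essentials):

#include <cinttypes>
#include <cstdio>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/stats_history.h"

void DumpStatsHistory(rocksdb::DB* db, uint64_t start_time, uint64_t end_time) {
  std::unique_ptr<rocksdb::StatsHistoryIterator> it;
  rocksdb::Status s = db->GetStatsHistory(start_time, end_time, &it);
  if (!s.ok() || it == nullptr) {
    return;
  }
  for (; it->Valid(); it->Next()) {
    const uint64_t when = it->GetStatsTime();  // seconds timestamp of the snapshot
    for (const auto& stat : it->GetStatsMap()) {
      fprintf(stdout, "%" PRIu64 " %s = %" PRIu64 "\n", when,
              stat.first.c_str(), stat.second);
    }
  }
}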
class InMemoryStatsHistoryIterator final : public StatsHistoryIterator { public: + // Setup InMemoryStatsHistoryIterator to return stats snapshots between + // seconds timestamps [start_time, end_time) InMemoryStatsHistoryIterator(uint64_t start_time, uint64_t end_time, DBImpl* db_impl) : start_time_(start_time), @@ -22,13 +34,27 @@ class InMemoryStatsHistoryIterator final : public StatsHistoryIterator { db_impl_(db_impl) { AdvanceIteratorByTime(start_time_, end_time_); } + // no copying allowed + InMemoryStatsHistoryIterator(const InMemoryStatsHistoryIterator&) = delete; + void operator=(const InMemoryStatsHistoryIterator&) = delete; + InMemoryStatsHistoryIterator(InMemoryStatsHistoryIterator&&) = delete; + InMemoryStatsHistoryIterator& operator=(InMemoryStatsHistoryIterator&&) = + delete; + ~InMemoryStatsHistoryIterator() override; bool Valid() const override; Status status() const override; + // Move to the next stats snapshot currently available + // This function may invalidate the iterator + // REQUIRES: Valid() void Next() override; + + // REQUIRES: Valid() uint64_t GetStatsTime() const override; + // This function is idempotent + // REQUIRES: Valid() const std::map& GetStatsMap() const override; private: @@ -36,13 +62,6 @@ class InMemoryStatsHistoryIterator final : public StatsHistoryIterator { // between [start_time, end_time) void AdvanceIteratorByTime(uint64_t start_time, uint64_t end_time); - // No copying allowed - InMemoryStatsHistoryIterator(const InMemoryStatsHistoryIterator&) = delete; - void operator=(const InMemoryStatsHistoryIterator&) = delete; - InMemoryStatsHistoryIterator(InMemoryStatsHistoryIterator&&) = delete; - InMemoryStatsHistoryIterator& operator=(InMemoryStatsHistoryIterator&&) = - delete; - uint64_t time_; uint64_t start_time_; uint64_t end_time_; diff --git a/monitoring/instrumented_mutex.cc b/monitoring/instrumented_mutex.cc index 7b61bcf4fb8..796bb26dd4b 100644 --- a/monitoring/instrumented_mutex.cc +++ b/monitoring/instrumented_mutex.cc @@ -6,7 +6,7 @@ #include "monitoring/instrumented_mutex.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { namespace { diff --git a/monitoring/iostats_context.cc b/monitoring/iostats_context.cc index 3d102f91203..20e0467ab88 100644 --- a/monitoring/iostats_context.cc +++ b/monitoring/iostats_context.cc @@ -54,7 +54,9 @@ std::string IOStatsContext::ToString(bool exclude_zero_counters) const { IOSTATS_CONTEXT_OUTPUT(prepare_write_nanos); IOSTATS_CONTEXT_OUTPUT(logger_nanos); - return ss.str(); + std::string str = ss.str(); + str.erase(str.find_last_not_of(", ") + 1); + return str; } } // namespace rocksdb diff --git a/monitoring/iostats_context_test.cc b/monitoring/iostats_context_test.cc index 74d3e43291d..28d305d021a 100644 --- a/monitoring/iostats_context_test.cc +++ b/monitoring/iostats_context_test.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). 
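The IOStatsContext::ToString change above (and the matching PerfContext::ToString change just below) trims the trailing ", " left behind by the OUTPUT macros using find_last_not_of. The same trick in isolation:

#include <string>

// "a = 1, b = 2, "  ->  "a = 1, b = 2"
std::string StripTrailingSeparator(std::string str) {
  str.erase(str.find_last_not_of(", ") + 1);
  return str;
}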
#include "rocksdb/iostats_context.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/monitoring/perf_context.cc b/monitoring/perf_context.cc index 40b0b215c47..5e0d5ac2544 100644 --- a/monitoring/perf_context.cc +++ b/monitoring/perf_context.cc @@ -529,7 +529,10 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const { PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_full_true_positive); PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(block_cache_hit_count); PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(block_cache_miss_count); - return ss.str(); + + std::string str = ss.str(); + str.erase(str.find_last_not_of(", ") + 1); + return str; #endif } diff --git a/monitoring/perf_context_imp.h b/monitoring/perf_context_imp.h index e0ff8afc58e..7bf62060557 100644 --- a/monitoring/perf_context_imp.h +++ b/monitoring/perf_context_imp.h @@ -22,12 +22,16 @@ extern thread_local PerfContext perf_context; #if defined(NPERF_CONTEXT) -#define PERF_TIMER_GUARD(metric) -#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition) -#define PERF_TIMER_MEASURE(metric) #define PERF_TIMER_STOP(metric) #define PERF_TIMER_START(metric) +#define PERF_TIMER_GUARD(metric) +#define PERF_TIMER_GUARD_WITH_ENV(metric, env) +#define PERF_CPU_TIMER_GUARD(metric, env) +#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ + ticker_type) +#define PERF_TIMER_MEASURE(metric) #define PERF_COUNTER_ADD(metric, value) +#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) #else diff --git a/monitoring/persistent_stats_history.cc b/monitoring/persistent_stats_history.cc new file mode 100644 index 00000000000..74f7303648f --- /dev/null +++ b/monitoring/persistent_stats_history.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "monitoring/persistent_stats_history.h" + +#include +#include +#include +#include "db/db_impl/db_impl.h" +#include "port/likely.h" +#include "util/string_util.h" + +namespace rocksdb { +// 10 digit seconds timestamp => [Sep 9, 2001 ~ Nov 20, 2286] +const int kNowSecondsStringLength = 10; +const std::string kFormatVersionKeyString = + "__persistent_stats_format_version__"; +const std::string kCompatibleVersionKeyString = + "__persistent_stats_compatible_version__"; +// Every release maintains two versions numbers for persistents stats: Current +// format version and compatible format version. Current format version +// designates what type of encoding will be used when writing to stats CF; +// compatible format version designates the minimum format version that +// can decode the stats CF encoded using the current format version. 
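As a rough illustration of how the two persisted version numbers described above can be consulted on recovery (a hypothetical check sketched from that comment, not code from this patch): stats written by the same or an older format are always readable, and stats written by a newer format are readable only if that writer declared this reader's format as still compatible.

#include <cstdint>

bool CanDecodePersistentStats(uint64_t persisted_format_version,
                              uint64_t persisted_compatible_version,
                              uint64_t reader_current_format_version) {
  if (persisted_format_version <= reader_current_format_version) {
    return true;  // same or older encoding
  }
  // Newer encoding: readable only if its compatible version covers us.
  return persisted_compatible_version <= reader_current_format_version;
}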
+const uint64_t kStatsCFCurrentFormatVersion = 1; +const uint64_t kStatsCFCompatibleFormatVersion = 1; + +Status DecodePersistentStatsVersionNumber(DBImpl* db, StatsVersionKeyType type, + uint64_t* version_number) { + if (type >= StatsVersionKeyType::kKeyTypeMax) { + return Status::InvalidArgument("Invalid stats version key type provided"); + } + std::string key; + if (type == StatsVersionKeyType::kFormatVersion) { + key = kFormatVersionKeyString; + } else if (type == StatsVersionKeyType::kCompatibleVersion) { + key = kCompatibleVersionKeyString; + } + ReadOptions options; + options.verify_checksums = true; + std::string result; + Status s = db->Get(options, db->PersistentStatsColumnFamily(), key, &result); + if (!s.ok() || result.empty()) { + return Status::NotFound("Persistent stats version key " + key + + " not found."); + } + + // read version_number but do nothing in current version + *version_number = ParseUint64(result); + return Status::OK(); +} + +int EncodePersistentStatsKey(uint64_t now_seconds, const std::string& key, + int size, char* buf) { + char timestamp[kNowSecondsStringLength + 1]; + // make time stamp string equal in length to allow sorting by time + snprintf(timestamp, sizeof(timestamp), "%010d", + static_cast(now_seconds)); + timestamp[kNowSecondsStringLength] = '\0'; + return snprintf(buf, size, "%s#%s", timestamp, key.c_str()); +} + +void OptimizeForPersistentStats(ColumnFamilyOptions* cfo) { + cfo->write_buffer_size = 2 << 20; + cfo->target_file_size_base = 2 * 1048576; + cfo->max_bytes_for_level_base = 10 * 1048576; + cfo->soft_pending_compaction_bytes_limit = 256 * 1048576; + cfo->hard_pending_compaction_bytes_limit = 1073741824ul; + cfo->compression = kNoCompression; +} + +PersistentStatsHistoryIterator::~PersistentStatsHistoryIterator() {} + +bool PersistentStatsHistoryIterator::Valid() const { return valid_; } + +Status PersistentStatsHistoryIterator::status() const { return status_; } + +void PersistentStatsHistoryIterator::Next() { + // increment start_time by 1 to avoid infinite loop + AdvanceIteratorByTime(GetStatsTime() + 1, end_time_); +} + +uint64_t PersistentStatsHistoryIterator::GetStatsTime() const { return time_; } + +const std::map& +PersistentStatsHistoryIterator::GetStatsMap() const { + return stats_map_; +} + +std::pair parseKey(const Slice& key, + uint64_t start_time) { + std::pair result; + std::string key_str = key.ToString(); + std::string::size_type pos = key_str.find("#"); + // TODO(Zhongyi): add counters to track parse failures? 
+ if (pos == std::string::npos) { + result.first = port::kMaxUint64; + result.second.clear(); + } else { + uint64_t parsed_time = ParseUint64(key_str.substr(0, pos)); + // skip entries with timestamp smaller than start_time + if (parsed_time < start_time) { + result.first = port::kMaxUint64; + result.second = ""; + } else { + result.first = parsed_time; + std::string key_resize = key_str.substr(pos + 1); + result.second = key_resize; + } + } + return result; +} + +// advance the iterator to the next time between [start_time, end_time) +// if success, update time_ and stats_map_ with new_time and stats_map +void PersistentStatsHistoryIterator::AdvanceIteratorByTime(uint64_t start_time, + uint64_t end_time) { + // try to find next entry in stats_history_ map + if (db_impl_ != nullptr) { + ReadOptions ro; + Iterator* iter = + db_impl_->NewIterator(ro, db_impl_->PersistentStatsColumnFamily()); + + char timestamp[kNowSecondsStringLength + 1]; + snprintf(timestamp, sizeof(timestamp), "%010d", + static_cast(std::max(time_, start_time))); + timestamp[kNowSecondsStringLength] = '\0'; + + iter->Seek(timestamp); + // no more entries with timestamp >= start_time is found or version key + // is found to be incompatible + if (!iter->Valid()) { + valid_ = false; + delete iter; + return; + } + time_ = parseKey(iter->key(), start_time).first; + valid_ = true; + // check parsed time and invalid if it exceeds end_time + if (time_ > end_time) { + valid_ = false; + delete iter; + return; + } + // find all entries with timestamp equal to time_ + std::map new_stats_map; + std::pair kv; + for (; iter->Valid(); iter->Next()) { + kv = parseKey(iter->key(), start_time); + if (kv.first != time_) { + break; + } + if (kv.second.compare(kFormatVersionKeyString) == 0) { + continue; + } + new_stats_map[kv.second] = ParseUint64(iter->value().ToString()); + } + stats_map_.swap(new_stats_map); + delete iter; + } else { + valid_ = false; + } +} + +} // namespace rocksdb diff --git a/monitoring/persistent_stats_history.h b/monitoring/persistent_stats_history.h new file mode 100644 index 00000000000..9a6885987fd --- /dev/null +++ b/monitoring/persistent_stats_history.h @@ -0,0 +1,83 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
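EncodePersistentStatsKey and parseKey above define the stats CF key layout: a zero-padded 10-digit seconds timestamp, a '#' separator, then the stat name, so keys sort by time. A small round-trip sketch under those assumptions (helper names are hypothetical, and the timestamp is capped at 10 digits just as in the code above):

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <string>

// Encode, e.g. "0000000005#rocksdb.block.cache.miss"
std::string MakeStatsKey(uint64_t now_seconds, const std::string& stat_name) {
  char buf[128];
  snprintf(buf, sizeof(buf), "%010d#%s", static_cast<int>(now_seconds),
           stat_name.c_str());
  return buf;
}

// Decode back into (timestamp, stat name); returns false if no '#' is found.
bool SplitStatsKey(const std::string& key, uint64_t* seconds,
                   std::string* stat_name) {
  std::string::size_type pos = key.find('#');
  if (pos == std::string::npos) {
    return false;
  }
  *seconds = strtoull(key.substr(0, pos).c_str(), nullptr, 10);
  *stat_name = key.substr(pos + 1);
  return true;
}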
+ +#pragma once + +#include "db/db_impl/db_impl.h" +#include "rocksdb/stats_history.h" + +namespace rocksdb { + +extern const std::string kFormatVersionKeyString; +extern const std::string kCompatibleVersionKeyString; +extern const uint64_t kStatsCFCurrentFormatVersion; +extern const uint64_t kStatsCFCompatibleFormatVersion; + +enum StatsVersionKeyType : uint32_t { + kFormatVersion = 1, + kCompatibleVersion = 2, + kKeyTypeMax = 3 +}; + +// Read the version number from persitent stats cf depending on type provided +// stores the version number in `*version_number` +// returns Status::OK() on success, or other status code on failure +Status DecodePersistentStatsVersionNumber(DBImpl* db, StatsVersionKeyType type, + uint64_t* version_number); + +// Encode timestamp and stats key into buf +// Format: timestamp(10 digit) + '#' + key +// Total length of encoded key will be capped at 100 bytes +int EncodePersistentStatsKey(uint64_t timestamp, const std::string& key, + int size, char* buf); + +void OptimizeForPersistentStats(ColumnFamilyOptions* cfo); + +class PersistentStatsHistoryIterator final : public StatsHistoryIterator { + public: + PersistentStatsHistoryIterator(uint64_t start_time, uint64_t end_time, + DBImpl* db_impl) + : time_(0), + start_time_(start_time), + end_time_(end_time), + valid_(true), + db_impl_(db_impl) { + AdvanceIteratorByTime(start_time_, end_time_); + } + ~PersistentStatsHistoryIterator() override; + bool Valid() const override; + Status status() const override; + + void Next() override; + uint64_t GetStatsTime() const override; + + const std::map& GetStatsMap() const override; + + private: + // advance the iterator to the next stats history record with timestamp + // between [start_time, end_time) + void AdvanceIteratorByTime(uint64_t start_time, uint64_t end_time); + + // No copying allowed + PersistentStatsHistoryIterator(const PersistentStatsHistoryIterator&) = + delete; + void operator=(const PersistentStatsHistoryIterator&) = delete; + PersistentStatsHistoryIterator(PersistentStatsHistoryIterator&&) = delete; + PersistentStatsHistoryIterator& operator=(PersistentStatsHistoryIterator&&) = + delete; + + uint64_t time_; + uint64_t start_time_; + uint64_t end_time_; + std::map stats_map_; + Status status_; + bool valid_; + DBImpl* db_impl_; +}; + +} // namespace rocksdb diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index fe2f2e25af3..6942fc579f8 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -5,15 +5,11 @@ // #include "monitoring/statistics.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include -#include "rocksdb/statistics.h" -#include "port/likely.h" #include +#include #include +#include "port/likely.h" +#include "rocksdb/statistics.h" namespace rocksdb { @@ -166,6 +162,7 @@ const std::vector> TickersNameMap = { "rocksdb.txn.overhead.mutex.old.commit.map"}, {TXN_DUPLICATE_KEY_OVERHEAD, "rocksdb.txn.overhead.duplicate.key"}, {TXN_SNAPSHOT_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.snapshot"}, + {TXN_GET_TRY_AGAIN, "rocksdb.txn.get.tryagain"}, {NUMBER_MULTIGET_KEYS_FOUND, "rocksdb.number.multiget.keys.found"}, {NO_ITERATOR_CREATED, "rocksdb.num.iterator.created"}, {NO_ITERATOR_DELETED, "rocksdb.num.iterator.deleted"}, diff --git a/monitoring/statistics_test.cc b/monitoring/statistics_test.cc index a77022bfb3d..162afb264b2 100644 --- a/monitoring/statistics_test.cc +++ b/monitoring/statistics_test.cc @@ -5,8 +5,8 @@ // #include "port/stack_trace.h" -#include "util/testharness.h" -#include 
"util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "rocksdb/statistics.h" diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc new file mode 100644 index 00000000000..f1ec3391132 --- /dev/null +++ b/monitoring/stats_history_test.cc @@ -0,0 +1,653 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include +#include +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" +#include "monitoring/persistent_stats_history.h" +#include "options/options_helper.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/rate_limiter.h" +#include "rocksdb/stats_history.h" +#include "test_util/sync_point.h" +#include "test_util/testutil.h" +#include "util/random.h" + +namespace rocksdb { + +class StatsHistoryTest : public DBTestBase { + public: + StatsHistoryTest() : DBTestBase("/stats_history_test") {} +}; +#ifndef ROCKSDB_LITE + +TEST_F(StatsHistoryTest, RunStatsDumpPeriodSec) { + Options options; + options.create_if_missing = true; + options.stats_dump_period_sec = 5; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + int counter = 0; + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); +#endif // OS_MACOSX && !NDEBUG + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DumpStats:1", [&](void* /*arg*/) { counter++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec); + dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(5); }); + ASSERT_GE(counter, 1); + + // Test cacel job through SetOptions + ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "0"}})); + int old_val = counter; + for (int i = 6; i < 20; ++i) { + dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(i); }); + } + ASSERT_EQ(counter, old_val); + Close(); +} + +// Test persistent stats background thread scheduling and cancelling +TEST_F(StatsHistoryTest, StatsPersistScheduling) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = 
*reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); +#endif // OS_MACOSX && !NDEBUG + int counter = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_persist_period_sec); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + ASSERT_GE(counter, 1); + + // Test cacel job through SetOptions + ASSERT_TRUE(dbfull()->TEST_IsPersistentStatsEnabled()); + ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); + ASSERT_FALSE(dbfull()->TEST_IsPersistentStatsEnabled()); + Close(); +} + +// Test enabling persistent stats for the first time +TEST_F(StatsHistoryTest, PersistentStatsFreshInstall) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 0; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); +#endif // OS_MACOSX && !NDEBUG + int counter = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "5"}})); + ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_persist_period_sec); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + ASSERT_GE(counter, 1); + Close(); +} + +// TODO(Zhongyi): Move persistent stats related tests to a separate file +TEST_F(StatsHistoryTest, GetStatsHistoryInMemory) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); +#endif // OS_MACOSX && !NDEBUG + + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + int mock_time = 1; + // Wait for stats persist to finish + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 6 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + // disabled stats snapshots + 
ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); + size_t stats_count = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + auto stats_map = stats_iter->GetStatsMap(); + ASSERT_EQ(stats_iter->GetStatsTime(), 5); + stats_count += stats_map.size(); + } + ASSERT_GT(stats_count, 0); + // Wait a bit and verify no more stats are found + for (mock_time = 6; mock_time < 20; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + db_->GetStatsHistory(0 /*start_time*/, 20 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count_new = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + stats_count_new += stats_iter->GetStatsMap().size(); + } + ASSERT_EQ(stats_count_new, stats_count); + Close(); +} + +TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) { + Options options; + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.stats_persist_period_sec = 1; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); +#endif // OS_MACOSX && !NDEBUG + + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + // some random operation to populate statistics + ASSERT_OK(Delete("foo")); + ASSERT_OK(Put("sol", "sol")); + ASSERT_OK(Put("epic", "epic")); + ASSERT_OK(Put("ltd", "ltd")); + ASSERT_EQ("sol", Get("sol")); + ASSERT_EQ("epic", Get("epic")); + ASSERT_EQ("ltd", Get("ltd")); + Iterator* iterator = db_->NewIterator(ReadOptions()); + for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { + ASSERT_TRUE(iterator->key() == iterator->value()); + } + delete iterator; + ASSERT_OK(Flush()); + ASSERT_OK(Delete("sol")); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + int mock_time = 1; + // Wait for stats persist to finish + for (; mock_time < 5; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + + // second round of ops + ASSERT_OK(Put("saigon", "saigon")); + ASSERT_OK(Put("noodle talk", "noodle talk")); + ASSERT_OK(Put("ping bistro", "ping bistro")); + iterator = db_->NewIterator(ReadOptions()); + for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { + ASSERT_TRUE(iterator->key() == iterator->value()); + } + delete iterator; + ASSERT_OK(Flush()); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + for (; mock_time < 10; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 10 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count = 0; + int slice_count = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + slice_count++; + auto stats_map = stats_iter->GetStatsMap(); + stats_count += stats_map.size(); + } + size_t 
stats_history_size = dbfull()->TEST_EstimateInMemoryStatsHistorySize(); + ASSERT_GE(slice_count, 9); + ASSERT_GE(stats_history_size, 12000); + // capping memory cost at 12000 bytes since one slice is around 10000~12000 + ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "12000"}})); + ASSERT_EQ(12000, dbfull()->GetDBOptions().stats_history_buffer_size); + // Wait for stats persist to finish + for (; mock_time < 20; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + db_->GetStatsHistory(0 /*start_time*/, 20 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count_reopen = 0; + slice_count = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + slice_count++; + auto stats_map = stats_iter->GetStatsMap(); + stats_count_reopen += stats_map.size(); + } + size_t stats_history_size_reopen = + dbfull()->TEST_EstimateInMemoryStatsHistorySize(); + // only one slice can fit under the new stats_history_buffer_size + ASSERT_LT(slice_count, 2); + ASSERT_TRUE(stats_history_size_reopen < 12000 && + stats_history_size_reopen > 0); + ASSERT_TRUE(stats_count_reopen < stats_count && stats_count_reopen > 0); + Close(); + // TODO: may also want to verify stats timestamp to make sure we are purging + // the correct stats snapshot +} + +int countkeys(Iterator* iter) { + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + count++; + } + return count; +} + +TEST_F(StatsHistoryTest, GetStatsHistoryFromDisk) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + options.persist_stats_to_disk = true; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ(Get("foo"), "bar"); + + // Wait for stats persist to finish + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + auto iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + int key_count1 = countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(10); }); + iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + int key_count2 = countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(15); }); + iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + int key_count3 = countkeys(iter); + delete iter; + ASSERT_GE(key_count2, key_count1); + ASSERT_GE(key_count3, key_count2); + ASSERT_EQ(key_count3 - key_count2, key_count2 - key_count1); + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 16 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count = 0; + int slice_count = 0; + int non_zero_count = 0; + for (int i = 1; stats_iter->Valid(); stats_iter->Next(), i++) { + slice_count++; + auto stats_map = stats_iter->GetStatsMap(); + ASSERT_EQ(stats_iter->GetStatsTime(), 5 * i); + for (auto& stat : stats_map) { + if (stat.second != 0) { + non_zero_count++; + } + } + stats_count += stats_map.size(); + } + ASSERT_EQ(slice_count, 3); + // 2 extra keys for format version + ASSERT_EQ(stats_count, key_count3 - 2); + // verify reopen will not 
cause data loss + ReopenWithColumnFamilies({"default", "pikachu"}, options); + db_->GetStatsHistory(0 /*start_time*/, 16 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count_reopen = 0; + int slice_count_reopen = 0; + int non_zero_count_recover = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + slice_count_reopen++; + auto stats_map = stats_iter->GetStatsMap(); + for (auto& stat : stats_map) { + if (stat.second != 0) { + non_zero_count_recover++; + } + } + stats_count_reopen += stats_map.size(); + } + ASSERT_EQ(non_zero_count, non_zero_count_recover); + ASSERT_EQ(slice_count, slice_count_reopen); + ASSERT_EQ(stats_count, stats_count_reopen); + Close(); +} + +// Test persisted stats matches the value found in options.statistics and +// the stats value retains after DB reopen +TEST_F(StatsHistoryTest, PersitentStatsVerifyValue) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + options.persist_stats_to_disk = true; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + std::map stats_map_before; + ASSERT_TRUE(options.statistics->getTickerMap(&stats_map_before)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ(Get("foo"), "bar"); + + // Wait for stats persist to finish + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + auto iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(10); }); + iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(15); }); + iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(20); }); + + std::map stats_map_after; + ASSERT_TRUE(options.statistics->getTickerMap(&stats_map_after)); + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 21 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + std::string sample = "rocksdb.num.iterator.deleted"; + uint64_t recovered_value = 0; + for (int i = 1; stats_iter->Valid(); stats_iter->Next(), ++i) { + auto stats_map = stats_iter->GetStatsMap(); + ASSERT_EQ(stats_iter->GetStatsTime(), 5 * i); + for (const auto& stat : stats_map) { + if (sample.compare(stat.first) == 0) { + recovered_value += stat.second; + } + } + } + ASSERT_EQ(recovered_value, stats_map_after[sample]); + + // test stats value retains after recovery + ReopenWithColumnFamilies({"default", "pikachu"}, options); + db_->GetStatsHistory(0 /*start_time*/, 21 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + uint64_t new_recovered_value = 0; + for (int i = 1; stats_iter->Valid(); stats_iter->Next(), i++) { + auto stats_map = stats_iter->GetStatsMap(); + ASSERT_EQ(stats_iter->GetStatsTime(), 5 * i); + for (const auto& stat : stats_map) { + if (sample.compare(stat.first) == 0) { + new_recovered_value += stat.second; + } + } + } + ASSERT_EQ(recovered_value, new_recovered_value); + + // TODO(Zhongyi): also add test to read raw values from 
disk and verify + // correctness + Close(); +} + +// TODO(Zhongyi): add test for different format versions + +TEST_F(StatsHistoryTest, PersistentStatsCreateColumnFamilies) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + options.persist_stats_to_disk = true; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + ASSERT_OK(TryReopen(options)); + CreateColumnFamilies({"one", "two", "three"}, options); + ASSERT_OK(Put(1, "foo", "bar")); + ReopenWithColumnFamilies({"default", "one", "two", "three"}, options); + ASSERT_EQ(Get(2, "foo"), "bar"); + CreateColumnFamilies({"four"}, options); + ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options); + ASSERT_EQ(Get(2, "foo"), "bar"); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + auto iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + int key_count = countkeys(iter); + delete iter; + ASSERT_GE(key_count, 0); + uint64_t num_write_wal = 0; + std::string sample = "rocksdb.write.wal"; + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 5 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + for (; stats_iter->Valid(); stats_iter->Next()) { + auto stats_map = stats_iter->GetStatsMap(); + for (const auto& stat : stats_map) { + if (sample.compare(stat.first) == 0) { + num_write_wal += stat.second; + } + } + } + stats_iter.reset(); + ASSERT_EQ(num_write_wal, 2); + + options.persist_stats_to_disk = false; + ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options); + int cf_count = 0; + for (auto cfd : *dbfull()->versions_->GetColumnFamilySet()) { + (void)cfd; + cf_count++; + } + // persistent stats cf will be implicitly opened even if + // persist_stats_to_disk is false + ASSERT_EQ(cf_count, 6); + ASSERT_EQ(Get(2, "foo"), "bar"); + + // attempt to create column family using same name, should fail + ColumnFamilyOptions cf_opts(options); + ColumnFamilyHandle* handle; + ASSERT_NOK(db_->CreateColumnFamily(cf_opts, kPersistentStatsColumnFamilyName, + &handle)); + + options.persist_stats_to_disk = true; + ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options); + ASSERT_NOK(db_->CreateColumnFamily(cf_opts, kPersistentStatsColumnFamilyName, + &handle)); + // verify stats is not affected by prior failed CF creation + db_->GetStatsHistory(0 /*start_time*/, 5 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + num_write_wal = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + auto stats_map = stats_iter->GetStatsMap(); + for (const auto& stat : stats_map) { + if (sample.compare(stat.first) == 0) { + num_write_wal += stat.second; + } + } + } + ASSERT_EQ(num_write_wal, 2); + + Close(); + Destroy(options); +} + +TEST_F(StatsHistoryTest, PersistentStatsReadOnly) { + ASSERT_OK(Put("bar", "v2")); + Close(); + + auto options = CurrentOptions(); + options.stats_persist_period_sec = 5; + options.persist_stats_to_disk = true; + assert(options.env == env_); + ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_EQ("v2", Get("bar")); + Close(); + + // Reopen and flush memtable. + ASSERT_OK(TryReopen(options)); + Flush(); + Close(); + // Now check keys in read only mode. 
+ ASSERT_OK(ReadOnlyReopen(options)); +} + +TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { + Options options; + options.create_if_missing = true; + options.write_buffer_size = 1024 * 1024 * 10; // 10 Mb + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + options.persist_stats_to_disk = true; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + CreateColumnFamilies({"pikachu"}, options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ColumnFamilyData* cfd_default = + static_cast(dbfull()->DefaultColumnFamily()) + ->cfd(); + ColumnFamilyData* cfd_stats = static_cast( + dbfull()->PersistentStatsColumnFamily()) + ->cfd(); + ColumnFamilyData* cfd_test = + static_cast(handles_[1])->cfd(); + + ASSERT_OK(Put("foo", "v0")); + ASSERT_OK(Put("bar", "v0")); + ASSERT_EQ("v0", Get("bar")); + ASSERT_EQ("v0", Get("foo")); + ASSERT_OK(Put(1, "Eevee", "v0")); + ASSERT_EQ("v0", Get(1, "Eevee")); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + // writing to all three cf, flush default cf + // LogNumbers: default: 14, stats: 4, pikachu: 4 + ASSERT_OK(Flush()); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber()); + ASSERT_LT(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber()); + + ASSERT_OK(Put("foo1", "v1")); + ASSERT_OK(Put("bar1", "v1")); + ASSERT_EQ("v1", Get("bar1")); + ASSERT_EQ("v1", Get("foo1")); + ASSERT_OK(Put(1, "Vaporeon", "v1")); + ASSERT_EQ("v1", Get(1, "Vaporeon")); + // writing to default and test cf, flush test cf + // LogNumbers: default: 14, stats: 16, pikachu: 16 + ASSERT_OK(Flush(1)); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber()); + ASSERT_GT(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber()); + + ASSERT_OK(Put("foo2", "v2")); + ASSERT_OK(Put("bar2", "v2")); + ASSERT_EQ("v2", Get("bar2")); + ASSERT_EQ("v2", Get("foo2")); + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(10); }); + // writing to default and stats cf, flushing default cf + // LogNumbers: default: 19, stats: 19, pikachu: 19 + ASSERT_OK(Flush()); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber()); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber()); + + ASSERT_OK(Put("foo3", "v3")); + ASSERT_OK(Put("bar3", "v3")); + ASSERT_EQ("v3", Get("bar3")); + ASSERT_EQ("v3", Get("foo3")); + ASSERT_OK(Put(1, "Jolteon", "v3")); + ASSERT_EQ("v3", Get(1, "Jolteon")); + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(15); }); + // writing to all three cf, flushing test cf + // LogNumbers: default: 19, stats: 19, pikachu: 22 + ASSERT_OK(Flush(1)); + ASSERT_LT(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber()); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber()); + Close(); +} + +#endif // !ROCKSDB_LITE +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/options/cf_options.cc b/options/cf_options.cc index f7af3f834c9..ef06eaf15bb 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -5,19 +5,15 @@ #include "options/cf_options.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include +#include #include #include #include "options/db_options.h" #include "port/port.h" +#include "rocksdb/concurrent_task_limiter.h" #include 
"rocksdb/env.h" #include "rocksdb/options.h" -#include "rocksdb/concurrent_task_limiter.h" namespace rocksdb { @@ -37,6 +33,8 @@ ImmutableCFOptions::ImmutableCFOptions(const ImmutableDBOptions& db_options, cf_options.min_write_buffer_number_to_merge), max_write_buffer_number_to_maintain( cf_options.max_write_buffer_number_to_maintain), + max_write_buffer_size_to_maintain( + cf_options.max_write_buffer_size_to_maintain), inplace_update_support(cf_options.inplace_update_support), inplace_callback(cf_options.inplace_callback), info_log(db_options.info_log.get()), @@ -169,8 +167,6 @@ void MutableCFOptions::Dump(Logger* log) const { target_file_size_multiplier); ROCKS_LOG_INFO(log, " max_bytes_for_level_base: %" PRIu64, max_bytes_for_level_base); - ROCKS_LOG_INFO(log, " snap_refresh_nanos: %" PRIu64, - snap_refresh_nanos); ROCKS_LOG_INFO(log, " max_bytes_for_level_multiplier: %f", max_bytes_for_level_multiplier); ROCKS_LOG_INFO(log, " ttl: %" PRIu64, diff --git a/options/cf_options.h b/options/cf_options.h index 47fca58fa7d..3a6be638167 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -20,7 +20,6 @@ namespace rocksdb { // of DB. Raw pointers defined in this struct do not have ownership to the data // they point to. Options contains std::shared_ptr to these data. struct ImmutableCFOptions { - ImmutableCFOptions(); explicit ImmutableCFOptions(const Options& options); ImmutableCFOptions(const ImmutableDBOptions& db_options, @@ -43,6 +42,8 @@ struct ImmutableCFOptions { int max_write_buffer_number_to_maintain; + int64_t max_write_buffer_size_to_maintain; + bool inplace_update_support; UpdateStatus (*inplace_callback)(char* existing_value, @@ -149,7 +150,6 @@ struct MutableCFOptions { target_file_size_base(options.target_file_size_base), target_file_size_multiplier(options.target_file_size_multiplier), max_bytes_for_level_base(options.max_bytes_for_level_base), - snap_refresh_nanos(options.snap_refresh_nanos), max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier), ttl(options.ttl), periodic_compaction_seconds(options.periodic_compaction_seconds), @@ -186,7 +186,6 @@ struct MutableCFOptions { target_file_size_base(0), target_file_size_multiplier(0), max_bytes_for_level_base(0), - snap_refresh_nanos(0), max_bytes_for_level_multiplier(0), ttl(0), periodic_compaction_seconds(0), @@ -238,7 +237,6 @@ struct MutableCFOptions { uint64_t target_file_size_base; int target_file_size_multiplier; uint64_t max_bytes_for_level_base; - uint64_t snap_refresh_nanos; double max_bytes_for_level_multiplier; uint64_t ttl; uint64_t periodic_compaction_seconds; diff --git a/options/db_options.cc b/options/db_options.cc index 83f1a18b042..ca2800d0784 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -5,18 +5,14 @@ #include "options/db_options.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/env.h" #include "rocksdb/sst_file_manager.h" #include "rocksdb/wal_filter.h" -#include "util/logging.h" namespace rocksdb { @@ -48,6 +44,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) table_cache_numshardbits(options.table_cache_numshardbits), wal_ttl_seconds(options.WAL_ttl_seconds), wal_size_limit_mb(options.WAL_size_limit_MB), + max_write_batch_group_size_bytes( + options.max_write_batch_group_size_bytes), manifest_preallocation_size(options.manifest_preallocation_size), allow_mmap_reads(options.allow_mmap_reads), 
allow_mmap_writes(options.allow_mmap_writes), @@ -67,6 +65,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) listeners(options.listeners), enable_thread_tracking(options.enable_thread_tracking), enable_pipelined_write(options.enable_pipelined_write), + unordered_write(options.unordered_write), allow_concurrent_memtable_write(options.allow_concurrent_memtable_write), enable_write_thread_adaptive_yield( options.enable_write_thread_adaptive_yield), @@ -87,7 +86,10 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) two_write_queues(options.two_write_queues), manual_wal_flush(options.manual_wal_flush), atomic_flush(options.atomic_flush), - avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io) { + avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io), + persist_stats_to_disk(options.persist_stats_to_disk), + write_dbid_to_manifest(options.write_dbid_to_manifest), + log_readahead_size(options.log_readahead_size) { } void ImmutableDBOptions::Dump(Logger* log) const { @@ -153,6 +155,10 @@ void ImmutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.WAL_size_limit_MB: %" PRIu64, wal_size_limit_mb); + ROCKS_LOG_HEADER(log, + " " + "Options.max_write_batch_group_size_bytes: %" PRIu64, + max_write_batch_group_size_bytes); ROCKS_LOG_HEADER( log, " Options.manifest_preallocation_size: %" ROCKSDB_PRIszt, manifest_preallocation_size); @@ -185,6 +191,8 @@ void ImmutableDBOptions::Dump(Logger* log) const { enable_thread_tracking); ROCKS_LOG_HEADER(log, " Options.enable_pipelined_write: %d", enable_pipelined_write); + ROCKS_LOG_HEADER(log, " Options.unordered_write: %d", + unordered_write); ROCKS_LOG_HEADER(log, " Options.allow_concurrent_memtable_write: %d", allow_concurrent_memtable_write); ROCKS_LOG_HEADER(log, " Options.enable_write_thread_adaptive_yield: %d", @@ -223,6 +231,13 @@ void ImmutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.avoid_unnecessary_blocking_io: %d", avoid_unnecessary_blocking_io); + ROCKS_LOG_HEADER(log, " Options.persist_stats_to_disk: %u", + persist_stats_to_disk); + ROCKS_LOG_HEADER(log, " Options.write_dbid_to_manifest: %d", + write_dbid_to_manifest); + ROCKS_LOG_HEADER( + log, " Options.log_readahead_size: %" ROCKSDB_PRIszt, + log_readahead_size); } MutableDBOptions::MutableDBOptions() diff --git a/options/db_options.h b/options/db_options.h index 8d02003623e..7c71b12a0cc 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -43,6 +43,7 @@ struct ImmutableDBOptions { int table_cache_numshardbits; uint64_t wal_ttl_seconds; uint64_t wal_size_limit_mb; + uint64_t max_write_batch_group_size_bytes; size_t manifest_preallocation_size; bool allow_mmap_reads; bool allow_mmap_writes; @@ -60,6 +61,7 @@ struct ImmutableDBOptions { std::vector> listeners; bool enable_thread_tracking; bool enable_pipelined_write; + bool unordered_write; bool allow_concurrent_memtable_write; bool enable_write_thread_adaptive_yield; uint64_t write_thread_max_yield_usec; @@ -80,6 +82,9 @@ struct ImmutableDBOptions { bool manual_wal_flush; bool atomic_flush; bool avoid_unnecessary_blocking_io; + bool persist_stats_to_disk; + bool write_dbid_to_manifest; + size_t log_readahead_size; }; struct MutableDBOptions { diff --git a/options/options.cc b/options/options.cc index 900510d01b6..db1cd7130bf 100644 --- a/options/options.cc +++ b/options/options.cc @@ -9,11 +9,7 @@ #include "rocksdb/options.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include 
#include #include "monitoring/statistics.h" @@ -31,7 +27,7 @@ #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "rocksdb/wal_filter.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "util/compression.h" namespace rocksdb { @@ -46,6 +42,8 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) options.min_write_buffer_number_to_merge), max_write_buffer_number_to_maintain( options.max_write_buffer_number_to_maintain), + max_write_buffer_size_to_maintain( + options.max_write_buffer_size_to_maintain), inplace_update_support(options.inplace_update_support), inplace_update_num_locks(options.inplace_update_num_locks), inplace_callback(options.inplace_callback), @@ -162,6 +160,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const { min_write_buffer_number_to_merge); ROCKS_LOG_HEADER(log, " Options.max_write_buffer_number_to_maintain: %d", max_write_buffer_number_to_maintain); + ROCKS_LOG_HEADER(log, + " Options.max_write_buffer_size_to_maintain: %" PRIu64, + max_write_buffer_size_to_maintain); ROCKS_LOG_HEADER( log, " Options.bottommost_compression_opts.window_bits: %d", bottommost_compression_opts.window_bits); @@ -215,9 +216,6 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER( log, " Options.max_bytes_for_level_base: %" PRIu64, max_bytes_for_level_base); - ROCKS_LOG_HEADER( - log, " Options.snap_refresh_nanos: %" PRIu64, - snap_refresh_nanos); ROCKS_LOG_HEADER(log, "Options.level_compaction_dynamic_level_bytes: %d", level_compaction_dynamic_level_bytes); ROCKS_LOG_HEADER(log, " Options.max_bytes_for_level_multiplier: %f", @@ -493,7 +491,6 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb( write_buffer_size = 2 << 20; target_file_size_base = 2 * 1048576; max_bytes_for_level_base = 10 * 1048576; - snap_refresh_nanos = 0; soft_pending_compaction_bytes_limit = 256 * 1048576; hard_pending_compaction_bytes_limit = 1073741824ul; @@ -506,7 +503,6 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb( BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; table_factory.reset(new BlockBasedTableFactory(table_options)); - return this; } @@ -552,7 +548,10 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeLevelStyleCompaction( if (i < 2) { compression_per_level[i] = kNoCompression; } else { - compression_per_level[i] = kSnappyCompression; + compression_per_level[i] = + LZ4_Supported() + ? kLZ4Compression + : (Snappy_Supported() ? 
kSnappyCompression : kNoCompression); } } return this; @@ -597,7 +596,8 @@ ReadOptions::ReadOptions() pin_data(false), background_purge_on_iterator_cleanup(false), ignore_range_deletions(false), - iter_start_seqnum(0) {} + iter_start_seqnum(0), + timestamp(nullptr) {} ReadOptions::ReadOptions(bool cksum, bool cache) : snapshot(nullptr), @@ -615,6 +615,7 @@ ReadOptions::ReadOptions(bool cksum, bool cache) pin_data(false), background_purge_on_iterator_cleanup(false), ignore_range_deletions(false), - iter_start_seqnum(0) {} + iter_start_seqnum(0), + timestamp(nullptr) {} } // namespace rocksdb diff --git a/options/options_helper.cc b/options/options_helper.cc index a973bbfde51..d30264e8234 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -9,6 +9,7 @@ #include #include #include + #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/convenience.h" @@ -20,8 +21,8 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/utilities/object_registry.h" -#include "table/block_based_table_factory.h" -#include "table/plain_table_factory.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "util/cast_util.h" #include "util/string_util.h" @@ -83,6 +84,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.stats_dump_period_sec = mutable_db_options.stats_dump_period_sec; options.stats_persist_period_sec = mutable_db_options.stats_persist_period_sec; + options.persist_stats_to_disk = immutable_db_options.persist_stats_to_disk; options.stats_history_buffer_size = mutable_db_options.stats_history_buffer_size; options.advise_random_on_open = immutable_db_options.advise_random_on_open; @@ -103,10 +105,13 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.enable_thread_tracking = immutable_db_options.enable_thread_tracking; options.delayed_write_rate = mutable_db_options.delayed_write_rate; options.enable_pipelined_write = immutable_db_options.enable_pipelined_write; + options.unordered_write = immutable_db_options.unordered_write; options.allow_concurrent_memtable_write = immutable_db_options.allow_concurrent_memtable_write; options.enable_write_thread_adaptive_yield = immutable_db_options.enable_write_thread_adaptive_yield; + options.max_write_batch_group_size_bytes = + immutable_db_options.max_write_batch_group_size_bytes; options.write_thread_max_yield_usec = immutable_db_options.write_thread_max_yield_usec; options.write_thread_slow_yield_usec = @@ -135,7 +140,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.atomic_flush = immutable_db_options.atomic_flush; options.avoid_unnecessary_blocking_io = immutable_db_options.avoid_unnecessary_blocking_io; - + options.log_readahead_size = immutable_db_options.log_readahead_size; return options; } @@ -177,7 +182,6 @@ ColumnFamilyOptions BuildColumnFamilyOptions( mutable_cf_options.target_file_size_multiplier; cf_opts.max_bytes_for_level_base = mutable_cf_options.max_bytes_for_level_base; - cf_opts.snap_refresh_nanos = mutable_cf_options.snap_refresh_nanos; cf_opts.max_bytes_for_level_multiplier = mutable_cf_options.max_bytes_for_level_multiplier; cf_opts.ttl = mutable_cf_options.ttl; @@ -254,7 +258,7 @@ const std::string kNameMergeOperator = "merge_operator"; template Status GetStringFromStruct( std::string* opt_string, const T& options, - const std::unordered_map type_info, + const std::unordered_map& type_info, const 
std::string& delimiter); namespace { @@ -349,7 +353,7 @@ bool FIFOCompactionOptionsSpecialCase(const std::string& opt_str, template bool SerializeStruct( const T& options, std::string* value, - std::unordered_map type_info_map) { + const std::unordered_map& type_info_map) { std::string opt_str; Status s = GetStringFromStruct(&opt_str, options, type_info_map, ";"); if (!s.ok()) { @@ -362,7 +366,7 @@ bool SerializeStruct( template bool ParseSingleStructOption( const std::string& opt_val_str, T* options, - std::unordered_map type_info_map) { + const std::unordered_map& type_info_map) { size_t end = opt_val_str.find('='); std::string key = opt_val_str.substr(0, end); std::string value = opt_val_str.substr(end + 1); @@ -371,6 +375,11 @@ bool ParseSingleStructOption( return false; } const auto& opt_info = iter->second; + if (opt_info.verification == OptionVerificationType::kDeprecated) { + // Should also skip deprecated sub-options such as + // fifo_compaction_options_type_info.ttl + return true; + } return ParseOptionHelper( reinterpret_cast(options) + opt_info.mutable_offset, opt_info.type, value); @@ -379,7 +388,7 @@ bool ParseSingleStructOption( template bool ParseStructOptions( const std::string& opt_str, T* options, - std::unordered_map type_info_map) { + const std::unordered_map& type_info_map) { assert(!opt_str.empty()); size_t start = 0; @@ -1037,21 +1046,21 @@ Status ParseColumnFamilyOption(const std::string& name, } else { if (name == kNameComparator) { // Try to get comparator from object registry first. - std::unique_ptr comp_guard; - const Comparator* comp = - NewCustomObject(value, &comp_guard); // Only support static comparator for now. - if (comp != nullptr && !comp_guard) { - new_options->comparator = comp; + Status status = ObjectRegistry::NewInstance()->NewStaticObject( + value, &new_options->comparator); + if (status.ok()) { + return status; } } else if (name == kNameMergeOperator) { // Try to get merge operator from object registry first. - std::unique_ptr> mo_guard; - std::shared_ptr* mo = - NewCustomObject>(value, &mo_guard); + std::shared_ptr mo; + Status status = + ObjectRegistry::NewInstance()->NewSharedObject( + value, &new_options->merge_operator); // Only support static comparator for now. - if (mo != nullptr) { - new_options->merge_operator = *mo; + if (status.ok()) { + return status; } } @@ -1091,7 +1100,7 @@ Status ParseColumnFamilyOption(const std::string& name, template bool SerializeSingleStructOption( std::string* opt_string, const T& options, - const std::unordered_map type_info, + const std::unordered_map& type_info, const std::string& name, const std::string& delimiter) { auto iter = type_info.find(name); if (iter == type_info.end()) { @@ -1111,7 +1120,7 @@ bool SerializeSingleStructOption( template Status GetStringFromStruct( std::string* opt_string, const T& options, - const std::unordered_map type_info, + const std::unordered_map& type_info, const std::string& delimiter) { assert(opt_string); opt_string->clear(); @@ -1183,10 +1192,10 @@ Status ParseDBOption(const std::string& name, NewGenericRateLimiter(static_cast(ParseUint64(value)))); } else if (name == kNameEnv) { // Currently `Env` can be deserialized from object registry only. - std::unique_ptr env_guard; - Env* env = NewCustomObject(value, &env_guard); + Env* env = new_options->env; + Status status = Env::LoadEnv(value, &env); // Only support static env for now. 
- if (env != nullptr && !env_guard) { + if (status.ok()) { new_options->env = env; } } else { @@ -1573,6 +1582,10 @@ std::unordered_map {offsetof(struct DBOptions, stats_persist_period_sec), OptionType::kUInt, OptionVerificationType::kNormal, true, offsetof(struct MutableDBOptions, stats_persist_period_sec)}}, + {"persist_stats_to_disk", + {offsetof(struct DBOptions, persist_stats_to_disk), + OptionType::kBoolean, OptionVerificationType::kNormal, false, + offsetof(struct ImmutableDBOptions, persist_stats_to_disk)}}, {"stats_history_buffer_size", {offsetof(struct DBOptions, stats_history_buffer_size), OptionType::kSizeT, OptionVerificationType::kNormal, true, @@ -1583,6 +1596,9 @@ std::unordered_map {"enable_pipelined_write", {offsetof(struct DBOptions, enable_pipelined_write), OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"unordered_write", + {offsetof(struct DBOptions, unordered_write), OptionType::kBoolean, + OptionVerificationType::kNormal, false, 0}}, {"allow_concurrent_memtable_write", {offsetof(struct DBOptions, allow_concurrent_memtable_write), OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, @@ -1596,6 +1612,9 @@ std::unordered_map {"write_thread_slow_yield_usec", {offsetof(struct DBOptions, write_thread_slow_yield_usec), OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, + {"max_write_batch_group_size_bytes", + {offsetof(struct DBOptions, max_write_batch_group_size_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, {"write_thread_max_yield_usec", {offsetof(struct DBOptions, write_thread_max_yield_usec), OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, @@ -1649,6 +1668,12 @@ std::unordered_map {offsetof(struct DBOptions, avoid_unnecessary_blocking_io), OptionType::kBoolean, OptionVerificationType::kNormal, false, offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io)}}, + {"write_dbid_to_manifest", + {offsetof(struct DBOptions, write_dbid_to_manifest), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"log_readahead_size", + {offsetof(struct DBOptions, log_readahead_size), OptionType::kSizeT, + OptionVerificationType::kNormal, false, 0}}, }; std::unordered_map @@ -1656,7 +1681,9 @@ std::unordered_map {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch}, {"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch}, {"kTwoLevelIndexSearch", - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}}; + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}, + {"kBinarySearchWithFirstKey", + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}}; std::unordered_map OptionsHelper::block_base_table_data_block_index_type_string_map = { @@ -1853,6 +1880,9 @@ std::unordered_map {"max_write_buffer_number_to_maintain", {offset_of(&ColumnFamilyOptions::max_write_buffer_number_to_maintain), OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"max_write_buffer_size_to_maintain", + {offset_of(&ColumnFamilyOptions::max_write_buffer_size_to_maintain), + OptionType::kInt64T, OptionVerificationType::kNormal, false, 0}}, {"min_write_buffer_number_to_merge", {offset_of(&ColumnFamilyOptions::min_write_buffer_number_to_merge), OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, @@ -1912,9 +1942,8 @@ std::unordered_map OptionType::kUInt64T, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, max_bytes_for_level_base)}}, {"snap_refresh_nanos", - {offset_of(&ColumnFamilyOptions::snap_refresh_nanos), 
- OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, snap_refresh_nanos)}}, + {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, true, + 0}}, {"max_bytes_for_level_multiplier", {offset_of(&ColumnFamilyOptions::max_bytes_for_level_multiplier), OptionType::kDouble, OptionVerificationType::kNormal, true, diff --git a/options/options_parser.cc b/options/options_parser.cc index f09e53e4a49..6d38f019265 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -13,13 +13,14 @@ #include #include +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" #include "options/options_helper.h" #include "rocksdb/convenience.h" #include "rocksdb/db.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" -#include "util/file_reader_writer.h" #include "util/string_util.h" -#include "util/sync_point.h" #include "port/port.h" diff --git a/options/options_parser.h b/options/options_parser.h index 5aab3e7e9b6..b2a806f179f 100644 --- a/options/options_parser.h +++ b/options/options_parser.h @@ -12,7 +12,7 @@ #include "options/options_sanity_check.h" #include "rocksdb/env.h" #include "rocksdb/options.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" namespace rocksdb { diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 2d6cc11c02e..bc2e088a6e4 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -7,15 +7,11 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include #include "options/options_helper.h" #include "rocksdb/convenience.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #ifndef GFLAGS bool FLAGS_enable_print = false; @@ -233,6 +229,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "delete_obsolete_files_period_micros=4294967758;" "WAL_ttl_seconds=4295008036;" "WAL_size_limit_MB=4295036161;" + "max_write_batch_group_size_bytes=1048576;" "wal_dir=path/to/wal_dir;" "db_write_buffer_size=2587;" "max_subcompactions=64330;" @@ -269,6 +266,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "allow_mmap_writes=false;" "stats_dump_period_sec=70127;" "stats_persist_period_sec=54321;" + "persist_stats_to_disk=true;" "stats_history_buffer_size=14159;" "allow_fallocate=true;" "allow_mmap_reads=false;" @@ -279,6 +277,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "advise_random_on_open=true;" "fail_if_options_file_error=false;" "enable_pipelined_write=false;" + "unordered_write=false;" "allow_concurrent_memtable_write=true;" "wal_recovery_mode=kPointInTimeRecovery;" "enable_write_thread_adaptive_yield=true;" @@ -297,7 +296,9 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "manual_wal_flush=false;" "seq_per_batch=false;" "atomic_flush=false;" - "avoid_unnecessary_blocking_io=false", + "avoid_unnecessary_blocking_io=false;" + "log_readahead_size=0;" + "write_dbid_to_manifest=false", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions), @@ -352,10 +353,10 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { sizeof(std::shared_ptr)}, {offset_of(&ColumnFamilyOptions::prefix_extractor), sizeof(std::shared_ptr)}, + {offset_of(&ColumnFamilyOptions::snap_refresh_nanos), sizeof(uint64_t)}, 
{offset_of(&ColumnFamilyOptions::table_factory), sizeof(std::shared_ptr)}, - {offset_of(&ColumnFamilyOptions::cf_paths), - sizeof(std::vector)}, + {offset_of(&ColumnFamilyOptions::cf_paths), sizeof(std::vector)}, {offset_of(&ColumnFamilyOptions::compaction_thread_limiter), sizeof(std::shared_ptr)}, }; @@ -415,7 +416,6 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "kBZip2Compression:kNoCompression:kZlibCompression:kBZip2Compression:" "kSnappyCompression;" "max_bytes_for_level_base=986;" - "snap_refresh_nanos=1000000000;" "bloom_locality=8016;" "target_file_size_base=4294976376;" "memtable_huge_page_size=2557;" @@ -439,6 +439,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "soft_rate_limit=530.615385;" "soft_pending_compaction_bytes_limit=0;" "max_write_buffer_number_to_maintain=84;" + "max_write_buffer_size_to_maintain=2147483648;" "merge_operator=aabcxehazrMergeOperator;" "memtable_prefix_bloom_size_ratio=0.4642;" "memtable_whole_key_filtering=true;" diff --git a/options/options_test.cc b/options/options_test.cc index ded336dd18d..d1c9db039d4 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -7,14 +7,10 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include +#include #include #include -#include #include "cache/lru_cache.h" #include "cache/sharded_cache.h" @@ -27,11 +23,12 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/utilities/leveldb_options.h" #include "rocksdb/utilities/object_registry.h" +#include "table/block_based/filter_policy_internal.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/random.h" #include "util/stderr_logger.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators/bytesxor.h" #ifndef GFLAGS @@ -53,6 +50,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"max_write_buffer_number", "2"}, {"min_write_buffer_number_to_merge", "3"}, {"max_write_buffer_number_to_maintain", "99"}, + {"max_write_buffer_size_to_maintain", "-99999"}, {"compression", "kSnappyCompression"}, {"compression_per_level", "kNoCompression:" @@ -74,7 +72,6 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"target_file_size_base", "12"}, {"target_file_size_multiplier", "13"}, {"max_bytes_for_level_base", "14"}, - {"snap_refresh_nanos", "1000000000"}, {"level_compaction_dynamic_level_bytes", "true"}, {"max_bytes_for_level_multiplier", "15.0"}, {"max_bytes_for_level_multiplier_additional", "16:17:18"}, @@ -133,6 +130,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"skip_log_error_on_recovery", "false"}, {"stats_dump_period_sec", "46"}, {"stats_persist_period_sec", "57"}, + {"persist_stats_to_disk", "false"}, {"stats_history_buffer_size", "69"}, {"advise_random_on_open", "true"}, {"use_adaptive_mutex", "false"}, @@ -153,6 +151,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2); ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3); ASSERT_EQ(new_cf_opt.max_write_buffer_number_to_maintain, 99); + ASSERT_EQ(new_cf_opt.max_write_buffer_size_to_maintain, -99999); ASSERT_EQ(new_cf_opt.compression, kSnappyCompression); ASSERT_EQ(new_cf_opt.compression_per_level.size(), 9U); ASSERT_EQ(new_cf_opt.compression_per_level[0], kNoCompression); @@ -167,15 +166,15 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { 
ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 4); ASSERT_EQ(new_cf_opt.compression_opts.level, 5); ASSERT_EQ(new_cf_opt.compression_opts.strategy, 6); - ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, 7); - ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, 8); + ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, 7u); + ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, 8u); ASSERT_EQ(new_cf_opt.compression_opts.enabled, true); ASSERT_EQ(new_cf_opt.bottommost_compression, kLZ4Compression); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits, 5); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 6); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 7); - ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, 8); - ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 9); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, 8u); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 9u); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, true); ASSERT_EQ(new_cf_opt.num_levels, 8); ASSERT_EQ(new_cf_opt.level0_file_num_compaction_trigger, 8); @@ -184,7 +183,6 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.target_file_size_base, static_cast(12)); ASSERT_EQ(new_cf_opt.target_file_size_multiplier, 13); ASSERT_EQ(new_cf_opt.max_bytes_for_level_base, 14U); - ASSERT_EQ(new_cf_opt.snap_refresh_nanos, 1000000000U); ASSERT_EQ(new_cf_opt.level_compaction_dynamic_level_bytes, true); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier, 15.0); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional.size(), 3U); @@ -271,6 +269,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_db_opt.skip_log_error_on_recovery, false); ASSERT_EQ(new_db_opt.stats_dump_period_sec, 46U); ASSERT_EQ(new_db_opt.stats_persist_period_sec, 57U); + ASSERT_EQ(new_db_opt.persist_stats_to_disk, false); ASSERT_EQ(new_db_opt.stats_history_buffer_size, 69U); ASSERT_EQ(new_db_opt.advise_random_on_open, true); ASSERT_EQ(new_db_opt.use_adaptive_mutex, false); @@ -343,11 +342,11 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { // Comparator from object registry std::string kCompName = "reverse_comp"; - static Registrar test_reg_a( - kCompName, [](const std::string& /*name*/, - std::unique_ptr* /*comparator_guard*/) { - return ReverseBytewiseComparator(); - }); + ObjectLibrary::Default()->Register( + kCompName, + [](const std::string& /*name*/, + std::unique_ptr* /*guard*/, + std::string* /* errmsg */) { return ReverseBytewiseComparator(); }); ASSERT_OK(GetColumnFamilyOptionsFromString( base_cf_opt, "comparator=" + kCompName + ";", &new_cf_opt)); @@ -356,13 +355,12 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { // MergeOperator from object registry std::unique_ptr bxo(new BytesXOROperator()); std::string kMoName = bxo->Name(); - static Registrar> test_reg_b( - kMoName, [](const std::string& /*name*/, - std::unique_ptr>* - merge_operator_guard) { - merge_operator_guard->reset( - new std::shared_ptr(new BytesXOROperator())); - return merge_operator_guard->get(); + ObjectLibrary::Default()->Register( + kMoName, + [](const std::string& /*name*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new BytesXOROperator()); + return guard->get(); }); ASSERT_OK(GetColumnFamilyOptionsFromString( @@ -384,10 +382,10 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { "write_buffer_size=13; =100;", &new_cf_opt)); 
ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); - const int64_t kilo = 1024UL; - const int64_t mega = 1024 * kilo; - const int64_t giga = 1024 * mega; - const int64_t tera = 1024 * giga; + const uint64_t kilo = 1024UL; + const uint64_t mega = 1024 * kilo; + const uint64_t giga = 1024 * mega; + const uint64_t tera = 1024 * giga; // Units (k) ASSERT_OK(GetColumnFamilyOptionsFromString( @@ -398,7 +396,7 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { "max_write_buffer_number=16m;inplace_update_num_locks=17M", &new_cf_opt)); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16 * mega); - ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17 * mega); + ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17u * mega); // Units (g) ASSERT_OK(GetColumnFamilyOptionsFromString( base_cf_opt, @@ -518,13 +516,15 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { BlockBasedTableOptions table_opt; BlockBasedTableOptions new_opt; // make sure default values are overwritten by something else - ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, - "cache_index_and_filter_blocks=1;index_type=kHashSearch;" - "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;" - "block_cache=1M;block_cache_compressed=1k;block_size=1024;" - "block_size_deviation=8;block_restart_interval=4;" - "filter_policy=bloomfilter:4:true;whole_key_filtering=1;", - &new_opt)); + ASSERT_OK(GetBlockBasedTableOptionsFromString( + table_opt, + "cache_index_and_filter_blocks=1;index_type=kHashSearch;" + "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;" + "block_cache=1M;block_cache_compressed=1k;block_size=1024;" + "block_size_deviation=8;block_restart_interval=4;" + "format_version=5;whole_key_filtering=1;" + "filter_policy=bloomfilter:4.567:false;", + &new_opt)); ASSERT_TRUE(new_opt.cache_index_and_filter_blocks); ASSERT_EQ(new_opt.index_type, BlockBasedTableOptions::kHashSearch); ASSERT_EQ(new_opt.checksum, ChecksumType::kxxHash); @@ -537,14 +537,20 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(new_opt.block_size, 1024UL); ASSERT_EQ(new_opt.block_size_deviation, 8); ASSERT_EQ(new_opt.block_restart_interval, 4); + ASSERT_EQ(new_opt.format_version, 5U); + ASSERT_EQ(new_opt.whole_key_filtering, true); ASSERT_TRUE(new_opt.filter_policy != nullptr); + const BloomFilterPolicy& bfp = + dynamic_cast(*new_opt.filter_policy); + EXPECT_EQ(bfp.GetMillibitsPerKey(), 4567); + EXPECT_EQ(bfp.GetWholeBitsPerKey(), 5); // unknown option ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, "cache_index_and_filter_blocks=1;index_type=kBinarySearch;" "bad_option=1", &new_opt)); - ASSERT_EQ(table_opt.cache_index_and_filter_blocks, + ASSERT_EQ(static_cast(table_opt.cache_index_and_filter_blocks), new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.index_type, new_opt.index_type); @@ -619,8 +625,9 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { new_opt.block_cache)->GetNumShardBits(), GetDefaultCacheShardBits(new_opt.block_cache->GetCapacity())); ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache)->GetHighPriPoolRatio(), 0.0); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetHighPriPoolRatio(), + 0.5); ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 2*1024UL*1024UL); // Default values @@ -629,16 +636,17 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { 
GetDefaultCacheShardBits( new_opt.block_cache_compressed->GetCapacity())); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetHighPriPoolRatio(), - 0.0); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.5); // Set couple of block cache options. - ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, - "block_cache={num_shard_bits=5;high_pri_pool_ratio=0.5;};" - "block_cache_compressed={num_shard_bits=5;" - "high_pri_pool_ratio=0.5;}", - &new_opt)); + ASSERT_OK(GetBlockBasedTableOptionsFromString( + table_opt, + "block_cache={num_shard_bits=5;high_pri_pool_ratio=0.5;};" + "block_cache_compressed={num_shard_bits=5;" + "high_pri_pool_ratio=0.0;}", + &new_opt)); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 0); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache)->GetNumShardBits(), 5); @@ -650,9 +658,9 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache_compressed)->GetNumShardBits(), 5); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetHighPriPoolRatio(), - 0.5); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.0); // Set couple of block cache options. ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, @@ -666,16 +674,17 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache)->GetNumShardBits(), 4); ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache)->GetHighPriPoolRatio(), 0.0); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetHighPriPoolRatio(), + 0.5); ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache_compressed)->GetNumShardBits(), 4); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetHighPriPoolRatio(), - 0.0); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.5); } #endif // !ROCKSDB_LITE @@ -690,7 +699,7 @@ TEST_F(OptionsTest, GetPlainTableOptionsFromString) { "index_sparseness=8;huge_page_tlb_size=4;encoding_type=kPrefix;" "full_scan_mode=true;store_index_in_file=true", &new_opt)); - ASSERT_EQ(new_opt.user_key_len, 66); + ASSERT_EQ(new_opt.user_key_len, 66u); ASSERT_EQ(new_opt.bloom_bits_per_key, 20); ASSERT_EQ(new_opt.hash_table_ratio, 0.5); ASSERT_EQ(new_opt.index_sparseness, 8); @@ -769,9 +778,10 @@ TEST_F(OptionsTest, GetOptionsFromStringTest) { explicit CustomEnv(Env* _target) : EnvWrapper(_target) {} }; - static Registrar test_reg_env( + ObjectLibrary::Default()->Register( kCustomEnvName, - [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/) { + [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/, + std::string* /* errmsg */) { static CustomEnv env(Env::Default()); return &env; }); @@ -789,15 +799,15 @@ TEST_F(OptionsTest, GetOptionsFromStringTest) { ASSERT_EQ(new_options.compression_opts.window_bits, 4); ASSERT_EQ(new_options.compression_opts.level, 5); ASSERT_EQ(new_options.compression_opts.strategy, 6); - 
ASSERT_EQ(new_options.compression_opts.max_dict_bytes, 0); - ASSERT_EQ(new_options.compression_opts.zstd_max_train_bytes, 0); + ASSERT_EQ(new_options.compression_opts.max_dict_bytes, 0u); + ASSERT_EQ(new_options.compression_opts.zstd_max_train_bytes, 0u); ASSERT_EQ(new_options.compression_opts.enabled, false); ASSERT_EQ(new_options.bottommost_compression, kDisableCompressionOption); ASSERT_EQ(new_options.bottommost_compression_opts.window_bits, 5); ASSERT_EQ(new_options.bottommost_compression_opts.level, 6); ASSERT_EQ(new_options.bottommost_compression_opts.strategy, 7); - ASSERT_EQ(new_options.bottommost_compression_opts.max_dict_bytes, 0); - ASSERT_EQ(new_options.bottommost_compression_opts.zstd_max_train_bytes, 0); + ASSERT_EQ(new_options.bottommost_compression_opts.max_dict_bytes, 0u); + ASSERT_EQ(new_options.bottommost_compression_opts.zstd_max_train_bytes, 0u); ASSERT_EQ(new_options.bottommost_compression_opts.enabled, false); ASSERT_EQ(new_options.write_buffer_size, 10U); ASSERT_EQ(new_options.max_write_buffer_number, 16); @@ -812,8 +822,9 @@ TEST_F(OptionsTest, GetOptionsFromStringTest) { ASSERT_EQ(new_options.create_if_missing, true); ASSERT_EQ(new_options.max_open_files, 1); ASSERT_TRUE(new_options.rate_limiter.get() != nullptr); - std::unique_ptr env_guard; - ASSERT_EQ(NewCustomObject(kCustomEnvName, &env_guard), new_options.env); + Env* newEnv = new_options.env; + ASSERT_OK(Env::LoadEnv(kCustomEnvName, &newEnv)); + ASSERT_EQ(newEnv, new_options.env); } TEST_F(OptionsTest, DBOptionsSerialization) { @@ -842,7 +853,7 @@ TEST_F(OptionsTest, OptionsComposeDecompose) { Random rnd(301); test::RandomInitDBOptions(&base_db_opts, &rnd); - test::RandomInitCFOptions(&base_cf_opts, &rnd); + test::RandomInitCFOptions(&base_cf_opts, base_db_opts, &rnd); Options base_opts(base_db_opts, base_cf_opts); DBOptions new_db_opts(base_opts); @@ -854,11 +865,12 @@ TEST_F(OptionsTest, OptionsComposeDecompose) { } TEST_F(OptionsTest, ColumnFamilyOptionsSerialization) { + Options options; ColumnFamilyOptions base_opt, new_opt; Random rnd(302); // Phase 1: randomly assign base_opt // custom type options - test::RandomInitCFOptions(&base_opt, &rnd); + test::RandomInitCFOptions(&base_opt, options, &rnd); // Phase 2: obtain a string from base_opt std::string base_options_file_content; @@ -1521,7 +1533,7 @@ TEST_F(OptionsParserTest, DumpAndParse) { for (int c = 0; c < num_cf; ++c) { ColumnFamilyOptions cf_opt; Random cf_rnd(0xFB + c); - test::RandomInitCFOptions(&cf_opt, &cf_rnd); + test::RandomInitCFOptions(&cf_opt, base_db_opt, &cf_rnd); if (c < 4) { cf_opt.prefix_extractor.reset(test::RandomSliceTransform(&rnd, c)); } @@ -1869,9 +1881,9 @@ TEST_F(OptionsParserTest, IntegerParsing) { ASSERT_EQ(ParseUint64("18446744073709551615"), 18446744073709551615U); ASSERT_EQ(ParseUint32("4294967295"), 4294967295U); ASSERT_EQ(ParseSizeT("18446744073709551615"), 18446744073709551615U); - ASSERT_EQ(ParseInt64("9223372036854775807"), 9223372036854775807U); + ASSERT_EQ(ParseInt64("9223372036854775807"), 9223372036854775807); ASSERT_EQ(ParseInt64("-9223372036854775808"), port::kMinInt64); - ASSERT_EQ(ParseInt32("2147483647"), 2147483647U); + ASSERT_EQ(ParseInt32("2147483647"), 2147483647); ASSERT_EQ(ParseInt32("-2147483648"), port::kMinInt32); ASSERT_EQ(ParseInt("-32767"), -32767); ASSERT_EQ(ParseDouble("-1.234567"), -1.234567); diff --git a/port/jemalloc_helper.h b/port/jemalloc_helper.h index 0c216face13..f6f72f8cb8c 100644 --- a/port/jemalloc_helper.h +++ b/port/jemalloc_helper.h @@ -5,10 +5,24 @@ #pragma once +#if 
defined(__clang__) +// glibc's `posix_memalign()` declaration specifies `throw()` while clang's +// declaration does not. There is a hack in clang to make its re-declaration +// compatible with glibc's if they are declared consecutively. That hack breaks +// if yet another `posix_memalign()` declaration comes between glibc's and +// clang's declarations. Including "mm_malloc.h" here ensures glibc's and clang's +// declarations both come before "jemalloc.h"'s `posix_memalign()` declaration. +// +// This problem could also be avoided if "jemalloc.h"'s `posix_memalign()` +// declaration did not specify `throw()` when built with clang. +#include <mm_malloc.h> +#endif + #ifdef ROCKSDB_JEMALLOC #ifdef __FreeBSD__ #include <malloc_np.h> #else +#define JEMALLOC_MANGLE #include <jemalloc/jemalloc.h> #endif @@ -16,6 +30,14 @@ #define JEMALLOC_CXX_THROW #endif +#if defined(OS_WIN) && defined(_MSC_VER) + +// MSVC does not have weak symbol support. As long as ROCKSDB_JEMALLOC is +// defined, Jemalloc memory allocator is used. +static inline bool HasJemalloc() { return true; } + +#else + // Declare non-standard jemalloc APIs as weak symbols. We can null-check these // symbols to detect whether jemalloc is linked with the binary. extern "C" void* mallocx(size_t, int) __attribute__((__weak__)); @@ -50,4 +72,6 @@ static inline bool HasJemalloc() { malloc_stats_print != nullptr && malloc_usable_size != nullptr; } +#endif + #endif // ROCKSDB_JEMALLOC diff --git a/util/filter_policy.cc b/port/malloc.h similarity index 63% rename from util/filter_policy.cc rename to port/malloc.h index efb9bf4763c..f973263e2ae 100644 --- a/util/filter_policy.cc +++ b/port/malloc.h @@ -3,14 +3,15 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once -#include "rocksdb/filter_policy.h" - -namespace rocksdb { - -FilterPolicy::~FilterPolicy() { } - -} // namespace rocksdb +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include +#else +#include +#endif // OS_FREEBSD +#endif // ROCKSDB_MALLOC_USABLE_SIZE diff --git a/port/port_posix.cc b/port/port_posix.cc index 80081e480e0..167159d83c8 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -18,11 +18,11 @@ #include #include #include -#include #include +#include #include #include -#include "util/logging.h" +#include "logging/logging.h" namespace rocksdb { @@ -192,7 +192,8 @@ int GetMaxOpenFiles() { return -1; } // protect against overflow - if (no_files_limit.rlim_cur >= std::numeric_limits::max()) { + if (static_cast(no_files_limit.rlim_cur) >= + static_cast(std::numeric_limits::max())) { return std::numeric_limits::max(); } return static_cast(no_files_limit.rlim_cur); diff --git a/port/port_posix.h b/port/port_posix.h index 63d7239fe6d..892089831ff 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -104,6 +104,10 @@ class CondVar; class Mutex { public: explicit Mutex(bool adaptive = kDefaultToAdaptiveMutex); + // No copying + Mutex(const Mutex&) = delete; + void operator=(const Mutex&) = delete; + ~Mutex(); void Lock(); @@ -118,15 +122,15 @@ class Mutex { #ifndef NDEBUG bool locked_; #endif - - // No copying - Mutex(const Mutex&); - void operator=(const Mutex&); }; class RWMutex { public: RWMutex(); + // No copying allowed + RWMutex(const RWMutex&) = delete; + void operator=(const RWMutex&) = delete; + ~RWMutex(); void ReadLock(); @@ -137,10 +141,6 @@ class RWMutex { private: pthread_rwlock_t mu_; // the underlying platform mutex - - // No copying allowed - RWMutex(const RWMutex&); - void operator=(const RWMutex&); }; class CondVar { @@ -178,22 +178,31 @@ typedef pthread_once_t OnceType; extern void InitOnce(OnceType* once, void (*initializer)()); #ifndef CACHE_LINE_SIZE - #if defined(__s390__) - #define CACHE_LINE_SIZE 256U - #elif defined(__powerpc__) || defined(__aarch64__) - #define CACHE_LINE_SIZE 128U - #else - #define CACHE_LINE_SIZE 64U - #endif +// To test behavior with non-native cache line size, e.g. for +// Bloom filters, set TEST_CACHE_LINE_SIZE to the desired test size. +// This disables ALIGN_AS to keep it from failing compilation. +#ifdef TEST_CACHE_LINE_SIZE +#define CACHE_LINE_SIZE TEST_CACHE_LINE_SIZE +#define ALIGN_AS(n) /*empty*/ +#else +#if defined(__s390__) +#define CACHE_LINE_SIZE 256U +#elif defined(__powerpc__) || defined(__aarch64__) +#define CACHE_LINE_SIZE 128U +#else +#define CACHE_LINE_SIZE 64U +#endif +#define ALIGN_AS(n) alignas(n) +#endif #endif +static_assert((CACHE_LINE_SIZE & (CACHE_LINE_SIZE - 1)) == 0, + "Cache line size must be a power of 2 number of bytes"); extern void *cacheline_aligned_alloc(size_t size); extern void cacheline_aligned_free(void *memblock); -#define ALIGN_AS(n) alignas(n) - #define PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality) extern void Crash(const std::string& srcfile, int srcline); diff --git a/port/util_logger.h b/port/util_logger.h index ba424705b27..d2d62a9879c 100644 --- a/port/util_logger.h +++ b/port/util_logger.h @@ -14,7 +14,7 @@ // of what the new port_.h file must provide. 
#if defined(ROCKSDB_PLATFORM_POSIX) -#include "env/posix_logger.h" +#include "logging/posix_logger.h" #elif defined(OS_WIN) #include "port/win/win_logger.h" #endif diff --git a/port/win/env_default.cc b/port/win/env_default.cc index d24c21918aa..584a524cf86 100644 --- a/port/win/env_default.cc +++ b/port/win/env_default.cc @@ -11,8 +11,8 @@ #include #include "port/win/env_win.h" +#include "test_util/sync_point.h" #include "util/compression_context_cache.h" -#include "util/sync_point.h" #include "util/thread_local.h" namespace rocksdb { diff --git a/port/win/env_win.cc b/port/win/env_win.cc index 9abb14d67ea..c12d0ee4fc2 100644 --- a/port/win/env_win.cc +++ b/port/win/env_win.cc @@ -979,8 +979,8 @@ uint64_t WinEnvIO::NowMicros() { return li.QuadPart; } using namespace std::chrono; - return duration_cast( - high_resolution_clock::now().time_since_epoch()).count(); + return duration_cast(system_clock::now().time_since_epoch()) + .count(); } uint64_t WinEnvIO::NowNanos() { diff --git a/port/win/io_win.cc b/port/win/io_win.cc index 64ded8465d0..c433e5e522f 100644 --- a/port/win/io_win.cc +++ b/port/win/io_win.cc @@ -10,9 +10,9 @@ #include "port/win/io_win.h" #include "monitoring/iostats_context_imp.h" +#include "test_util/sync_point.h" #include "util/aligned_buffer.h" #include "util/coding.h" -#include "util/sync_point.h" namespace rocksdb { namespace port { @@ -175,60 +175,18 @@ Status ftruncate(const std::string& filename, HANDLE hFile, return status; } -size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size) { - - if (max_size < kMaxVarint64Length * 3) { - return 0; - } -#if (_WIN32_WINNT == _WIN32_WINNT_VISTA) - // MINGGW as defined by CMake file. - // yuslepukhin: I hate the guts of the above macros. - // This impl does not guarantee uniqueness everywhere - // is reasonably good - BY_HANDLE_FILE_INFORMATION FileInfo; - - BOOL result = GetFileInformationByHandle(hFile, &FileInfo); - - TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result); - - if (!result) { - return 0; - } - - char* rid = id; - rid = EncodeVarint64(rid, uint64_t(FileInfo.dwVolumeSerialNumber)); - rid = EncodeVarint64(rid, uint64_t(FileInfo.nFileIndexHigh)); - rid = EncodeVarint64(rid, uint64_t(FileInfo.nFileIndexLow)); - - assert(rid >= id); - return static_cast(rid - id); -#else - FILE_ID_INFO FileInfo; - BOOL result = GetFileInformationByHandleEx(hFile, FileIdInfo, &FileInfo, - sizeof(FileInfo)); - - TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result); - - if (!result) { - return 0; - } - - static_assert(sizeof(uint64_t) == sizeof(FileInfo.VolumeSerialNumber), - "Wrong sizeof expectations"); - // FileId.Identifier is an array of 16 BYTEs, we encode them as two uint64_t - static_assert(sizeof(uint64_t) * 2 == sizeof(FileInfo.FileId.Identifier), - "Wrong sizeof expectations"); - - char* rid = id; - rid = EncodeVarint64(rid, uint64_t(FileInfo.VolumeSerialNumber)); - uint64_t* file_id = reinterpret_cast(&FileInfo.FileId.Identifier[0]); - rid = EncodeVarint64(rid, *file_id); - ++file_id; - rid = EncodeVarint64(rid, *file_id); - - assert(rid >= id); - return static_cast(rid - id); -#endif +size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/, + size_t /*max_size*/) { + // Returning 0 is safe as it causes the table reader to generate a unique ID. + // This is suboptimal for performance as it prevents multiple table readers + // for the same file from sharing cached blocks. 
For example, if users have + // a low value for `max_open_files`, there can be many table readers opened + // for the same file. + // + // TODO: this is a temporary solution as it is safe but not optimal for + // performance. For more details see discussion in + // https://github.com/facebook/rocksdb/pull/5844. + return 0; } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1067,7 +1025,12 @@ Status WinRandomRWFile::Close() { ////////////////////////////////////////////////////////////////////////// /// WinMemoryMappedBuffer WinMemoryMappedBuffer::~WinMemoryMappedBuffer() { - BOOL ret = FALSE; + BOOL ret +#if defined(_MSC_VER) + = FALSE; +#else + __attribute__((__unused__)); +#endif if (base_ != nullptr) { ret = ::UnmapViewOfFile(base_); assert(ret); diff --git a/port/win/port_win.cc b/port/win/port_win.cc index 03ba6ef4281..31e65e78cde 100644 --- a/port/win/port_win.cc +++ b/port/win/port_win.cc @@ -33,7 +33,7 @@ #include #endif -#include "util/logging.h" +#include "logging/logging.h" namespace rocksdb { diff --git a/port/win/port_win.h b/port/win/port_win.h index de41cdc7f01..1b302b3d211 100644 --- a/port/win/port_win.h +++ b/port/win/port_win.h @@ -180,6 +180,9 @@ class Mutex { class RWMutex { public: RWMutex() { InitializeSRWLock(&srwLock_); } + // No copying allowed + RWMutex(const RWMutex&) = delete; + void operator=(const RWMutex&) = delete; void ReadLock() { AcquireSRWLockShared(&srwLock_); } @@ -194,9 +197,6 @@ class RWMutex { private: SRWLOCK srwLock_; - // No copying allowed - RWMutex(const RWMutex&); - void operator=(const RWMutex&); }; class CondVar { diff --git a/port/win/win_jemalloc.cc b/port/win/win_jemalloc.cc index 3268a56affd..b2077938806 100644 --- a/port/win/win_jemalloc.cc +++ b/port/win/win_jemalloc.cc @@ -1,7 +1,7 @@ // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be diff --git a/port/win/win_thread.cc b/port/win/win_thread.cc index 9a976e2c6b8..34e54f30171 100644 --- a/port/win/win_thread.cc +++ b/port/win/win_thread.cc @@ -138,7 +138,12 @@ void WindowsThread::join() { "WaitForSingleObjectFailed: thread join"); } - BOOL rc; + BOOL rc +#if defined(_MSC_VER) + = FALSE; +#else + __attribute__((__unused__)); +#endif rc = CloseHandle(reinterpret_cast(data_->handle_)); assert(rc != 0); data_->handle_ = 0; diff --git a/src.mk b/src.mk index 55b4e3427c6..19f03123730 100644 --- a/src.mk +++ b/src.mk @@ -3,27 +3,29 @@ LIB_SOURCES = \ cache/clock_cache.cc \ cache/lru_cache.cc \ cache/sharded_cache.cc \ + db/arena_wrapped_db_iter.cc \ db/builder.cc \ db/c.cc \ db/column_family.cc \ db/compacted_db_impl.cc \ - db/compaction.cc \ - db/compaction_iterator.cc \ - db/compaction_job.cc \ - db/compaction_picker.cc \ - db/compaction_picker_fifo.cc \ - db/compaction_picker_universal.cc \ + db/compaction/compaction.cc \ + db/compaction/compaction_iterator.cc \ + db/compaction/compaction_job.cc \ + db/compaction/compaction_picker.cc \ + db/compaction/compaction_picker_fifo.cc \ + db/compaction/compaction_picker_level.cc \ + db/compaction/compaction_picker_universal.cc \ db/convenience.cc \ db/db_filesnapshot.cc \ - db/db_impl.cc \ - db/db_impl_compaction_flush.cc \ - db/db_impl_debug.cc \ - db/db_impl_experimental.cc \ - db/db_impl_files.cc \ - db/db_impl_open.cc \ - db/db_impl_readonly.cc \ - db/db_impl_secondary.cc \ - db/db_impl_write.cc \ + db/db_impl/db_impl.cc \ + db/db_impl/db_impl_compaction_flush.cc \ + db/db_impl/db_impl_debug.cc \ + db/db_impl/db_impl_experimental.cc \ + db/db_impl/db_impl_files.cc \ + db/db_impl/db_impl_open.cc \ + db/db_impl/db_impl_readonly.cc \ + db/db_impl/db_impl_secondary.cc \ + db/db_impl/db_impl_write.cc \ db/db_info_dumper.cc \ db/db_iter.cc \ db/dbformat.cc \ @@ -35,7 +37,7 @@ LIB_SOURCES = \ db/flush_job.cc \ db/flush_scheduler.cc \ db/forward_iterator.cc \ - db/in_memory_stats_history.cc \ + db/import_column_family_job.cc \ db/internal_stats.cc \ db/logs_with_prep_tracker.cc \ db/log_reader.cc \ @@ -52,6 +54,7 @@ LIB_SOURCES = \ db/table_cache.cc \ db/table_properties_collector.cc \ db/transaction_log_impl.cc \ + db/trim_history_scheduler.cc \ db/version_builder.cc \ db/version_edit.cc \ db/version_set.cc \ @@ -67,6 +70,22 @@ LIB_SOURCES = \ env/env_posix.cc \ env/io_posix.cc \ env/mock_env.cc \ + file/delete_scheduler.cc \ + file/file_prefetch_buffer.cc \ + file/file_util.cc \ + file/filename.cc \ + file/random_access_file_reader.cc \ + file/read_write_util.cc \ + file/readahead_raf.cc \ + file/sequence_file_reader.cc \ + file/sst_file_manager_impl.cc \ + file/writable_file_writer.cc \ + logging/auto_roll_logger.cc \ + logging/event_logger.cc \ + logging/log_buffer.cc \ + memory/arena.cc \ + memory/concurrent_arena.cc \ + memory/jemalloc_nodump_allocator.cc \ memtable/alloc_tracker.cc \ memtable/hash_linklist_rep.cc \ memtable/hash_skiplist_rep.cc \ @@ -75,10 +94,12 @@ LIB_SOURCES = \ memtable/write_buffer_manager.cc \ monitoring/histogram.cc \ monitoring/histogram_windowing.cc \ + monitoring/in_memory_stats_history.cc \ monitoring/instrumented_mutex.cc \ monitoring/iostats_context.cc \ monitoring/perf_context.cc \ monitoring/perf_level.cc \ + monitoring/persistent_stats_history.cc \ monitoring/statistics.cc \ monitoring/thread_status_impl.cc \ monitoring/thread_status_updater.cc \ @@ -93,75 +114,67 @@ LIB_SOURCES = \ options/options_sanity_check.cc \ 
port/port_posix.cc \ port/stack_trace.cc \ - table/adaptive_table_factory.cc \ - table/block.cc \ - table/block_based_filter_block.cc \ - table/block_based_table_builder.cc \ - table/block_based_table_factory.cc \ - table/block_based_table_reader.cc \ - table/block_builder.cc \ - table/block_fetcher.cc \ - table/block_prefix_index.cc \ - table/bloom_block.cc \ - table/cuckoo_table_builder.cc \ - table/cuckoo_table_factory.cc \ - table/cuckoo_table_reader.cc \ - table/data_block_hash_index.cc \ - table/data_block_footer.cc \ - table/flush_block_policy.cc \ + table/adaptive/adaptive_table_factory.cc \ + table/block_based/block.cc \ + table/block_based/block_based_filter_block.cc \ + table/block_based/block_based_table_builder.cc \ + table/block_based/block_based_table_factory.cc \ + table/block_based/block_based_table_reader.cc \ + table/block_based/block_builder.cc \ + table/block_based/block_prefix_index.cc \ + table/block_based/data_block_hash_index.cc \ + table/block_based/data_block_footer.cc \ + table/block_based/filter_block_reader_common.cc \ + table/block_based/filter_policy.cc \ + table/block_based/flush_block_policy.cc \ + table/block_based/full_filter_block.cc \ + table/block_based/index_builder.cc \ + table/block_based/parsed_full_filter_block.cc \ + table/block_based/partitioned_filter_block.cc \ + table/block_based/uncompression_dict_reader.cc \ + table/block_fetcher.cc \ + table/cuckoo/cuckoo_table_builder.cc \ + table/cuckoo/cuckoo_table_factory.cc \ + table/cuckoo/cuckoo_table_reader.cc \ table/format.cc \ - table/full_filter_block.cc \ table/get_context.cc \ - table/index_builder.cc \ table/iterator.cc \ table/merging_iterator.cc \ table/meta_blocks.cc \ - table/partitioned_filter_block.cc \ table/persistent_cache_helper.cc \ - table/plain_table_builder.cc \ - table/plain_table_factory.cc \ - table/plain_table_index.cc \ - table/plain_table_key_coding.cc \ - table/plain_table_reader.cc \ + table/plain/plain_table_bloom.cc \ + table/plain/plain_table_builder.cc \ + table/plain/plain_table_factory.cc \ + table/plain/plain_table_index.cc \ + table/plain/plain_table_key_coding.cc \ + table/plain/plain_table_reader.cc \ table/sst_file_reader.cc \ table/sst_file_writer.cc \ table/table_properties.cc \ table/two_level_iterator.cc \ + test_util/sync_point.cc \ + test_util/sync_point_impl.cc \ + test_util/transaction_test_util.cc \ tools/dump/db_dump_tool.cc \ - util/arena.cc \ - util/auto_roll_logger.cc \ - util/bloom.cc \ + trace_replay/trace_replay.cc \ + trace_replay/block_cache_tracer.cc \ util/build_version.cc \ util/coding.cc \ util/compaction_job_stats_impl.cc \ util/comparator.cc \ util/compression_context_cache.cc \ - util/concurrent_arena.cc \ util/concurrent_task_limiter_impl.cc \ util/crc32c.cc \ - util/delete_scheduler.cc \ util/dynamic_bloom.cc \ - util/event_logger.cc \ - util/file_reader_writer.cc \ - util/file_util.cc \ - util/filename.cc \ - util/filter_policy.cc \ util/hash.cc \ - util/jemalloc_nodump_allocator.cc \ - util/log_buffer.cc \ util/murmurhash.cc \ util/random.cc \ util/rate_limiter.cc \ util/slice.cc \ - util/sst_file_manager_impl.cc \ util/status.cc \ util/string_util.cc \ - util/sync_point.cc \ - util/sync_point_impl.cc \ util/thread_local.cc \ util/threadpool_imp.cc \ - util/trace_replay.cc \ - util/transaction_test_util.cc \ util/xxhash.cc \ utilities/backupable/backupable_db.cc \ utilities/blob_db/blob_compaction_filter.cc \ @@ -185,10 +198,12 @@ LIB_SOURCES = \ utilities/memory/memory_util.cc \ utilities/merge_operators/max.cc \ 
utilities/merge_operators/put.cc \ + utilities/merge_operators/sortlist.cc \ utilities/merge_operators/string_append/stringappend.cc \ utilities/merge_operators/string_append/stringappend2.cc \ utilities/merge_operators/uint64add.cc \ utilities/merge_operators/bytesxor.cc \ + utilities/object_registry.cc \ utilities/option_change_migration/option_change_migration.cc \ utilities/options/options_util.cc \ utilities/persistent_cache/block_cache_tier.cc \ @@ -196,6 +211,7 @@ LIB_SOURCES = \ utilities/persistent_cache/block_cache_tier_metadata.cc \ utilities/persistent_cache/persistent_cache_tier.cc \ utilities/persistent_cache/volatile_tier_impl.cc \ + utilities/simulator_cache/cache_simulator.cc \ utilities/simulator_cache/sim_cache.cc \ utilities/table_properties_collectors/compact_on_deletion_collector.cc \ utilities/trace/file_trace_reader_writer.cc \ @@ -216,6 +232,11 @@ LIB_SOURCES = \ utilities/write_batch_with_index/write_batch_with_index.cc \ utilities/write_batch_with_index/write_batch_with_index_internal.cc \ +ifeq ($(ARMCRC_SOURCE),1) +LIB_SOURCES +=\ + util/crc32c_arm64.cc +endif + ifeq (,$(shell $(CXX) -fsyntax-only -maltivec -xc /dev/null 2>&1)) LIB_SOURCES_ASM =\ util/crc32c_ppc_asm.S @@ -226,37 +247,48 @@ LIB_SOURCES_ASM = LIB_SOURCES_C = endif -TOOL_LIB_SOURCES = \ +TOOL_LIB_SOURCES = \ tools/ldb_cmd.cc \ tools/ldb_tool.cc \ tools/sst_dump_tool.cc \ utilities/blob_db/blob_dump_tool.cc \ -ANALYZER_LIB_SOURCES = \ - tools/trace_analyzer_tool.cc \ +ANALYZER_LIB_SOURCES = \ + tools/block_cache_analyzer/block_cache_trace_analyzer.cc \ + tools/trace_analyzer_tool.cc \ -MOCK_LIB_SOURCES = \ - table/mock_table.cc \ - util/fault_injection_test_env.cc +MOCK_LIB_SOURCES = \ + table/mock_table.cc \ + test_util/fault_injection_test_env.cc -BENCH_LIB_SOURCES = \ +BENCH_LIB_SOURCES = \ tools/db_bench_tool.cc \ -TEST_LIB_SOURCES = \ +STRESS_LIB_SOURCES = \ + tools/db_stress_tool.cc \ + +TEST_LIB_SOURCES = \ db/db_test_util.cc \ - util/testharness.cc \ - util/testutil.cc \ + test_util/testharness.cc \ + test_util/testutil.cc \ utilities/cassandra/test_utils.cc \ +FOLLY_SOURCES = \ + third-party/folly/folly/detail/Futex.cpp \ + third-party/folly/folly/synchronization/AtomicNotification.cpp \ + third-party/folly/folly/synchronization/DistributedMutex.cpp \ + third-party/folly/folly/synchronization/ParkingLot.cpp \ + third-party/folly/folly/synchronization/WaitOptions.cpp \ + MAIN_SOURCES = \ cache/cache_bench.cc \ cache/cache_test.cc \ db/column_family_test.cc \ db/compact_files_test.cc \ - db/compaction_iterator_test.cc \ - db/compaction_job_stats_test.cc \ - db/compaction_job_test.cc \ - db/compaction_picker_test.cc \ + db/compaction/compaction_iterator_test.cc \ + db/compaction/compaction_job_test.cc \ + db/compaction/compaction_job_stats_test.cc \ + db/compaction/compaction_picker_test.cc \ db/comparator_db_test.cc \ db/corruption_test.cc \ db/cuckoo_table_db_test.cc \ @@ -277,10 +309,11 @@ MAIN_SOURCES = \ db/db_log_iter_test.cc \ db/db_memtable_test.cc \ db/db_merge_operator_test.cc \ + db/db_merge_operand_test.cc \ db/db_options_test.cc \ db/db_properties_test.cc \ db/db_range_del_test.cc \ - db/db_secondary_test.cc \ + db/db_impl/db_secondary_test.cc \ db/db_sst_test.cc \ db/db_statistics_test.cc \ db/db_table_properties_test.cc \ @@ -293,7 +326,7 @@ MAIN_SOURCES = \ db/dbformat_test.cc \ db/deletefile_test.cc \ db/env_timed_test.cc \ - db/error_handler_test.cc \ + db/error_handler_test.cc \ db/external_sst_file_basic_test.cc \ db/external_sst_file_test.cc \ db/fault_injection_test.cc \ 
@@ -314,7 +347,6 @@ MAIN_SOURCES = \ db/obsolete_files_test.cc \ db/options_settable_test.cc \ db/options_file_test.cc \ - db/partitioned_filter_block_test.cc \ db/perf_context_test.cc \ db/persistent_cache_test.cc \ db/plain_table_db_test.cc \ @@ -335,6 +367,10 @@ MAIN_SOURCES = \ env/env_basic_test.cc \ env/env_test.cc \ env/mock_env_test.cc \ + logging/auto_roll_logger_test.cc \ + logging/env_logger_test.cc \ + logging/event_logger_test.cc \ + memory/arena_test.cc \ memtable/inlineskiplist_test.cc \ memtable/memtablerep_bench.cc \ memtable/skiplist_test.cc \ @@ -342,34 +378,37 @@ MAIN_SOURCES = \ monitoring/histogram_test.cc \ monitoring/iostats_context_test.cc \ monitoring/statistics_test.cc \ + monitoring/stats_history_test.cc \ options/options_test.cc \ - table/block_based_filter_block_test.cc \ - table/block_test.cc \ + table/block_based/block_based_filter_block_test.cc \ + table/block_based/block_test.cc \ + table/block_based/data_block_hash_index_test.cc \ + table/block_based/full_filter_block_test.cc \ + table/block_based/partitioned_filter_block_test.cc \ table/cleanable_test.cc \ - table/cuckoo_table_builder_test.cc \ - table/cuckoo_table_reader_test.cc \ - table/data_block_hash_index_test.cc \ - table/full_filter_block_test.cc \ + table/cuckoo/cuckoo_table_builder_test.cc \ + table/cuckoo/cuckoo_table_reader_test.cc \ table/merger_test.cc \ table/sst_file_reader_test.cc \ table/table_reader_bench.cc \ table/table_test.cc \ third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \ + tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc \ + tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc \ tools/db_bench.cc \ tools/db_bench_tool_test.cc \ tools/db_sanity_test.cc \ + tools/db_stress.cc \ tools/ldb_cmd_test.cc \ tools/reduce_levels_test.cc \ tools/sst_dump_test.cc \ - tools/trace_analyzer_test.cc \ - util/arena_test.cc \ - util/auto_roll_logger_test.cc \ + tools/trace_analyzer_test.cc \ + trace_replay/block_cache_tracer_test.cc \ util/autovector_test.cc \ util/bloom_test.cc \ util/coding_test.cc \ util/crc32c_test.cc \ util/dynamic_bloom_test.cc \ - util/event_logger_test.cc \ util/filelock_test.cc \ util/log_write_bench.cc \ util/rate_limiter_test.cc \ @@ -390,6 +429,7 @@ MAIN_SOURCES = \ utilities/object_registry_test.cc \ utilities/option_change_migration/option_change_migration_test.cc \ utilities/options/options_util_test.cc \ + utilities/simulator_cache/cache_simulator_test.cc \ utilities/simulator_cache/sim_cache_test.cc \ utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \ utilities/transactions/optimistic_transaction_test.cc \ @@ -446,6 +486,8 @@ JNI_NATIVE_SOURCES = \ java/rocksjni/snapshot.cc \ java/rocksjni/sst_file_manager.cc \ java/rocksjni/sst_file_writerjni.cc \ + java/rocksjni/sst_file_readerjni.cc \ + java/rocksjni/sst_file_reader_iterator.cc \ java/rocksjni/statistics.cc \ java/rocksjni/statisticsjni.cc \ java/rocksjni/table.cc \ diff --git a/table/adaptive_table_factory.cc b/table/adaptive/adaptive_table_factory.cc similarity index 98% rename from table/adaptive_table_factory.cc rename to table/adaptive/adaptive_table_factory.cc index d5dcbc5f585..0086368a9bb 100644 --- a/table/adaptive_table_factory.cc +++ b/table/adaptive/adaptive_table_factory.cc @@ -4,7 +4,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#ifndef ROCKSDB_LITE -#include "table/adaptive_table_factory.h" +#include "table/adaptive/adaptive_table_factory.h" #include "table/table_builder.h" #include "table/format.h" diff --git a/table/adaptive_table_factory.h b/table/adaptive/adaptive_table_factory.h similarity index 100% rename from table/adaptive_table_factory.h rename to table/adaptive/adaptive_table_factory.h diff --git a/table/block.cc b/table/block_based/block.cc similarity index 93% rename from table/block.cc rename to table/block_based/block.cc index 80bef4a913f..8fa3ff9b986 100644 --- a/table/block.cc +++ b/table/block_based/block.cc @@ -9,21 +9,21 @@ // // Decodes the blocks generated by block_builder.cc. -#include "table/block.h" +#include "table/block_based/block.h" #include #include #include #include +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/comparator.h" -#include "table/block_prefix_index.h" -#include "table/data_block_footer.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/data_block_footer.h" #include "table/format.h" #include "util/coding.h" -#include "util/logging.h" namespace rocksdb { @@ -381,6 +381,7 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { } void IndexBlockIter::Seek(const Slice& target) { + TEST_SYNC_POINT("IndexBlockIter::Seek:0"); Slice seek_key = target; if (!key_includes_seq_) { seek_key = ExtractUserKey(target); @@ -607,8 +608,7 @@ bool IndexBlockIter::ParseNextIndexKey() { } // else we are in the middle of a restart interval and the restart_index_ // thus has not changed - if (value_delta_encoded_) { - assert(value_length == 0); + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { DecodeCurrentValue(shared); } return true; @@ -626,24 +626,32 @@ bool IndexBlockIter::ParseNextIndexKey() { // Otherwise the format is delta-size = block handle size - size of last block // handle. void IndexBlockIter::DecodeCurrentValue(uint32_t shared) { - assert(value_delta_encoded_); - const char* limit = data_ + restarts_; - if (shared == 0) { - uint64_t o, s; - const char* newp = GetVarint64Ptr(value_.data(), limit, &o); - assert(newp); - newp = GetVarint64Ptr(newp, limit, &s); - assert(newp); - decoded_value_ = BlockHandle(o, s); - value_ = Slice(value_.data(), newp - value_.data()); - } else { - uint64_t next_value_base = - decoded_value_.offset() + decoded_value_.size() + kBlockTrailerSize; - int64_t delta; - const char* newp = GetVarsignedint64Ptr(value_.data(), limit, &delta); - decoded_value_ = - BlockHandle(next_value_base, decoded_value_.size() + delta); - value_ = Slice(value_.data(), newp - value_.data()); + Slice v(value_.data(), data_ + restarts_ - value_.data()); + // Delta encoding is used if `shared` != 0. + Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom( + &v, have_first_key_, + (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr); + assert(decode_s.ok()); + value_ = Slice(value_.data(), v.data() - value_.data()); + + if (global_seqno_state_ != nullptr) { + // Overwrite sequence number the same way as in DataBlockIter. 
+ + IterKey& first_internal_key = global_seqno_state_->first_internal_key; + first_internal_key.SetInternalKey(decoded_value_.first_internal_key, + /* copy */ true); + + assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0); + + ValueType value_type = ExtractValueType(first_internal_key.GetKey()); + assert(value_type == ValueType::kTypeValue || + value_type == ValueType::kTypeMerge || + value_type == ValueType::kTypeDeletion || + value_type == ValueType::kTypeRangeDeletion); + + first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno, + value_type); + decoded_value_.first_internal_key = first_internal_key.GetKey(); } } @@ -874,14 +882,10 @@ Block::Block(BlockContents&& contents, SequenceNumber _global_seqno, } } -template <> -DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, - DataBlockIter* iter, Statistics* stats, - bool /*total_order_seek*/, - bool /*key_includes_seq*/, - bool /*value_is_full*/, - bool block_contents_pinned, - BlockPrefixIndex* /*prefix_index*/) { +DataBlockIter* Block::NewDataIterator(const Comparator* cmp, + const Comparator* ucmp, + DataBlockIter* iter, Statistics* stats, + bool block_contents_pinned) { DataBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; @@ -912,13 +916,11 @@ DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, return ret_iter; } -template <> -IndexBlockIter* Block::NewIterator(const Comparator* cmp, - const Comparator* ucmp, IndexBlockIter* iter, - Statistics* /*stats*/, bool total_order_seek, - bool key_includes_seq, bool value_is_full, - bool block_contents_pinned, - BlockPrefixIndex* prefix_index) { +IndexBlockIter* Block::NewIndexIterator( + const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter, + Statistics* /*stats*/, bool total_order_seek, bool have_first_key, + bool key_includes_seq, bool value_is_full, bool block_contents_pinned, + BlockPrefixIndex* prefix_index) { IndexBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; @@ -937,9 +939,9 @@ IndexBlockIter* Block::NewIterator(const Comparator* cmp, BlockPrefixIndex* prefix_index_ptr = total_order_seek ? 
nullptr : prefix_index; ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, - prefix_index_ptr, key_includes_seq, value_is_full, - block_contents_pinned, - nullptr /* data_block_hash_index */); + global_seqno_, prefix_index_ptr, have_first_key, + key_includes_seq, value_is_full, + block_contents_pinned); } return ret_iter; diff --git a/table/block.h b/table/block_based/block.h similarity index 81% rename from table/block.h rename to table/block_based/block.h index df4d4eb82fc..73c21b4659a 100644 --- a/table/block.h +++ b/table/block_based/block.h @@ -12,26 +12,20 @@ #include #include #include -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef OS_FREEBSD -#include -#else -#include -#endif -#endif #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" -#include "format.h" +#include "port/malloc.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" #include "rocksdb/table.h" -#include "table/block_prefix_index.h" -#include "table/data_block_hash_index.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/data_block_hash_index.h" +#include "table/format.h" #include "table/internal_iterator.h" +#include "test_util/sync_point.h" #include "util/random.h" -#include "util/sync_point.h" namespace rocksdb { @@ -141,12 +135,27 @@ class BlockReadAmpBitmap { uint32_t rnd_; }; +// This Block class is not for any old block: it is designed to hold only +// uncompressed blocks containing sorted key-value pairs. It is thus +// suitable for storing uncompressed data blocks, index blocks (including +// partitions), range deletion blocks, properties blocks, metaindex blocks, +// as well as the top level of the partitioned filter structure (which is +// actually an index of the filter partitions). It is NOT suitable for +// compressed blocks in general, filter blocks/partitions, or compression +// dictionaries (since the latter do not contain sorted key-value pairs). +// Use BlockContents directly for those. +// +// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format +// for details of the format and the various block types. class Block { public: // Initialize the block with the specified contents. explicit Block(BlockContents&& contents, SequenceNumber _global_seqno, size_t read_amp_bytes_per_bit = 0, Statistics* statistics = nullptr); + // No copying allowed + Block(const Block&) = delete; + void operator=(const Block&) = delete; ~Block(); @@ -165,17 +174,7 @@ class Block { // If iter is null, return new Iterator // If iter is not null, update this one and return it as Iterator* // - // key_includes_seq, default true, means that the keys are in internal key - // format. - // value_is_full, default true, means that no delta encoding is - // applied to values. - // - // NewIterator - // Same as above but also updates read_amp_bitmap_ if it is not nullptr. - // - // NewIterator - // If `prefix_index` is not nullptr this block will do hash lookup for the key - // prefix. If total_order_seek is true, prefix_index_ is ignored. + // Updates read_amp_bitmap_ if it is not nullptr. // // If `block_contents_pinned` is true, the caller will guarantee that when // the cleanup functions are transferred from the iterator to other @@ -188,13 +187,32 @@ class Block { // NOTE: for the hash based lookup, if a key prefix doesn't match any key, // the iterator will simply be set as "invalid", rather than returning // the key that is just pass the target key. 
- template - TBlockIter* NewIterator( - const Comparator* comparator, const Comparator* user_comparator, - TBlockIter* iter = nullptr, Statistics* stats = nullptr, - bool total_order_seek = true, bool key_includes_seq = true, - bool value_is_full = true, bool block_contents_pinned = false, - BlockPrefixIndex* prefix_index = nullptr); + + DataBlockIter* NewDataIterator(const Comparator* comparator, + const Comparator* user_comparator, + DataBlockIter* iter = nullptr, + Statistics* stats = nullptr, + bool block_contents_pinned = false); + + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default true, means that no delta encoding is + // applied to values. + // + // If `prefix_index` is not nullptr this block will do hash lookup for the key + // prefix. If total_order_seek is true, prefix_index_ is ignored. + // + // `have_first_key` controls whether IndexValue will contain + // first_internal_key. It affects data serialization format, so the same value + // have_first_key must be used when writing and reading index. + // It is determined by IndexType property of the table. + IndexBlockIter* NewIndexIterator(const Comparator* comparator, + const Comparator* user_comparator, + IndexBlockIter* iter, Statistics* stats, + bool total_order_seek, bool have_first_key, + bool key_includes_seq, bool value_is_full, + bool block_contents_pinned = false, + BlockPrefixIndex* prefix_index = nullptr); // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const; @@ -213,10 +231,6 @@ class Block { const SequenceNumber global_seqno_; DataBlockHashIndex data_block_hash_index_; - - // No copying allowed - Block(const Block&) = delete; - void operator=(const Block&) = delete; }; template @@ -236,6 +250,7 @@ class BlockIter : public InternalIteratorBase { restart_index_ = num_restarts_; global_seqno_ = global_seqno; block_contents_pinned_ = block_contents_pinned; + cache_handle_ = nullptr; } // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do @@ -285,6 +300,10 @@ class BlockIter : public InternalIteratorBase { return static_cast(value_.data() - data_); } + void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; } + + Cache::Handle* cache_handle() { return cache_handle_; } + protected: // Note: The type could be changed to InternalKeyComparator but we see a weird // performance drop by that. @@ -307,6 +326,14 @@ class BlockIter : public InternalIteratorBase { bool block_contents_pinned_; SequenceNumber global_seqno_; + private: + // Store the cache handle, if the block is cached. We need this since the + // only other place the handle is stored is as an argument to the Cleanable + // function callback, which is hard to retrieve. When multiple value + // PinnableSlices reference the block, they need the cache handle in order + // to bump up the ref count + Cache::Handle* cache_handle_; + public: // Return the offset in data_ just past the end of the current entry. inline uint32_t NextEntryOffset() const { @@ -458,7 +485,7 @@ class DataBlockIter final : public BlockIter { bool SeekForGetImpl(const Slice& target); }; -class IndexBlockIter final : public BlockIter { +class IndexBlockIter final : public BlockIter { public: IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} @@ -470,23 +497,12 @@ class IndexBlockIter final : public BlockIter { // format. // value_is_full, default true, means that no delta encoding is // applied to values. 
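Illustrative note (not part of the patch): the templated Block::NewIterator<TBlockIter>() entry point is replaced above by the dedicated NewDataIterator() and NewIndexIterator() factories. A minimal caller-side sketch of the data-block path, assuming a BlockContents already read from an SST file, bytewise comparators, and the usual InternalIterator methods:

#include "rocksdb/comparator.h"
#include "table/block_based/block.h"

void DumpDataBlock(rocksdb::BlockContents&& contents) {
  using namespace rocksdb;
  // Global seqno handling is irrelevant for this sketch, so it is disabled.
  Block block(std::move(contents), kDisableGlobalSequenceNumber);
  // Previously: block.NewIterator<DataBlockIter>(cmp, ucmp, ...);
  // now the dedicated factory method for data blocks is used instead.
  DataBlockIter* iter = block.NewDataIterator(
      BytewiseComparator(), BytewiseComparator(), nullptr /* iter */,
      nullptr /* stats */, false /* block_contents_pinned */);
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    // iter->key() is an internal key; iter->value() is the stored value.
  }
  delete iter;  // owned by the caller when no reusable iterator was passed in
}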
- IndexBlockIter(const Comparator* comparator, - const Comparator* user_comparator, const char* data, - uint32_t restarts, uint32_t num_restarts, - BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool value_is_full, bool block_contents_pinned) - : IndexBlockIter() { - Initialize(comparator, user_comparator, data, restarts, num_restarts, - prefix_index, key_includes_seq, block_contents_pinned, - value_is_full, nullptr /* data_block_hash_index */); - } - void Initialize(const Comparator* comparator, const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, - BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool value_is_full, bool block_contents_pinned, - DataBlockHashIndex* /*data_block_hash_index*/) { + SequenceNumber global_seqno, BlockPrefixIndex* prefix_index, + bool have_first_key, bool key_includes_seq, + bool value_is_full, bool block_contents_pinned) { InitializeBase(key_includes_seq ? comparator : user_comparator, data, restarts, num_restarts, kDisableGlobalSequenceNumber, block_contents_pinned); @@ -494,6 +510,12 @@ class IndexBlockIter final : public BlockIter { key_.SetIsUserKey(!key_includes_seq_); prefix_index_ = prefix_index; value_delta_encoded_ = !value_is_full; + have_first_key_ = have_first_key; + if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) { + global_seqno_state_.reset(new GlobalSeqnoState(global_seqno)); + } else { + global_seqno_state_.reset(); + } } Slice user_key() const override { @@ -503,16 +525,17 @@ class IndexBlockIter final : public BlockIter { return key(); } - virtual BlockHandle value() const override { + virtual IndexValue value() const override { assert(Valid()); - if (value_delta_encoded_) { + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { return decoded_value_; } else { - BlockHandle handle; + IndexValue entry; Slice v = value_; - Status decode_s __attribute__((__unused__)) = handle.DecodeFrom(&v); + Status decode_s __attribute__((__unused__)) = + entry.DecodeFrom(&v, have_first_key_, nullptr); assert(decode_s.ok()); - return handle; + return entry; } } @@ -539,10 +562,15 @@ class IndexBlockIter final : public BlockIter { void Invalidate(Status s) { InvalidateBase(s); } + bool IsValuePinned() const override { + return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned(); + } + private: // Key is in InternalKey format bool key_includes_seq_; bool value_delta_encoded_; + bool have_first_key_; // value includes first_internal_key BlockPrefixIndex* prefix_index_; // Whether the value is delta encoded. In that case the value is assumed to be // BlockHandle. The first value in each restart interval is the full encoded @@ -550,7 +578,22 @@ class IndexBlockIter final : public BlockIter { // offset of delta encoded BlockHandles is computed by adding the size of // previous delta encoded values in the same restart interval to the offset of // the first value in that restart interval. - BlockHandle decoded_value_; + IndexValue decoded_value_; + + // When sequence number overwriting is enabled, this struct contains the seqno + // to overwrite with, and current first_internal_key with overwritten seqno. + // This is rarely used, so we put it behind a pointer and only allocate when + // needed. + struct GlobalSeqnoState { + // First internal key according to current index entry, but with sequence + // number overwritten to global_seqno. 
+ IterKey first_internal_key; + SequenceNumber global_seqno; + + explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {} + }; + + std::unique_ptr global_seqno_state_; bool PrefixSeek(const Slice& target, uint32_t* index); bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, diff --git a/table/block_based_filter_block.cc b/table/block_based/block_based_filter_block.cc similarity index 57% rename from table/block_based_filter_block.cc rename to table/block_based/block_based_filter_block.cc index 81087b243b7..319c5bf6d87 100644 --- a/table/block_based_filter_block.cc +++ b/table/block_based/block_based_filter_block.cc @@ -7,12 +7,13 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "table/block_based_filter_block.h" +#include "table/block_based/block_based_filter_block.h" #include #include "db/dbformat.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" #include "util/coding.h" #include "util/string_util.h" @@ -162,56 +163,120 @@ void BlockBasedFilterBlockBuilder::GenerateFilter() { } BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( - const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, bool _whole_key_filtering, - BlockContents&& contents, Statistics* stats) - : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering), - policy_(table_opt.filter_policy.get()), - prefix_extractor_(prefix_extractor), - data_(nullptr), - offset_(nullptr), - num_(0), - base_lg_(0), - contents_(std::move(contents)) { - assert(policy_); - size_t n = contents_.data.size(); - if (n < 5) return; // 1 byte for base_lg_ and 4 for start of offset array - base_lg_ = contents_.data[n - 1]; - uint32_t last_word = DecodeFixed32(contents_.data.data() + n - 5); - if (last_word > n - 5) return; - data_ = contents_.data.data(); - offset_ = data_ + last_word; - num_ = (n - 5 - last_word) / 4; + const BlockBasedTable* t, CachableEntry&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) { + assert(table()); + assert(table()->get_rep()); + assert(table()->get_rep()->filter_policy); +} + +std::unique_ptr BlockBasedFilterBlockReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + use_cache, nullptr /* get_context */, + lookup_context, &filter_block); + if (!s.ok()) { + return std::unique_ptr(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr( + new BlockBasedFilterBlockReader(table, std::move(filter_block))); } bool BlockBasedFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* /* prefix_extractor */, - uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/) { + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { assert(block_offset != kNotValid); - if (!whole_key_filtering_) { + if (!whole_key_filtering()) { return true; } - return MayMatch(key, block_offset); + return MayMatch(key, block_offset, no_io, get_context, 
lookup_context); } bool BlockBasedFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* /* prefix_extractor */, - uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/) { + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { assert(block_offset != kNotValid); - return MayMatch(prefix, block_offset); -} - -bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry, - uint64_t block_offset) { - uint64_t index = block_offset >> base_lg_; - if (index < num_) { - uint32_t start = DecodeFixed32(offset_ + index * 4); - uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4); - if (start <= limit && limit <= (uint32_t)(offset_ - data_)) { - Slice filter = Slice(data_ + start, limit - start); - bool const may_match = policy_->KeyMayMatch(entry, filter); + return MayMatch(prefix, block_offset, no_io, get_context, lookup_context); +} + +bool BlockBasedFilterBlockReader::ParseFieldsFromBlock( + const BlockContents& contents, const char** data, const char** offset, + size_t* num, size_t* base_lg) { + assert(data); + assert(offset); + assert(num); + assert(base_lg); + + const size_t n = contents.data.size(); + if (n < 5) { // 1 byte for base_lg and 4 for start of offset array + return false; + } + + const uint32_t last_word = DecodeFixed32(contents.data.data() + n - 5); + if (last_word > n - 5) { + return false; + } + + *data = contents.data.data(); + *offset = (*data) + last_word; + *num = (n - 5 - last_word) / 4; + *base_lg = contents.data[n - 1]; + + return true; +} + +bool BlockBasedFilterBlockReader::MayMatch( + const Slice& entry, uint64_t block_offset, bool no_io, + GetContext* get_context, BlockCacheLookupContext* lookup_context) const { + CachableEntry filter_block; + + const Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (!s.ok()) { + return true; + } + + assert(filter_block.GetValue()); + + const char* data = nullptr; + const char* offset = nullptr; + size_t num = 0; + size_t base_lg = 0; + if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num, + &base_lg)) { + return true; // Errors are treated as potential matches + } + + const uint64_t index = block_offset >> base_lg; + if (index < num) { + const uint32_t start = DecodeFixed32(offset + index * 4); + const uint32_t limit = DecodeFixed32(offset + index * 4 + 4); + if (start <= limit && limit <= (uint32_t)(offset - data)) { + const Slice filter = Slice(data + start, limit - start); + + assert(table()); + assert(table()->get_rep()); + const FilterPolicy* const policy = table()->get_rep()->filter_policy; + + const bool may_match = policy->KeyMayMatch(entry, filter); if (may_match) { PERF_COUNTER_ADD(bloom_sst_hit_count, 1); return true; @@ -228,27 +293,54 @@ bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry, } size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const { - return num_ * 4 + 5 + (offset_ - data_); + size_t usage = ApproximateFilterBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; } std::string BlockBasedFilterBlockReader::ToString() const { + CachableEntry filter_block; + + const Status s = + GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, + nullptr /* lookup_context */, &filter_block); + if (!s.ok()) { + return std::string("Unable to 
retrieve filter block"); + } + + assert(filter_block.GetValue()); + + const char* data = nullptr; + const char* offset = nullptr; + size_t num = 0; + size_t base_lg = 0; + if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num, + &base_lg)) { + return std::string("Error parsing filter block"); + } + std::string result; result.reserve(1024); std::string s_bo("Block offset"), s_hd("Hex dump"), s_fb("# filter blocks"); - AppendItem(&result, s_fb, rocksdb::ToString(num_)); + AppendItem(&result, s_fb, rocksdb::ToString(num)); AppendItem(&result, s_bo, s_hd); - for (size_t index = 0; index < num_; index++) { - uint32_t start = DecodeFixed32(offset_ + index * 4); - uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4); + for (size_t index = 0; index < num; index++) { + uint32_t start = DecodeFixed32(offset + index * 4); + uint32_t limit = DecodeFixed32(offset + index * 4 + 4); if (start != limit) { result.append(" filter block # " + rocksdb::ToString(index + 1) + "\n"); - Slice filter = Slice(data_ + start, limit - start); + Slice filter = Slice(data + start, limit - start); AppendItem(&result, start, filter.ToString(true)); } } return result; } + } // namespace rocksdb diff --git a/table/block_based_filter_block.h b/table/block_based/block_based_filter_block.h similarity index 64% rename from table/block_based_filter_block.h rename to table/block_based/block_based_filter_block.h index d1ff585462a..ed409e041ee 100644 --- a/table/block_based_filter_block.h +++ b/table/block_based/block_based_filter_block.h @@ -18,10 +18,12 @@ #include #include #include + #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "table/filter_block.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/format.h" #include "util/hash.h" namespace rocksdb { @@ -36,6 +38,9 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { public: BlockBasedFilterBlockBuilder(const SliceTransform* prefix_extractor, const BlockBasedTableOptions& table_opt); + // No copying allowed + BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&) = delete; + void operator=(const BlockBasedFilterBlockBuilder&) = delete; virtual bool IsBlockBased() override { return true; } virtual void StartBlock(uint64_t block_offset) override; @@ -66,49 +71,49 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { std::vector tmp_entries_; // policy_->CreateFilter() argument std::vector filter_offsets_; size_t num_added_; // Number of keys added - - // No copying allowed - BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&); - void operator=(const BlockBasedFilterBlockBuilder&); }; // A FilterBlockReader is used to parse filter from SST table. // KeyMayMatch and PrefixMayMatch would trigger filter checking -class BlockBasedFilterBlockReader : public FilterBlockReader { +class BlockBasedFilterBlockReader + : public FilterBlockReaderCommon { public: - // REQUIRES: "contents" and *policy must stay live while *this is live. 
- BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, - bool whole_key_filtering, - BlockContents&& contents, Statistics* statistics); - virtual bool IsBlockBased() override { return true; } + BlockBasedFilterBlockReader(const BlockBasedTable* t, + CachableEntry&& filter_block); + // No copying allowed + BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&) = delete; + void operator=(const BlockBasedFilterBlockReader&) = delete; + + static std::unique_ptr Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); - virtual bool KeyMayMatch( - const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - virtual bool PrefixMayMatch( - const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - virtual size_t ApproximateMemoryUsage() const override; + bool IsBlockBased() override { return true; } + + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + size_t ApproximateMemoryUsage() const override; // convert this object to a human readable form std::string ToString() const override; private: - const FilterPolicy* policy_; - const SliceTransform* prefix_extractor_; - const char* data_; // Pointer to filter data (at block-start) - const char* offset_; // Pointer to beginning of offset array (at block-end) - size_t num_; // Number of entries in offset array - size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) - BlockContents contents_; + static bool ParseFieldsFromBlock(const BlockContents& contents, + const char** data, const char** offset, + size_t* num, size_t* base_lg); - bool MayMatch(const Slice& entry, uint64_t block_offset); - - // No copying allowed - BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&); - void operator=(const BlockBasedFilterBlockReader&); + bool MayMatch(const Slice& entry, uint64_t block_offset, bool no_io, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; }; + } // namespace rocksdb diff --git a/table/block_based/block_based_filter_block_test.cc b/table/block_based/block_based_filter_block_test.cc new file mode 100644 index 00000000000..677203ae24d --- /dev/null +++ b/table/block_based/block_based_filter_block_test.cc @@ -0,0 +1,434 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
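For reference while reading the reader refactor above and the tests that follow (illustrative only, not part of the patch): the on-disk layout of a legacy block-based filter block is unchanged, only its parsing moved into the static ParseFieldsFromBlock() helper. A self-contained sketch of that layout, with DecodeLE32 standing in for RocksDB's DecodeFixed32:

#include <cstddef>
#include <cstdint>
#include <cstring>

// Layout parsed above:
//   [filter 0][filter 1]...[offset array, 4 bytes per filter]
//   [offset of offset array, 4 bytes][base_lg, 1 byte]
static uint32_t DecodeLE32(const char* p) {
  uint32_t v;
  std::memcpy(&v, p, sizeof(v));  // assumes a little-endian host
  return v;
}

bool ParseFilterBlock(const char* data, size_t n, const char** offset_array,
                      size_t* num_filters, size_t* base_lg) {
  if (n < 5) return false;  // 1 byte for base_lg, 4 for offset-array start
  const uint32_t last_word = DecodeLE32(data + n - 5);
  if (last_word > n - 5) return false;
  *offset_array = data + last_word;
  *num_filters = (n - 5 - last_word) / 4;
  *base_lg = static_cast<unsigned char>(data[n - 1]);
  return true;
}

// A data block written at file offset `off` is covered by filter number
// (off >> base_lg); its bytes are the [start, limit) range read from the
// offset array, exactly as MayMatch() does before handing the slice to the
// FilterPolicy.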
+ +#include "table/block_based/block_based_filter_block.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/mock_block_based_table.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/string_util.h" + +namespace rocksdb { + +// For testing: emit an array with one hash value per key +class TestHashFilter : public FilterPolicy { + public: + const char* Name() const override { return "TestHashFilter"; } + + void CreateFilter(const Slice* keys, int n, std::string* dst) const override { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + bool KeyMayMatch(const Slice& key, const Slice& filter) const override { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } +}; + +class MockBlockBasedTable : public BlockBasedTable { + public: + explicit MockBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {} +}; + +class FilterBlockTest : public mock::MockBlockBasedTableTester, + public testing::Test { + public: + FilterBlockTest() : mock::MockBlockBasedTableTester(new TestHashFilter) {} +}; + +TEST_F(FilterBlockTest, EmptyBuilder) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + Slice slice(builder.Finish()); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(slice)); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + BlockBasedFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +TEST_F(FilterBlockTest, SingleChunk) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + ASSERT_EQ(0, builder.NumAdded()); + builder.StartBlock(100); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.StartBlock(200); + builder.Add("box"); + builder.StartBlock(300); + builder.Add("hello"); + ASSERT_EQ(5, builder.NumAdded()); + Slice slice(builder.Finish()); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + BlockBasedFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, 
/*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +TEST_F(FilterBlockTest, MultiChunk) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + + // First filter + builder.StartBlock(0); + builder.Add("foo"); + builder.StartBlock(2000); + builder.Add("bar"); + + // Second filter + builder.StartBlock(3100); + builder.Add("box"); + + // Third filter is empty + + // Last filter + builder.StartBlock(9000); + builder.Add("box"); + builder.Add("hello"); + + Slice slice(builder.Finish()); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + BlockBasedFilterBlockReader reader(table_.get(), std::move(block)); + + // Check first filter + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/2000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check second filter + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check third filter (empty) + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, 
/*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check last filter + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +// Test for block based filter block +// use new interface in FilterPolicy to create filter builder/reader +class BlockBasedFilterBlockTest : public mock::MockBlockBasedTableTester, + public testing::Test { + public: + BlockBasedFilterBlockTest() + : mock::MockBlockBasedTableTester(NewBloomFilterPolicy(10, true)) {} +}; + +TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); + Slice slice(builder->Finish()); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(slice)); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FilterBlockReader* reader = + new BlockBasedFilterBlockReader(table_.get(), std::move(block)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/10000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + delete builder; + delete reader; +} + +TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); + builder->StartBlock(100); + builder->Add("foo"); + builder->Add("bar"); + builder->Add("box"); + builder->StartBlock(200); + builder->Add("box"); + builder->StartBlock(300); + builder->Add("hello"); + Slice slice(builder->Finish()); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FilterBlockReader* reader = + new BlockBasedFilterBlockReader(table_.get(), std::move(block)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, 
/*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + delete builder; + delete reader; +} + +TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); + + // First filter + builder->StartBlock(0); + builder->Add("foo"); + builder->StartBlock(2000); + builder->Add("bar"); + + // Second filter + builder->StartBlock(3100); + builder->Add("box"); + + // Third filter is empty + + // Last filter + builder->StartBlock(9000); + builder->Add("box"); + builder->Add("hello"); + + Slice slice(builder->Finish()); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FilterBlockReader* reader = + new BlockBasedFilterBlockReader(table_.get(), std::move(block)); + + // Check first filter + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/2000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check second filter + ASSERT_TRUE(reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check third filter (empty) + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, 
/*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check last filter + ASSERT_TRUE(reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + delete builder; + delete reader; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/table/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc similarity index 94% rename from table/block_based_table_builder.cc rename to table/block_based/block_based_table_builder.cc index 9a1742e5f3a..58f3ff4339a 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "table/block_based_table_builder.h" +#include "table/block_based/block_based_table_builder.h" #include #include @@ -20,36 +20,35 @@ #include #include "db/dbformat.h" +#include "index_builder.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" -#include "rocksdb/filter_policy.h" #include "rocksdb/flush_block_policy.h" #include "rocksdb/merge_operator.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_based_filter_block.h" -#include "table/block_based_table_factory.h" -#include "table/block_based_table_reader.h" -#include "table/block_builder.h" -#include "table/filter_block.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/filter_policy_internal.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/partitioned_filter_block.h" #include "table/format.h" -#include "table/full_filter_block.h" #include "table/table_builder.h" +#include "memory/memory_allocator.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" -#include "util/memory_allocator.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/xxhash.h" -#include "table/index_builder.h" -#include "table/partitioned_filter_block.h" - namespace rocksdb { extern const std::string kHashIndexPrefixesBlock; @@ -63,13 +62,14 @@ namespace { // Create a filter block builder based on its type. FilterBlockBuilder* CreateFilterBlockBuilder( const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt, - const BlockBasedTableOptions& table_opt, + const FilterBuildingContext& context, const bool use_delta_encoding_for_index_values, PartitionedIndexBuilder* const p_index_builder) { + const BlockBasedTableOptions& table_opt = context.table_options; if (table_opt.filter_policy == nullptr) return nullptr; FilterBitsBuilder* filter_bits_builder = - table_opt.filter_policy->GetFilterBitsBuilder(); + BloomFilterPolicy::GetBuilderFromContext(context); if (filter_bits_builder == nullptr) { return new BlockBasedFilterBlockBuilder(mopt.prefix_extractor.get(), table_opt); @@ -346,6 +346,7 @@ struct BlockBasedTableBuilder::Rep { std::string compressed_output; std::unique_ptr flush_block_policy; + int level_at_creation; uint32_t column_family_id; const std::string& column_family_name; uint64_t creation_time = 0; @@ -364,9 +365,9 @@ struct BlockBasedTableBuilder::Rep { const CompressionType _compression_type, const uint64_t _sample_for_compression, const CompressionOptions& _compression_opts, const bool skip_filters, - const std::string& _column_family_name, const uint64_t _creation_time, - const uint64_t _oldest_key_time, const uint64_t _target_file_size, - const uint64_t _file_creation_time) + const int _level_at_creation, const std::string& _column_family_name, + const uint64_t _creation_time, const uint64_t _oldest_key_time, + const uint64_t _target_file_size, const uint64_t _file_creation_time) : ioptions(_ioptions), moptions(_moptions), table_options(table_opt), @@ -399,6 +400,7 @@ struct BlockBasedTableBuilder::Rep { flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( table_options, data_block)), + level_at_creation(_level_at_creation), column_family_id(_column_family_id), 
column_family_name(_column_family_name), creation_time(_creation_time), @@ -420,9 +422,13 @@ struct BlockBasedTableBuilder::Rep { if (skip_filters) { filter_builder = nullptr; } else { + FilterBuildingContext context(table_options); + context.column_family_name = column_family_name; + context.compaction_style = ioptions.compaction_style; + context.level_at_creation = level_at_creation; filter_builder.reset(CreateFilterBlockBuilder( - _ioptions, _moptions, table_options, - use_delta_encoding_for_index_values, p_index_builder_)); + ioptions, moptions, context, use_delta_encoding_for_index_values, + p_index_builder_)); } for (auto& collector_factories : *int_tbl_prop_collector_factories) { @@ -455,9 +461,9 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( const CompressionType compression_type, const uint64_t sample_for_compression, const CompressionOptions& compression_opts, const bool skip_filters, - const std::string& column_family_name, const uint64_t creation_time, - const uint64_t oldest_key_time, const uint64_t target_file_size, - const uint64_t file_creation_time) { + const std::string& column_family_name, const int level_at_creation, + const uint64_t creation_time, const uint64_t oldest_key_time, + const uint64_t target_file_size, const uint64_t file_creation_time) { BlockBasedTableOptions sanitized_table_options(table_options); if (sanitized_table_options.format_version == 0 && sanitized_table_options.checksum != kCRC32c) { @@ -470,12 +476,12 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( sanitized_table_options.format_version = 1; } - rep_ = - new Rep(ioptions, moptions, sanitized_table_options, internal_comparator, - int_tbl_prop_collector_factories, column_family_id, file, - compression_type, sample_for_compression, compression_opts, - skip_filters, column_family_name, creation_time, oldest_key_time, - target_file_size, file_creation_time); + rep_ = new Rep(ioptions, moptions, sanitized_table_options, + internal_comparator, int_tbl_prop_collector_factories, + column_family_id, file, compression_type, + sample_for_compression, compression_opts, skip_filters, + level_at_creation, column_family_name, creation_time, + oldest_key_time, target_file_size, file_creation_time); if (rep_->filter_builder != nullptr) { rep_->filter_builder->StartBlock(0); @@ -532,7 +538,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { // Note: PartitionedFilterBlockBuilder requires key being added to filter // builder after being added to index builder. 
if (r->state == Rep::State::kUnbuffered && r->filter_builder != nullptr) { - r->filter_builder->Add(ExtractUserKey(key)); + size_t ts_sz = r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); } r->last_key.assign(key.data(), key.size()); @@ -733,11 +740,13 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, break; } case kxxHash: { - void* xxh = XXH32_init(0); - XXH32_update(xxh, block_contents.data(), + XXH32_state_t* const state = XXH32_createState(); + XXH32_reset(state, 0); + XXH32_update(state, block_contents.data(), static_cast(block_contents.size())); - XXH32_update(xxh, trailer, 1); // Extend to cover block type - EncodeFixed32(trailer_without_type, XXH32_digest(xxh)); + XXH32_update(state, trailer, 1); // Extend to cover block type + EncodeFixed32(trailer_without_type, XXH32_digest(state)); + XXH32_freeState(state); break; } case kxxHash64: { diff --git a/table/block_based_table_builder.h b/table/block_based/block_based_table_builder.h similarity index 96% rename from table/block_based_table_builder.h rename to table/block_based/block_based_table_builder.h index a1ef3889112..5dd5065bb20 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based/block_based_table_builder.h @@ -47,17 +47,18 @@ class BlockBasedTableBuilder : public TableBuilder { const CompressionType compression_type, const uint64_t sample_for_compression, const CompressionOptions& compression_opts, const bool skip_filters, - const std::string& column_family_name, const uint64_t creation_time = 0, - const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0, + const std::string& column_family_name, const int level_at_creation, + const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0, + const uint64_t target_file_size = 0, const uint64_t file_creation_time = 0); - // REQUIRES: Either Finish() or Abandon() has been called. - ~BlockBasedTableBuilder(); - // No copying allowed BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete; BlockBasedTableBuilder& operator=(const BlockBasedTableBuilder&) = delete; + // REQUIRES: Either Finish() or Abandon() has been called. + ~BlockBasedTableBuilder(); + // Add key,value to the table being constructed. // REQUIRES: key is after any previously added key according to comparator. // REQUIRES: Finish(), Abandon() have not been called diff --git a/table/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc similarity index 95% rename from table/block_based_table_factory.cc rename to table/block_based/block_based_table_factory.cc index 47fe8e1b0e3..a3368610dc8 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -7,14 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
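// --- Illustrative aside (editorial, not part of the patch) -----------------
// The WriteRawBlock() hunk above replaces the older XXH32_init()/XXH32_digest()
// pair with xxHash's streaming state API. The pattern, shown here as a minimal
// sketch over a block payload plus its one-byte type trailer (the calls are
// exactly those used in the hunk; `data`, `size` and `trailer` are placeholder
// names):
//
//   XXH32_state_t* const state = XXH32_createState();
//   XXH32_reset(state, /*seed=*/0);
//   XXH32_update(state, data, static_cast<uint32_t>(size));
//   XXH32_update(state, trailer, 1);   // extend the hash to cover block type
//   const uint32_t checksum = XXH32_digest(state);
//   XXH32_freeState(state);            // streaming state must be freed
// ----------------------------------------------------------------------------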
-#include "table/block_based_table_factory.h" - -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include +#include #include #include @@ -24,8 +18,9 @@ #include "rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/flush_block_policy.h" -#include "table/block_based_table_builder.h" -#include "table/block_based_table_reader.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_reader.h" #include "table/format.h" #include "util/mutexlock.h" #include "util/string_util.h" @@ -172,7 +167,12 @@ BlockBasedTableFactory::BlockBasedTableFactory( if (table_options_.no_block_cache) { table_options_.block_cache.reset(); } else if (table_options_.block_cache == nullptr) { - table_options_.block_cache = NewLRUCache(8 << 20); + LRUCacheOptions co; + co.capacity = 8 << 20; + // It makes little sense to pay overhead for mid-point insertion while the + // block size is only 8MB. + co.high_pri_pool_ratio = 0.0; + table_options_.block_cache = NewLRUCache(co); } if (table_options_.block_size_deviation < 0 || table_options_.block_size_deviation > 100) { @@ -203,7 +203,8 @@ Status BlockBasedTableFactory::NewTableReader( file_size, table_reader, table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, table_reader_options.level, table_reader_options.immortal, - table_reader_options.largest_seqno, &tail_prefetch_stats_); + table_reader_options.largest_seqno, &tail_prefetch_stats_, + table_reader_options.block_cache_tracer); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( @@ -217,7 +218,7 @@ TableBuilder* BlockBasedTableFactory::NewTableBuilder( table_builder_options.sample_for_compression, table_builder_options.compression_opts, table_builder_options.skip_filters, - table_builder_options.column_family_name, + table_builder_options.column_family_name, table_builder_options.level, table_builder_options.creation_time, table_builder_options.oldest_key_time, table_builder_options.target_file_size, @@ -227,7 +228,7 @@ TableBuilder* BlockBasedTableFactory::NewTableBuilder( } Status BlockBasedTableFactory::SanitizeOptions( - const DBOptions& /*db_opts*/, const ColumnFamilyOptions& cf_opts) const { + const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const { if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && cf_opts.prefix_extractor == nullptr) { return Status::InvalidArgument( @@ -261,6 +262,10 @@ Status BlockBasedTableFactory::SanitizeOptions( return Status::InvalidArgument( "Block alignment requested but block size is not a power of 2"); } + if (table_options_.block_size > port::kMaxUint32) { + return Status::InvalidArgument( + "block size exceeds maximum number (4GiB) allowed"); + } if (table_options_.data_block_index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash && table_options_.data_block_hash_table_util_ratio <= 0) { @@ -268,6 +273,12 @@ Status BlockBasedTableFactory::SanitizeOptions( "data_block_hash_table_util_ratio should be greater than 0 when " "data_block_index_type is set to kDataBlockBinaryAndHash"); } + if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) { + // TODO(myabandeh): support it + return Status::InvalidArgument( + "max_successive_merges larger than 0 is currently inconsistent with " + "unordered_write"); + } return Status::OK(); } @@ -499,8 +510,8 @@ std::string ParseBlockBasedTableOption(const std::string& name, if (pos == 
std::string::npos) { return "Invalid filter policy config, missing bits_per_key"; } - int bits_per_key = - ParseInt(trim(value.substr(kName.size(), pos - kName.size()))); + double bits_per_key = + ParseDouble(trim(value.substr(kName.size(), pos - kName.size()))); bool use_block_based_builder = ParseBoolean("use_block_based_builder", trim(value.substr(pos + 1))); new_options->filter_policy.reset( diff --git a/table/block_based_table_factory.h b/table/block_based/block_based_table_factory.h similarity index 100% rename from table/block_based_table_factory.h rename to table/block_based/block_based_table_factory.h diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc new file mode 100644 index 00000000000..ffdbdb1bd5d --- /dev/null +++ b/table/block_based/block_based_table_reader.cc @@ -0,0 +1,4409 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/block_based_table_reader.h" + +#include +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" + +#include "file/file_prefetch_buffer.h" +#include "file/random_access_file_reader.h" + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" + +#include "table/block_based/block.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/partitioned_filter_block.h" +#include "table/block_fetcher.h" +#include "table/format.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" +#include "table/multiget_context.h" +#include "table/persistent_cache_helper.h" +#include "table/sst_file_writer_collectors.h" +#include "table/two_level_iterator.h" + +#include "monitoring/perf_context_imp.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "util/xxhash.h" + +namespace rocksdb { + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const std::string kHashIndexPrefixesBlock; +extern const std::string kHashIndexPrefixesMetadataBlock; + +typedef BlockBasedTable::IndexReader IndexReader; + +// Found that 256 KB readahead size provides the best performance, based on +// experiments, for auto readahead. Experiment data is in PR #3282. 
+const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024; + +BlockBasedTable::~BlockBasedTable() { + delete rep_; +} + +std::atomic BlockBasedTable::next_cache_key_id_(0); + +template +class BlocklikeTraits; + +template <> +class BlocklikeTraits { + public: + static BlockContents* Create(BlockContents&& contents, + SequenceNumber /* global_seqno */, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool /* using_zstd */, + const FilterPolicy* /* filter_policy */) { + return new BlockContents(std::move(contents)); + } + + static uint32_t GetNumRestarts(const BlockContents& /* contents */) { + return 0; + } +}; + +template <> +class BlocklikeTraits { + public: + static ParsedFullFilterBlock* Create(BlockContents&& contents, + SequenceNumber /* global_seqno */, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool /* using_zstd */, + const FilterPolicy* filter_policy) { + return new ParsedFullFilterBlock(filter_policy, std::move(contents)); + } + + static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) { + return 0; + } +}; + +template <> +class BlocklikeTraits { + public: + static Block* Create(BlockContents&& contents, SequenceNumber global_seqno, + size_t read_amp_bytes_per_bit, Statistics* statistics, + bool /* using_zstd */, + const FilterPolicy* /* filter_policy */) { + return new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit, + statistics); + } + + static uint32_t GetNumRestarts(const Block& block) { + return block.NumRestarts(); + } +}; + +template <> +class BlocklikeTraits { + public: + static UncompressionDict* Create(BlockContents&& contents, + SequenceNumber /* global_seqno */, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool using_zstd, + const FilterPolicy* /* filter_policy */) { + return new UncompressionDict(contents.data, std::move(contents.allocation), + using_zstd); + } + + static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) { + return 0; + } +}; + +namespace { +// Read the block identified by "handle" from "file". +// The only relevant option is options.verify_checksums for now. +// On failure return non-OK. +// On success fill *result and return OK - caller owns *result +// @param uncompression_dict Data for presetting the compression library's +// dictionary. 
+template +Status ReadBlockFromFile( + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ReadOptions& options, const BlockHandle& handle, + std::unique_ptr* result, const ImmutableCFOptions& ioptions, + bool do_uncompress, bool maybe_compressed, BlockType block_type, + const UncompressionDict& uncompression_dict, + const PersistentCacheOptions& cache_options, SequenceNumber global_seqno, + size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, + bool for_compaction, bool using_zstd, const FilterPolicy* filter_policy) { + assert(result); + + BlockContents contents; + BlockFetcher block_fetcher( + file, prefetch_buffer, footer, options, handle, &contents, ioptions, + do_uncompress, maybe_compressed, block_type, uncompression_dict, + cache_options, memory_allocator, nullptr, for_compaction); + Status s = block_fetcher.ReadBlockContents(); + if (s.ok()) { + result->reset(BlocklikeTraits::Create( + std::move(contents), global_seqno, read_amp_bytes_per_bit, + ioptions.statistics, using_zstd, filter_policy)); + } + + return s; +} + +inline MemoryAllocator* GetMemoryAllocator( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache.get() + ? table_options.block_cache->memory_allocator() + : nullptr; +} + +inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache_compressed.get() + ? table_options.block_cache_compressed->memory_allocator() + : nullptr; +} + +// Delete the entry resided in the cache. +template +void DeleteCachedEntry(const Slice& /*key*/, void* value) { + auto entry = reinterpret_cast(value); + delete entry; +} + +// Release the cached entry and decrement its ref count. +void ForceReleaseCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast(arg); + Cache::Handle* handle = reinterpret_cast(h); + cache->Release(handle, true /* force_erase */); +} + +// Release the cached entry and decrement its ref count. +// Do not force erase +void ReleaseCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast(arg); + Cache::Handle* handle = reinterpret_cast(h); + cache->Release(handle, false /* force_erase */); +} + +// For hash based index, return true if prefix_extractor and +// prefix_extractor_block mismatch, false otherwise. This flag will be used +// as total_order_seek via NewIndexIterator +bool PrefixExtractorChanged(const TableProperties* table_properties, + const SliceTransform* prefix_extractor) { + // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set. + // Turn off hash index in prefix_extractor is not set; if prefix_extractor + // is set but prefix_extractor_block is not set, also disable hash index + if (prefix_extractor == nullptr || table_properties == nullptr || + table_properties->prefix_extractor_name.empty()) { + return true; + } + + // prefix_extractor and prefix_extractor_block are both non-empty + if (table_properties->prefix_extractor_name.compare( + prefix_extractor->Name()) != 0) { + return true; + } else { + return false; + } +} + +CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) { + CacheAllocationPtr heap_buf; + heap_buf = AllocateBlock(buf.size(), allocator); + memcpy(heap_buf.get(), buf.data(), buf.size()); + return heap_buf; +} + +} // namespace + +// Encapsulates common functionality for the various index reader +// implementations. 
Provides access to the index block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { + public: + IndexReaderCommon(const BlockBasedTable* t, + CachableEntry&& index_block) + : table_(t), index_block_(std::move(index_block)) { + assert(table_ != nullptr); + } + + protected: + static Status ReadIndexBlock(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block); + + const BlockBasedTable* table() const { return table_; } + + const InternalKeyComparator* internal_comparator() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + + return &table_->get_rep()->internal_comparator; + } + + bool index_has_first_key() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_has_first_key; + } + + bool index_key_includes_seq() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_key_includes_seq; + } + + bool index_value_is_full() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_value_is_full; + } + + bool cache_index_blocks() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->table_options.cache_index_and_filter_blocks; + } + + Status GetOrReadIndexBlock(bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block) const; + + size_t ApproximateIndexBlockMemoryUsage() const { + assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr); + return index_block_.GetOwnValue() + ? index_block_.GetValue()->ApproximateMemoryUsage() + : 0; + } + + private: + const BlockBasedTable* table_; + CachableEntry index_block_; +}; + +Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block) { + PERF_TIMER_GUARD(read_index_block_nanos); + + assert(table != nullptr); + assert(index_block != nullptr); + assert(index_block->IsEmpty()); + + const Rep* const rep = table->get_rep(); + assert(rep != nullptr); + + const Status s = table->RetrieveBlock( + prefetch_buffer, read_options, rep->footer.index_handle(), + UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex, + get_context, lookup_context, /* for_compaction */ false, use_cache); + + return s; +} + +Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block) const { + assert(index_block != nullptr); + + if (!index_block_.IsEmpty()) { + index_block->SetUnownedValue(index_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options, + cache_index_blocks(), get_context, lookup_context, + index_block); +} + +// Index that allows binary search lookup in a two-level index structure. 
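// --- Illustrative aside (editorial, not part of the patch) -----------------
// Rough shape of the two-level lookup implemented by PartitionIndexReader
// below: the top-level index block maps a key to the handle of an index
// partition, and that partition (itself an index block) maps the key to a
// data-block handle. The iterator names are hypothetical placeholders for the
// iterators NewIterator() wires together via NewTwoLevelIterator():
//
//   top_level_iter->Seek(target);    // yields the partition's BlockHandle
//   partition_iter->Seek(target);    // within the partition
//   BlockHandle data_handle = partition_iter->value().handle;
// ----------------------------------------------------------------------------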
+class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + // Read the partition index from the file and create an instance for + // `PartitionIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified. + static Status Create(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + index_reader->reset( + new PartitionIndexReader(table, std::move(index_block))); + + return Status::OK(); + } + + // return a two-level iterator: first level is on the partition index + InternalIteratorBase* NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + + InternalIteratorBase* it = nullptr; + + Statistics* kNullStats = nullptr; + // Filters are already checked before seeking the index + if (!partition_map_.empty()) { + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + it = NewTwoLevelIterator( + new BlockBasedTable::PartitionedIndexIteratorState(table(), + &partition_map_), + index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), + nullptr, kNullStats, true, index_has_first_key(), + index_key_includes_seq(), index_value_is_full())); + } else { + ReadOptions ro; + ro.fill_cache = read_options.fill_cache; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + it = new BlockBasedTableIterator( + table(), ro, *internal_comparator(), + index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), + nullptr, kNullStats, true, index_has_first_key(), + index_key_includes_seq(), index_value_is_full()), + false, true, /* prefix_extractor */ nullptr, BlockType::kIndex, + lookup_context ? lookup_context->caller + : TableReaderCaller::kUncategorized); + } + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + + // TODO(myabandeh): Update TwoLevelIterator to be able to make use of + // on-stack BlockIter while the state is on heap. Currentlly it assumes + // the first level iter is always on heap and will attempt to delete it + // in its destructor. 
+ } + + void CacheDependencies(bool pin) override { + // Before read partitions, prefetch them to avoid lots of IOs + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + const BlockBasedTable::Rep* rep = table()->rep_; + IndexBlockIter biter; + BlockHandle handle; + Statistics* kNullStats = nullptr; + + CachableEntry index_block; + Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */, + &lookup_context, &index_block); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Error retrieving top-level index block while trying to " + "cache index partitions: %s", + s.ToString().c_str()); + return; + } + + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), &biter, + kNullStats, true, index_has_first_key(), index_key_includes_seq(), + index_value_is_full()); + // Index partitions are assumed to be consecuitive. Prefetch them all. + // Read the first block offset + biter.SeekToFirst(); + if (!biter.Valid()) { + // Empty index. + return; + } + handle = biter.value().handle; + uint64_t prefetch_off = handle.offset(); + + // Read the last block's offset + biter.SeekToLast(); + if (!biter.Valid()) { + // Empty index. + return; + } + handle = biter.value().handle; + uint64_t last_off = handle.offset() + block_size(handle); + uint64_t prefetch_len = last_off - prefetch_off; + std::unique_ptr prefetch_buffer; + auto& file = rep->file; + prefetch_buffer.reset(new FilePrefetchBuffer()); + s = prefetch_buffer->Prefetch(file.get(), prefetch_off, + static_cast(prefetch_len)); + + // After prefetch, read the partitions one by one + biter.SeekToFirst(); + auto ro = ReadOptions(); + for (; biter.Valid(); biter.Next()) { + handle = biter.value().handle; + CachableEntry block; + // TODO: Support counter batch update for partitioned index and + // filter blocks + s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), + &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context, + /*contents=*/nullptr); + + assert(s.ok() || block.GetValue() == nullptr); + if (s.ok() && block.GetValue() != nullptr) { + if (block.IsCached()) { + if (pin) { + partition_map_[handle.offset()] = std::move(block); + } + } + } + } + } + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + // TODO(myabandeh): more accurate estimate of partition_map_ mem usage + return usage; + } + + private: + PartitionIndexReader(const BlockBasedTable* t, + CachableEntry&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} + + std::unordered_map> partition_map_; +}; + +// Index that allows binary search lookup for the first key of each block. +// This class can be viewed as a thin wrapper for `Block` class which already +// supports binary search. +class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + // Read index from the file and create an intance for + // `BinarySearchIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified. 
+ static Status Create(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + index_reader->reset( + new BinarySearchIndexReader(table, std::move(index_block))); + + return Status::OK(); + } + + InternalIteratorBase* NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + + Statistics* kNullStats = nullptr; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + auto it = index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), iter, + kNullStats, true, index_has_first_key(), index_key_includes_seq(), + index_value_is_full()); + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + } + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } + + private: + BinarySearchIndexReader(const BlockBasedTable* t, + CachableEntry&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} +}; + +// Index that leverages an internal hash table to quicken the lookup for a given +// key. +class HashIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + static Status Create(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_index_iter, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + assert(table != nullptr); + assert(index_reader != nullptr); + assert(!pin || prefetch); + + const BlockBasedTable::Rep* rep = table->get_rep(); + assert(rep != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + // Note, failure to create prefix hash index does not need to be a + // hard error. We can still fall back to the original binary search index. + // So, Create will succeed regardless, from this point on. 
+ + index_reader->reset(new HashIndexReader(table, std::move(index_block))); + + // Get prefixes block + BlockHandle prefixes_handle; + Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, + &prefixes_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + // Get index metadata block + BlockHandle prefixes_meta_handle; + s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock, + &prefixes_meta_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + RandomAccessFileReader* const file = rep->file.get(); + const Footer& footer = rep->footer; + const ImmutableCFOptions& ioptions = rep->ioptions; + const PersistentCacheOptions& cache_options = rep->persistent_cache_options; + MemoryAllocator* const memory_allocator = + GetMemoryAllocator(rep->table_options); + + // Read contents for the blocks + BlockContents prefixes_contents; + BlockFetcher prefixes_block_fetcher( + file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, + &prefixes_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, BlockType::kHashIndexPrefixes, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = prefixes_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + return s; + } + BlockContents prefixes_meta_contents; + BlockFetcher prefixes_meta_block_fetcher( + file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, + &prefixes_meta_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, BlockType::kHashIndexMetadata, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = prefixes_meta_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + BlockPrefixIndex* prefix_index = nullptr; + s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(), + prefixes_contents.data, + prefixes_meta_contents.data, &prefix_index); + // TODO: log error + if (s.ok()) { + HashIndexReader* const hash_index_reader = + static_cast(index_reader->get()); + hash_index_reader->prefix_index_.reset(prefix_index); + } + + return Status::OK(); + } + + InternalIteratorBase* NewIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + + Statistics* kNullStats = nullptr; + const bool total_order_seek = + read_options.total_order_seek || disable_prefix_seek; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. 
+ auto it = index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), iter, + kNullStats, total_order_seek, index_has_first_key(), + index_key_includes_seq(), index_value_is_full(), + false /* block_contents_pinned */, prefix_index_.get()); + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + } + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + if (prefix_index_) { + usage += prefix_index_->ApproximateMemoryUsage(); + } + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } + + private: + HashIndexReader(const BlockBasedTable* t, CachableEntry&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} + + std::unique_ptr prefix_index_; +}; + +void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, + GetContext* get_context, + size_t usage) const { + Statistics* const statistics = rep_->ioptions.statistics; + + PERF_COUNTER_ADD(block_cache_hit_count, 1); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, + static_cast(rep_->level)); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_hit; + get_context->get_context_stats_.num_cache_bytes_read += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_HIT); + RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage); + } + + switch (block_type) { + case BlockType::kFilter: + PERF_COUNTER_ADD(block_cache_filter_hit_count, 1); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_HIT); + } + break; + + case BlockType::kCompressionDictionary: + // TODO: introduce perf counter for compression dictionary hit count + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT); + } + break; + + case BlockType::kIndex: + PERF_COUNTER_ADD(block_cache_index_hit_count, 1); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_HIT); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_HIT); + } + break; + } +} + +void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type, + GetContext* get_context) const { + Statistics* const statistics = rep_->ioptions.statistics; + + // TODO: introduce aggregate (not per-level) block cache miss count + PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, + static_cast(rep_->level)); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_MISS); + } + + // TODO: introduce perf counters for misses per block type + switch (block_type) { + case BlockType::kFilter: + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_MISS); + } + break; + + case BlockType::kCompressionDictionary: + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS); + } + break; + + case BlockType::kIndex: + if (get_context) { + 
++get_context->get_context_stats_.num_cache_index_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_MISS); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_MISS); + } + break; + } +} + +void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type, + GetContext* get_context, + size_t usage) const { + Statistics* const statistics = rep_->ioptions.statistics; + + // TODO: introduce perf counters for block cache insertions + if (get_context) { + ++get_context->get_context_stats_.num_cache_add; + get_context->get_context_stats_.num_cache_bytes_write += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); + } + + switch (block_type) { + case BlockType::kFilter: + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_add; + get_context->get_context_stats_.num_cache_filter_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_ADD); + RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage); + } + break; + + case BlockType::kCompressionDictionary: + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_add; + get_context->get_context_stats_ + .num_cache_compression_dict_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + usage); + } + break; + + case BlockType::kIndex: + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_add; + get_context->get_context_stats_.num_cache_index_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); + RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_add; + get_context->get_context_stats_.num_cache_data_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_ADD); + RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage); + } + break; + } +} + +Cache::Handle* BlockBasedTable::GetEntryFromCache( + Cache* block_cache, const Slice& key, BlockType block_type, + GetContext* get_context) const { + auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics); + + if (cache_handle != nullptr) { + UpdateCacheHitMetrics(block_type, get_context, + block_cache->GetUsage(cache_handle)); + } else { + UpdateCacheMissMetrics(block_type, get_context); + } + + return cache_handle; +} + +// Helper function to setup the cache key's prefix for the Table. 
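// --- Illustrative aside (editorial, not part of the patch) -----------------
// The prefixes set up below combine with a block's offset to form the final
// block cache key: GetCacheKey() further down copies the per-file prefix and
// then appends the block handle's offset as a varint64. A minimal sketch of
// that layout, using only names that appear in the surrounding code:
//
//   char buf[kMaxCacheKeyPrefixSize + 10 /* max varint64 length */];
//   memcpy(buf, rep->cache_key_prefix, rep->cache_key_prefix_size);
//   char* end =
//       EncodeVarint64(buf + rep->cache_key_prefix_size, handle.offset());
//   Slice cache_key(buf, static_cast<size_t>(end - buf));
// ----------------------------------------------------------------------------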
+void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { + assert(kMaxCacheKeyPrefixSize >= 10); + rep->cache_key_prefix_size = 0; + rep->compressed_cache_key_prefix_size = 0; + if (rep->table_options.block_cache != nullptr) { + GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(), + &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); + } + if (rep->table_options.persistent_cache != nullptr) { + GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(), + &rep->persistent_cache_key_prefix[0], + &rep->persistent_cache_key_prefix_size); + } + if (rep->table_options.block_cache_compressed != nullptr) { + GenerateCachePrefix(rep->table_options.block_cache_compressed.get(), + rep->file->file(), &rep->compressed_cache_key_prefix[0], + &rep->compressed_cache_key_prefix_size); + } +} + +void BlockBasedTable::GenerateCachePrefix(Cache* cc, RandomAccessFile* file, + char* buffer, size_t* size) { + // generate an id from the file + *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); + + // If the prefix wasn't generated or was too long, + // create one from the cache. + if (cc != nullptr && *size == 0) { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast(end - buffer); + } +} + +void BlockBasedTable::GenerateCachePrefix(Cache* cc, WritableFile* file, + char* buffer, size_t* size) { + // generate an id from the file + *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); + + // If the prefix wasn't generated or was too long, + // create one from the cache. + if (cc != nullptr && *size == 0) { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast(end - buffer); + } +} + +namespace { +// Return True if table_properties has `user_prop_name` has a `true` value +// or it doesn't contain this property (for backward compatible). +bool IsFeatureSupported(const TableProperties& table_properties, + const std::string& user_prop_name, Logger* info_log) { + auto& props = table_properties.user_collected_properties; + auto pos = props.find(user_prop_name); + // Older version doesn't have this value set. Skip this check. + if (pos != props.end()) { + if (pos->second == kPropFalse) { + return false; + } else if (pos->second != kPropTrue) { + ROCKS_LOG_WARN(info_log, "Property %s has invalidate value %s", + user_prop_name.c_str(), pos->second.c_str()); + } + } + return true; +} + +// Caller has to ensure seqno is not nullptr. +Status GetGlobalSequenceNumber(const TableProperties& table_properties, + SequenceNumber largest_seqno, + SequenceNumber* seqno) { + const auto& props = table_properties.user_collected_properties; + const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion); + const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno); + + *seqno = kDisableGlobalSequenceNumber; + if (version_pos == props.end()) { + if (seqno_pos != props.end()) { + std::array msg_buf; + // This is not an external sst file, global_seqno is not supported. + snprintf( + msg_buf.data(), msg_buf.max_size(), + "A non-external sst file have global seqno property with value %s", + seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); + } + return Status::OK(); + } + + uint32_t version = DecodeFixed32(version_pos->second.c_str()); + if (version < 2) { + if (seqno_pos != props.end() || version != 1) { + std::array msg_buf; + // This is a v1 external sst file, global_seqno is not supported. 
+ snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno " + "property with value %s", + version, seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); + } + return Status::OK(); + } + + // Since we have a plan to deprecate global_seqno, we do not return failure + // if seqno_pos == props.end(). We rely on version_pos to detect whether the + // SST is external. + SequenceNumber global_seqno(0); + if (seqno_pos != props.end()) { + global_seqno = DecodeFixed64(seqno_pos->second.c_str()); + } + // SstTableReader open table reader with kMaxSequenceNumber as largest_seqno + // to denote it is unknown. + if (largest_seqno < kMaxSequenceNumber) { + if (global_seqno == 0) { + global_seqno = largest_seqno; + } + if (global_seqno != largest_seqno) { + std::array msg_buf; + snprintf( + msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno property " + "with value %s, while largest seqno in the file is %llu", + version, seqno_pos->second.c_str(), + static_cast(largest_seqno)); + return Status::Corruption(msg_buf.data()); + } + } + *seqno = global_seqno; + + if (global_seqno > kMaxSequenceNumber) { + std::array msg_buf; + snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno property " + "with value %llu, which is greater than kMaxSequenceNumber", + version, static_cast(global_seqno)); + return Status::Corruption(msg_buf.data()); + } + + return Status::OK(); +} +} // namespace + +Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix, + size_t cache_key_prefix_size, + const BlockHandle& handle, char* cache_key) { + assert(cache_key != nullptr); + assert(cache_key_prefix_size != 0); + assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize); + memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); + char* end = + EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset()); + return Slice(cache_key, static_cast(end - cache_key)); +} + +Status BlockBasedTable::Open( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, + const SliceTransform* prefix_extractor, + const bool prefetch_index_and_filter_in_cache, const bool skip_filters, + const int level, const bool immortal_table, + const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats, + BlockCacheTracer* const block_cache_tracer) { + table_reader->reset(); + + Status s; + Footer footer; + std::unique_ptr prefetch_buffer; + + // prefetch both index and filters, down to all partitions + const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; + const bool preload_all = !table_options.cache_index_and_filter_blocks; + + s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all, + preload_all, &prefetch_buffer); + + // Read in the following order: + // 1. Footer + // 2. [metaindex block] + // 3. [meta block: properties] + // 4. [meta block: range deletion tombstone] + // 5. [meta block: compression dictionary] + // 6. [meta block: index] + // 7. [meta block: filter] + s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer, + kBlockBasedTableMagicNumber); + if (!s.ok()) { + return s; + } + if (!BlockBasedTableSupportedVersion(footer.version())) { + return Status::Corruption( + "Unknown Footer version. 
Maybe this file was created with newer " + "version of RocksDB?"); + } + + // We've successfully read the footer. We are ready to serve requests. + // Better not mutate rep_ after the creation. eg. internal_prefix_transform + // raw pointer will be used to create HashIndexReader, whose reset may + // access a dangling pointer. + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, + internal_comparator, skip_filters, level, + immortal_table); + rep->file = std::move(file); + rep->footer = footer; + rep->hash_index_allow_collision = table_options.hash_index_allow_collision; + // We need to wrap data with internal_prefix_transform to make sure it can + // handle prefix correctly. + rep->internal_prefix_transform.reset( + new InternalKeySliceTransform(prefix_extractor)); + SetupCacheKeyPrefix(rep); + std::unique_ptr new_table( + new BlockBasedTable(rep, block_cache_tracer)); + + // page cache options + rep->persistent_cache_options = + PersistentCacheOptions(rep->table_options.persistent_cache, + std::string(rep->persistent_cache_key_prefix, + rep->persistent_cache_key_prefix_size), + rep->ioptions.statistics); + + // Meta-blocks are not dictionary compressed. Explicitly set the dictionary + // handle to null, otherwise it may be seen as uninitialized during the below + // meta-block reads. + rep->compression_dict_handle = BlockHandle::NullBlockHandle(); + + // Read metaindex + std::unique_ptr metaindex; + std::unique_ptr metaindex_iter; + s = new_table->ReadMetaIndexBlock(prefetch_buffer.get(), &metaindex, + &metaindex_iter); + if (!s.ok()) { + return s; + } + + // Populates table_properties and some fields that depend on it, + // such as index_type. + s = new_table->ReadPropertiesBlock(prefetch_buffer.get(), + metaindex_iter.get(), largest_seqno); + if (!s.ok()) { + return s; + } + s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), metaindex_iter.get(), + internal_comparator, &lookup_context); + if (!s.ok()) { + return s; + } + s = new_table->PrefetchIndexAndFilterBlocks( + prefetch_buffer.get(), metaindex_iter.get(), new_table.get(), + prefetch_all, table_options, level, &lookup_context); + + if (s.ok()) { + // Update tail prefetch stats + assert(prefetch_buffer.get() != nullptr); + if (tail_prefetch_stats != nullptr) { + assert(prefetch_buffer->min_offset_read() < file_size); + tail_prefetch_stats->RecordEffectiveSize( + static_cast(file_size) - prefetch_buffer->min_offset_read()); + } + + *table_reader = std::move(new_table); + } + + return s; +} + +Status BlockBasedTable::PrefetchTail( + RandomAccessFileReader* file, uint64_t file_size, + TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, + const bool preload_all, + std::unique_ptr* prefetch_buffer) { + size_t tail_prefetch_size = 0; + if (tail_prefetch_stats != nullptr) { + // Multiple threads may get a 0 (no history) when running in parallel, + // but it will get cleared after the first of them finishes. + tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); + } + if (tail_prefetch_size == 0) { + // Before read footer, readahead backwards to prefetch data. Do more + // readahead if we're going to read index/filter. + // TODO: This may incorrectly select small readahead in case partitioned + // index/filter is enabled and top-level partition pinning is enabled. + // That's because we need to issue readahead before we read the properties, + // at which point we don't yet know the index type. 
+ tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; + } + size_t prefetch_off; + size_t prefetch_len; + if (file_size < tail_prefetch_size) { + prefetch_off = 0; + prefetch_len = static_cast(file_size); + } else { + prefetch_off = static_cast(file_size - tail_prefetch_size); + prefetch_len = tail_prefetch_size; + } + TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen", + &tail_prefetch_size); + Status s; + // TODO should not have this special logic in the future. + if (!file->use_direct_io()) { + prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, false, true)); + s = file->Prefetch(prefetch_off, prefetch_len); + } else { + prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, true, true)); + s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len); + } + return s; +} + +Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len, + uint32_t expected) { + Status s; + uint32_t actual = 0; + switch (type) { + case kNoChecksum: + break; + case kCRC32c: + expected = crc32c::Unmask(expected); + actual = crc32c::Value(buf, len); + break; + case kxxHash: + actual = XXH32(buf, static_cast(len), 0); + break; + case kxxHash64: + actual = static_cast(XXH64(buf, static_cast(len), 0) & + uint64_t{0xffffffff}); + break; + default: + s = Status::Corruption("unknown checksum type"); + } + if (s.ok() && actual != expected) { + s = Status::Corruption("properties block checksum mismatched"); + } + return s; +} + +Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno( + FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, + TableProperties** table_properties) { + assert(table_properties != nullptr); + // If this is an external SST file ingested with write_global_seqno set to + // true, then we expect the checksum mismatch because checksum was written + // by SstFileWriter, but its global seqno in the properties block may have + // been changed during ingestion. In this case, we read the properties + // block, copy it to a memory buffer, change the global seqno to its + // original value, i.e. 0, and verify the checksum again. 
+ BlockHandle props_block_handle; + CacheAllocationPtr tmp_buf; + Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer, + rep_->footer, rep_->ioptions, table_properties, + false /* verify_checksum */, &props_block_handle, + &tmp_buf, false /* compression_type_missing */, + nullptr /* memory_allocator */); + if (s.ok() && tmp_buf) { + const auto seqno_pos_iter = + (*table_properties) + ->properties_offsets.find( + ExternalSstFilePropertyNames::kGlobalSeqno); + size_t block_size = static_cast(props_block_handle.size()); + if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) { + uint64_t global_seqno_offset = seqno_pos_iter->second; + EncodeFixed64( + tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0); + } + uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1); + s = rocksdb::VerifyChecksum(rep_->footer.checksum(), tmp_buf.get(), + block_size + 1, value); + } + return s; +} + +Status BlockBasedTable::ReadPropertiesBlock( + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + const SequenceNumber largest_seqno) { + bool found_properties_block = true; + Status s; + s = SeekToPropertiesBlock(meta_iter, &found_properties_block); + + if (!s.ok()) { + ROCKS_LOG_WARN(rep_->ioptions.info_log, + "Error when seeking to properties block from file: %s", + s.ToString().c_str()); + } else if (found_properties_block) { + s = meta_iter->status(); + TableProperties* table_properties = nullptr; + if (s.ok()) { + s = ReadProperties( + meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer, + rep_->ioptions, &table_properties, true /* verify_checksum */, + nullptr /* ret_block_handle */, nullptr /* ret_block_contents */, + false /* compression_type_missing */, nullptr /* memory_allocator */); + } + + if (s.IsCorruption()) { + s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(), + &table_properties); + } + std::unique_ptr props_guard; + if (table_properties != nullptr) { + props_guard.reset(table_properties); + } + + if (!s.ok()) { + ROCKS_LOG_WARN(rep_->ioptions.info_log, + "Encountered error while reading data from properties " + "block %s", + s.ToString().c_str()); + } else { + assert(table_properties != nullptr); + rep_->table_properties.reset(props_guard.release()); + rep_->blocks_maybe_compressed = + rep_->table_properties->compression_name != + CompressionTypeToString(kNoCompression); + rep_->blocks_definitely_zstd_compressed = + (rep_->table_properties->compression_name == + CompressionTypeToString(kZSTD) || + rep_->table_properties->compression_name == + CompressionTypeToString(kZSTDNotFinalCompression)); + } + } else { + ROCKS_LOG_ERROR(rep_->ioptions.info_log, + "Cannot find Properties block from file."); + } +#ifndef ROCKSDB_LITE + if (rep_->table_properties) { + ParseSliceTransform(rep_->table_properties->prefix_extractor_name, + &(rep_->table_prefix_extractor)); + } +#endif // ROCKSDB_LITE + + // Read the table properties, if provided. 
+ if (rep_->table_properties) { + rep_->whole_key_filtering &= + IsFeatureSupported(*(rep_->table_properties), + BlockBasedTablePropertyNames::kWholeKeyFiltering, + rep_->ioptions.info_log); + rep_->prefix_filtering &= + IsFeatureSupported(*(rep_->table_properties), + BlockBasedTablePropertyNames::kPrefixFiltering, + rep_->ioptions.info_log); + + rep_->index_key_includes_seq = + rep_->table_properties->index_key_is_user_key == 0; + rep_->index_value_is_full = + rep_->table_properties->index_value_is_delta_encoded == 0; + + // Update index_type with the true type. + // If table properties don't contain index type, we assume that the table + // is in very old format and has kBinarySearch index type. + auto& props = rep_->table_properties->user_collected_properties; + auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (pos != props.end()) { + rep_->index_type = static_cast( + DecodeFixed32(pos->second.c_str())); + } + + rep_->index_has_first_key = + rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey; + + s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno, + &(rep_->global_seqno)); + if (!s.ok()) { + ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str()); + } + } + return s; +} + +Status BlockBasedTable::ReadRangeDelBlock( + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + const InternalKeyComparator& internal_comparator, + BlockCacheLookupContext* lookup_context) { + Status s; + bool found_range_del_block; + BlockHandle range_del_handle; + s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle); + if (!s.ok()) { + ROCKS_LOG_WARN( + rep_->ioptions.info_log, + "Error when seeking to range delete tombstones block from file: %s", + s.ToString().c_str()); + } else if (found_range_del_block && !range_del_handle.IsNull()) { + ReadOptions read_options; + std::unique_ptr iter(NewDataBlockIterator( + read_options, range_del_handle, + /*input_iter=*/nullptr, BlockType::kRangeDeletion, + /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer)); + assert(iter != nullptr); + s = iter->status(); + if (!s.ok()) { + ROCKS_LOG_WARN( + rep_->ioptions.info_log, + "Encountered error while reading data from range del block %s", + s.ToString().c_str()); + } else { + rep_->fragmented_range_dels = + std::make_shared(std::move(iter), + internal_comparator); + } + } + return s; +} + +Status BlockBasedTable::PrefetchIndexAndFilterBlocks( + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + BlockBasedTable* new_table, bool prefetch_all, + const BlockBasedTableOptions& table_options, const int level, + BlockCacheLookupContext* lookup_context) { + Status s; + + // Find filter handle and filter type + if (rep_->filter_policy) { + for (auto filter_type : + {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter, + Rep::FilterType::kBlockFilter}) { + std::string prefix; + switch (filter_type) { + case Rep::FilterType::kFullFilter: + prefix = kFullFilterBlockPrefix; + break; + case Rep::FilterType::kPartitionedFilter: + prefix = kPartitionedFilterBlockPrefix; + break; + case Rep::FilterType::kBlockFilter: + prefix = kFilterBlockPrefix; + break; + default: + assert(0); + } + std::string filter_block_key = prefix; + filter_block_key.append(rep_->filter_policy->Name()); + if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle) + .ok()) { + rep_->filter_type = filter_type; + break; + } + } + } + + // Find compression dictionary handle + bool found_compression_dict = 
false;
+  s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict,
+                                 &rep_->compression_dict_handle);
+  if (!s.ok()) {
+    return s;
+  }
+
+  BlockBasedTableOptions::IndexType index_type = rep_->index_type;
+
+  const bool use_cache = table_options.cache_index_and_filter_blocks;
+
+  // pin both index and filters, down to all partitions
+  const bool pin_all =
+      rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
+
+  // prefetch the first level of index
+  const bool prefetch_index =
+      prefetch_all ||
+      (table_options.pin_top_level_index_and_filter &&
+       index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
+  // pin the first level of index
+  const bool pin_index =
+      pin_all || (table_options.pin_top_level_index_and_filter &&
+                  index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
+
+  std::unique_ptr<IndexReader> index_reader;
+  s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache,
+                                   prefetch_index, pin_index, lookup_context,
+                                   &index_reader);
+  if (!s.ok()) {
+    return s;
+  }
+
+  rep_->index_reader = std::move(index_reader);
+
+  // The partitions of a partitioned index are always stored in cache. They
+  // hence follow the configuration for pin and prefetch regardless of
+  // the value of cache_index_and_filter_blocks.
+  if (prefetch_all) {
+    rep_->index_reader->CacheDependencies(pin_all);
+  }
+
+  // prefetch the first level of filter
+  const bool prefetch_filter =
+      prefetch_all ||
+      (table_options.pin_top_level_index_and_filter &&
+       rep_->filter_type == Rep::FilterType::kPartitionedFilter);
+  // Partition filters cannot be enabled without partition indexes
+  assert(!prefetch_filter || prefetch_index);
+  // pin the first level of filter
+  const bool pin_filter =
+      pin_all || (table_options.pin_top_level_index_and_filter &&
+                  rep_->filter_type == Rep::FilterType::kPartitionedFilter);
+
+  if (rep_->filter_policy) {
+    auto filter = new_table->CreateFilterBlockReader(
+        prefetch_buffer, use_cache, prefetch_filter, pin_filter,
+        lookup_context);
+    if (filter) {
+      // Refer to the comment above about partitioned indexes always being
+      // cached
+      if (prefetch_all) {
+        filter->CacheDependencies(pin_all);
+      }
+
+      rep_->filter = std::move(filter);
+    }
+  }
+
+  if (!rep_->compression_dict_handle.IsNull()) {
+    std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
+    s = UncompressionDictReader::Create(this, prefetch_buffer, use_cache,
+                                        prefetch_all, pin_all, lookup_context,
+                                        &uncompression_dict_reader);
+    if (!s.ok()) {
+      return s;
+    }
+
+    rep_->uncompression_dict_reader = std::move(uncompression_dict_reader);
+  }
+
+  assert(s.ok());
+  return s;
+}
+
+void BlockBasedTable::SetupForCompaction() {
+  switch (rep_->ioptions.access_hint_on_compaction_start) {
+    case Options::NONE:
+      break;
+    case Options::NORMAL:
+      rep_->file->file()->Hint(RandomAccessFile::NORMAL);
+      break;
+    case Options::SEQUENTIAL:
+      rep_->file->file()->Hint(RandomAccessFile::SEQUENTIAL);
+      break;
+    case Options::WILLNEED:
+      rep_->file->file()->Hint(RandomAccessFile::WILLNEED);
+      break;
+    default:
+      assert(false);
+  }
+}
+
+std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
+    const {
+  return rep_->table_properties;
+}
+
+size_t BlockBasedTable::ApproximateMemoryUsage() const {
+  size_t usage = 0;
+  if (rep_->filter) {
+    usage += rep_->filter->ApproximateMemoryUsage();
+  }
+  if (rep_->index_reader) {
+    usage += rep_->index_reader->ApproximateMemoryUsage();
+  }
+  if (rep_->uncompression_dict_reader) {
+    usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage();
+  }
+  return usage;
+}
+
+// Load
the meta-index-block from the file. On success, return the loaded +// metaindex +// block and its iterator. +Status BlockBasedTable::ReadMetaIndexBlock( + FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr* metaindex_block, + std::unique_ptr* iter) { + // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates + // it is an empty block. + std::unique_ptr metaindex; + Status s = ReadBlockFromFile( + rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(), + rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions, + true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options, + kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, + GetMemoryAllocator(rep_->table_options), false /* for_compaction */, + rep_->blocks_definitely_zstd_compressed, nullptr /* filter_policy */); + + if (!s.ok()) { + ROCKS_LOG_ERROR(rep_->ioptions.info_log, + "Encountered error while reading data from properties" + " block %s", + s.ToString().c_str()); + return s; + } + + *metaindex_block = std::move(metaindex); + // meta block uses bytewise comparator. + iter->reset(metaindex_block->get()->NewDataIterator(BytewiseComparator(), + BytewiseComparator())); + return Status::OK(); +} + +template +Status BlockBasedTable::GetDataBlockFromCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, + const ReadOptions& read_options, CachableEntry* block, + const UncompressionDict& uncompression_dict, BlockType block_type, + GetContext* get_context) const { + const size_t read_amp_bytes_per_bit = + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0; + assert(block); + assert(block->IsEmpty()); + + Status s; + BlockContents* compressed_block = nullptr; + Cache::Handle* block_cache_compressed_handle = nullptr; + + // Lookup uncompressed cache first + if (block_cache != nullptr) { + auto cache_handle = GetEntryFromCache(block_cache, block_cache_key, + block_type, get_context); + if (cache_handle != nullptr) { + block->SetCachedValue( + reinterpret_cast(block_cache->Value(cache_handle)), + block_cache, cache_handle); + return s; + } + } + + // If not found, search from the compressed block cache. 
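+  // A hit in the compressed block cache is uncompressed below and, when an
+  // uncompressed block cache is configured and fill_cache is set, re-inserted
+  // there so later reads can skip decompression.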
+ assert(block->IsEmpty()); + + if (block_cache_compressed == nullptr) { + return s; + } + + assert(!compressed_block_cache_key.empty()); + block_cache_compressed_handle = + block_cache_compressed->Lookup(compressed_block_cache_key); + + Statistics* statistics = rep_->ioptions.statistics; + + // if we found in the compressed cache, then uncompress and insert into + // uncompressed cache + if (block_cache_compressed_handle == nullptr) { + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS); + return s; + } + + // found compressed block + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT); + compressed_block = reinterpret_cast( + block_cache_compressed->Value(block_cache_compressed_handle)); + CompressionType compression_type = compressed_block->get_compression_type(); + assert(compression_type != kNoCompression); + + // Retrieve the uncompressed contents into a new buffer + BlockContents contents; + UncompressionContext context(compression_type); + UncompressionInfo info(context, uncompression_dict, compression_type); + s = UncompressBlockContents( + info, compressed_block->data.data(), compressed_block->data.size(), + &contents, rep_->table_options.format_version, rep_->ioptions, + GetMemoryAllocator(rep_->table_options)); + + // Insert uncompressed block into block cache + if (s.ok()) { + std::unique_ptr block_holder( + BlocklikeTraits::Create( + std::move(contents), rep_->get_global_seqno(block_type), + read_amp_bytes_per_bit, statistics, + rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get())); // uncompressed block + + if (block_cache != nullptr && block_holder->own_bytes() && + read_options.fill_cache) { + size_t charge = block_holder->ApproximateMemoryUsage(); + Cache::Handle* cache_handle = nullptr; + s = block_cache->Insert(block_cache_key, block_holder.get(), charge, + &DeleteCachedEntry, &cache_handle); + if (s.ok()) { + assert(cache_handle != nullptr); + block->SetCachedValue(block_holder.release(), block_cache, + cache_handle); + + UpdateCacheInsertionMetrics(block_type, get_context, charge); + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + } + } else { + block->SetOwnedValue(block_holder.release()); + } + } + + // Release hold on compressed cache entry + block_cache_compressed->Release(block_cache_compressed_handle); + return s; +} + +template +Status BlockBasedTable::PutDataBlockToCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, + CachableEntry* cached_block, BlockContents* raw_block_contents, + CompressionType raw_block_comp_type, + const UncompressionDict& uncompression_dict, SequenceNumber seq_no, + MemoryAllocator* memory_allocator, BlockType block_type, + GetContext* get_context) const { + const ImmutableCFOptions& ioptions = rep_->ioptions; + const uint32_t format_version = rep_->table_options.format_version; + const size_t read_amp_bytes_per_bit = + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0; + const Cache::Priority priority = + rep_->table_options.cache_index_and_filter_blocks_with_high_priority && + (block_type == BlockType::kFilter || + block_type == BlockType::kCompressionDictionary || + block_type == BlockType::kIndex) + ? 
Cache::Priority::HIGH + : Cache::Priority::LOW; + assert(cached_block); + assert(cached_block->IsEmpty()); + + Status s; + Statistics* statistics = ioptions.statistics; + + std::unique_ptr block_holder; + if (raw_block_comp_type != kNoCompression) { + // Retrieve the uncompressed contents into a new buffer + BlockContents uncompressed_block_contents; + UncompressionContext context(raw_block_comp_type); + UncompressionInfo info(context, uncompression_dict, raw_block_comp_type); + s = UncompressBlockContents(info, raw_block_contents->data.data(), + raw_block_contents->data.size(), + &uncompressed_block_contents, format_version, + ioptions, memory_allocator); + if (!s.ok()) { + return s; + } + + block_holder.reset(BlocklikeTraits::Create( + std::move(uncompressed_block_contents), seq_no, read_amp_bytes_per_bit, + statistics, rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get())); + } else { + block_holder.reset(BlocklikeTraits::Create( + std::move(*raw_block_contents), seq_no, read_amp_bytes_per_bit, + statistics, rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get())); + } + + // Insert compressed block into compressed block cache. + // Release the hold on the compressed cache entry immediately. + if (block_cache_compressed != nullptr && + raw_block_comp_type != kNoCompression && raw_block_contents != nullptr && + raw_block_contents->own_bytes()) { +#ifndef NDEBUG + assert(raw_block_contents->is_raw_block); +#endif // NDEBUG + + // We cannot directly put raw_block_contents because this could point to + // an object in the stack. + BlockContents* block_cont_for_comp_cache = + new BlockContents(std::move(*raw_block_contents)); + s = block_cache_compressed->Insert( + compressed_block_cache_key, block_cont_for_comp_cache, + block_cont_for_comp_cache->ApproximateMemoryUsage(), + &DeleteCachedEntry); + if (s.ok()) { + // Avoid the following code to delete this cached block. 
+      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
+    } else {
+      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
+      delete block_cont_for_comp_cache;
+    }
+  }
+
+  // insert into uncompressed block cache
+  if (block_cache != nullptr && block_holder->own_bytes()) {
+    size_t charge = block_holder->ApproximateMemoryUsage();
+    Cache::Handle* cache_handle = nullptr;
+    s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
+                            &DeleteCachedEntry<TBlocklike>, &cache_handle,
+                            priority);
+    if (s.ok()) {
+      assert(cache_handle != nullptr);
+      cached_block->SetCachedValue(block_holder.release(), block_cache,
+                                   cache_handle);
+
+      UpdateCacheInsertionMetrics(block_type, get_context, charge);
+    } else {
+      RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
+    }
+  } else {
+    cached_block->SetOwnedValue(block_holder.release());
+  }
+
+  return s;
+}
+
+std::unique_ptr<FilterBlockReader> BlockBasedTable::CreateFilterBlockReader(
+    FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+    bool pin, BlockCacheLookupContext* lookup_context) {
+  auto& rep = rep_;
+  auto filter_type = rep->filter_type;
+  if (filter_type == Rep::FilterType::kNoFilter) {
+    return std::unique_ptr<FilterBlockReader>();
+  }
+
+  assert(rep->filter_policy);
+
+  switch (filter_type) {
+    case Rep::FilterType::kPartitionedFilter:
+      return PartitionedFilterBlockReader::Create(
+          this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
+
+    case Rep::FilterType::kBlockFilter:
+      return BlockBasedFilterBlockReader::Create(
+          this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
+
+    case Rep::FilterType::kFullFilter:
+      return FullFilterBlockReader::Create(this, prefetch_buffer, use_cache,
+                                           prefetch, pin, lookup_context);
+
+    default:
+      // filter_type is either kNoFilter (exited the function at the first if),
+      // or it must be covered in this switch block
+      assert(false);
+      return std::unique_ptr<FilterBlockReader>();
+  }
+}
+
+// disable_prefix_seek should be set to true when prefix_extractor found in SST
+// differs from the one in mutable_cf_options and index type is HashBasedIndex
+InternalIteratorBase<IndexValue>* BlockBasedTable::NewIndexIterator(
+    const ReadOptions& read_options, bool disable_prefix_seek,
+    IndexBlockIter* input_iter, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context) const {
+  assert(rep_ != nullptr);
+  assert(rep_->index_reader != nullptr);
+
+  // We don't return pinned data from index blocks, so no need
+  // to set `block_contents_pinned`.
+  return rep_->index_reader->NewIterator(read_options, disable_prefix_seek,
+                                         input_iter, get_context,
+                                         lookup_context);
+}
+
+// Convert an index iterator value (i.e., an encoded BlockHandle)
+// into an iterator over the contents of the corresponding block.
+// If input_iter is null, allocate a new iterator.
+// If input_iter is not null, update it in place and return it.
+template <typename TBlockIter>
+TBlockIter* BlockBasedTable::NewDataBlockIterator(
+    const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
+    BlockType block_type, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context, Status s,
+    FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const {
+  PERF_TIMER_GUARD(new_table_block_iter_nanos);
+
+  TBlockIter* iter = input_iter != nullptr ?
input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + CachableEntry uncompression_dict; + if (rep_->uncompression_dict_reader) { + const bool no_io = (ro.read_tier == kBlockCacheTier); + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + prefetch_buffer, no_io, get_context, lookup_context, + &uncompression_dict); + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + } + + const UncompressionDict& dict = uncompression_dict.GetValue() + ? *uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); + + CachableEntry block; + s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type, + get_context, lookup_context, for_compaction, + /* use_cache */ true); + + if (!s.ok()) { + assert(block.IsEmpty()); + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. + const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator(rep_, block.GetValue(), iter, + block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { + // insert a dummy record to block cache to track the memory usage + Cache* const block_cache = rep_->table_options.block_cache.get(); + Cache::Handle* cache_handle = nullptr; + // There are two other types of cache keys: 1) SST cache key added in + // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in + // `write_buffer_manager`. 
Use longer prefix (41 bytes) to differentiate
+      // from SST cache key (31 bytes), and use a non-zero prefix to
+      // differentiate from `write_buffer_manager`.
+      const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
+      char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
+      // Prefix: use rep_->cache_key_prefix padded by 0s
+      memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
+      assert(rep_->cache_key_prefix_size != 0);
+      assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
+      memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
+      char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
+                                 next_cache_key_id_++);
+      assert(end - cache_key <=
+             static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
+      const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
+      s = block_cache->Insert(unique_key, nullptr,
+                              block.GetValue()->ApproximateMemoryUsage(),
+                              nullptr, &cache_handle);
+
+      if (s.ok()) {
+        assert(cache_handle != nullptr);
+        iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
+                              cache_handle);
+      }
+    }
+  } else {
+    iter->SetCacheHandle(block.GetCacheHandle());
+  }
+
+  block.TransferTo(iter);
+
+  return iter;
+}
+
+template <>
+DataBlockIter* BlockBasedTable::InitBlockIterator<DataBlockIter>(
+    const Rep* rep, Block* block, DataBlockIter* input_iter,
+    bool block_contents_pinned) {
+  return block->NewDataIterator(
+      &rep->internal_comparator, rep->internal_comparator.user_comparator(),
+      input_iter, rep->ioptions.statistics, block_contents_pinned);
+}
+
+template <>
+IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>(
+    const Rep* rep, Block* block, IndexBlockIter* input_iter,
+    bool block_contents_pinned) {
+  return block->NewIndexIterator(
+      &rep->internal_comparator, rep->internal_comparator.user_comparator(),
+      input_iter, rep->ioptions.statistics, /* total_order_seek */ true,
+      rep->index_has_first_key, rep->index_key_includes_seq,
+      rep->index_value_is_full, block_contents_pinned);
+}
+
+// Convert an uncompressed data block (i.e., CachableEntry<Block>)
+// into an iterator over the contents of the corresponding block.
+// If input_iter is null, allocate a new iterator.
+// If input_iter is not null, update it in place and return it.
+template <typename TBlockIter>
+TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro,
+                                                  CachableEntry<Block>& block,
+                                                  TBlockIter* input_iter,
+                                                  Status s) const {
+  PERF_TIMER_GUARD(new_table_block_iter_nanos);
+
+  TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
+  if (!s.ok()) {
+    iter->Invalidate(s);
+    return iter;
+  }
+
+  assert(block.GetValue() != nullptr);
+  // Block contents are pinned and it is still pinned after the iterator
+  // is destroyed as long as cleanup functions are moved to another object,
+  // when:
+  // 1. block cache handle is set to be released in cleanup function, or
+  // 2. it's pointing to immortal source. If own_bytes is true then we are
+  //    not reading data from the original source, whether immortal or not.
+  //    Otherwise, the block is pinned iff the source is immortal.
+ const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator(rep_, block.GetValue(), iter, + block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { + // insert a dummy record to block cache to track the memory usage + Cache* const block_cache = rep_->table_options.block_cache.get(); + Cache::Handle* cache_handle = nullptr; + // There are two other types of cache keys: 1) SST cache key added in + // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in + // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate + // from SST cache key(31 bytes), and use non-zero prefix to + // differentiate from `write_buffer_manager` + const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; + char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; + // Prefix: use rep_->cache_key_prefix padded by 0s + memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); + assert(rep_->cache_key_prefix_size != 0); + assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix); + memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size); + char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, + next_cache_key_id_++); + assert(end - cache_key <= + static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length)); + const Slice unique_key(cache_key, static_cast(end - cache_key)); + s = block_cache->Insert(unique_key, nullptr, + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); + if (s.ok()) { + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); + } + } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); + } + + block.TransferTo(iter); + return iter; +} + +// If contents is nullptr, this function looks up the block caches for the +// data block referenced by handle, and read the block from disk if necessary. +// If contents is non-null, it skips the cache lookup and disk read, since +// the caller has already read it. In both cases, if ro.fill_cache is true, +// it inserts the block into the block cache. +template +Status BlockBasedTable::MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents) const { + assert(block_entry != nullptr); + const bool no_io = (ro.read_tier == kBlockCacheTier); + Cache* block_cache = rep_->table_options.block_cache.get(); + // No point to cache compressed blocks if it never goes away + Cache* block_cache_compressed = + rep_->immortal_table ? nullptr + : rep_->table_options.block_cache_compressed.get(); + + // First, try to get the block from the cache + // + // If either block cache is enabled, we'll try to read from it. 
+ Status s; + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice key /* key to the block cache */; + Slice ckey /* key to the compressed block cache */; + bool is_cache_hit = false; + if (block_cache != nullptr || block_cache_compressed != nullptr) { + // create key for block cache + if (block_cache != nullptr) { + key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + handle, cache_key); + } + + if (block_cache_compressed != nullptr) { + ckey = GetCacheKey(rep_->compressed_cache_key_prefix, + rep_->compressed_cache_key_prefix_size, handle, + compressed_cache_key); + } + + if (!contents) { + s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, + ro, block_entry, uncompression_dict, block_type, + get_context); + if (block_entry->GetValue()) { + // TODO(haoyu): Differentiate cache hit on uncompressed block cache and + // compressed block cache. + is_cache_hit = true; + } + } + + // Can't find the block from the cache. If I/O is allowed, read from the + // file. + if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { + Statistics* statistics = rep_->ioptions.statistics; + const bool maybe_compressed = + block_type != BlockType::kFilter && + block_type != BlockType::kCompressionDictionary && + rep_->blocks_maybe_compressed; + const bool do_uncompress = maybe_compressed && !block_cache_compressed; + CompressionType raw_block_comp_type; + BlockContents raw_block_contents; + if (!contents) { + StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS); + BlockFetcher block_fetcher( + rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, + &raw_block_contents, rep_->ioptions, do_uncompress, + maybe_compressed, block_type, uncompression_dict, + rep_->persistent_cache_options, + GetMemoryAllocator(rep_->table_options), + GetMemoryAllocatorForCompressedBlock(rep_->table_options)); + s = block_fetcher.ReadBlockContents(); + raw_block_comp_type = block_fetcher.get_compression_type(); + contents = &raw_block_contents; + } else { + raw_block_comp_type = contents->get_compression_type(); + } + + if (s.ok()) { + SequenceNumber seq_no = rep_->get_global_seqno(block_type); + // If filling cache is allowed and a cache is configured, try to put the + // block to the cache. + s = PutDataBlockToCache( + key, ckey, block_cache, block_cache_compressed, block_entry, + contents, raw_block_comp_type, uncompression_dict, seq_no, + GetMemoryAllocator(rep_->table_options), block_type, get_context); + } + } + } + + // Fill lookup_context. + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && + lookup_context) { + size_t usage = 0; + uint64_t nkeys = 0; + if (block_entry->GetValue()) { + // Approximate the number of keys in the block using restarts. 
+ nkeys = + rep_->table_options.block_restart_interval * + BlocklikeTraits::GetNumRestarts(*block_entry->GetValue()); + usage = block_entry->GetValue()->ApproximateMemoryUsage(); + } + TraceType trace_block_type = TraceType::kTraceMax; + switch (block_type) { + case BlockType::kData: + trace_block_type = TraceType::kBlockTraceDataBlock; + break; + case BlockType::kFilter: + trace_block_type = TraceType::kBlockTraceFilterBlock; + break; + case BlockType::kCompressionDictionary: + trace_block_type = TraceType::kBlockTraceUncompressionDictBlock; + break; + case BlockType::kRangeDeletion: + trace_block_type = TraceType::kBlockTraceRangeDeletionBlock; + break; + case BlockType::kIndex: + trace_block_type = TraceType::kBlockTraceIndexBlock; + break; + default: + // This cannot happen. + assert(false); + break; + } + bool no_insert = no_io || !ro.fill_cache; + if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock( + trace_block_type, lookup_context->caller)) { + // Defer logging the access to Get() and MultiGet() to trace additional + // information, e.g., referenced_key_exist_in_block. + + // Make a copy of the block key here since it will be logged later. + lookup_context->FillLookupContext( + is_cache_hit, no_insert, trace_block_type, + /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys); + } else { + // Avoid making copy of block_key and cf_name when constructing the access + // record. + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", trace_block_type, + /*block_size=*/usage, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, + no_insert, lookup_context->get_id, + lookup_context->get_from_user_specified_snapshot, + /*referenced_key=*/""); + block_cache_tracer_->WriteBlockAccess(access_record, key, + rep_->cf_name_for_tracing(), + lookup_context->referenced_key); + } + } + + assert(s.ok() || block_entry->GetValue() == nullptr); + return s; +} + +// This function reads multiple data blocks from disk using Env::MultiRead() +// and optionally inserts them into the block cache. It uses the scratch +// buffer provided by the caller, which is contiguous. If scratch is a nullptr +// it allocates a separate buffer for each block. Typically, if the blocks +// need to be uncompressed and there is no compressed block cache, callers +// can allocate a temporary scratch buffer in order to minimize memory +// allocations. +// If options.fill_cache is true, it inserts the blocks into cache. If its +// false and scratch is non-null and the blocks are uncompressed, it copies +// the buffers to heap. In any case, the CachableEntry returned will +// own the data bytes. +// batch - A MultiGetRange with only those keys with unique data blocks not +// found in cache +// handles - A vector of block handles. 
Some of them me be NULL handles +// scratch - An optional contiguous buffer to read compressed blocks into +void BlockBasedTable::RetrieveMultipleBlocks( + const ReadOptions& options, const MultiGetRange* batch, + const autovector* handles, + autovector* statuses, + autovector, MultiGetContext::MAX_BATCH_SIZE>* results, + char* scratch, const UncompressionDict& uncompression_dict) const { + RandomAccessFileReader* file = rep_->file.get(); + const Footer& footer = rep_->footer; + const ImmutableCFOptions& ioptions = rep_->ioptions; + SequenceNumber global_seqno = rep_->get_global_seqno(BlockType::kData); + size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit; + MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options); + + if (file->use_direct_io() || ioptions.allow_mmap_reads) { + size_t idx_in_batch = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + const BlockHandle& handle = (*handles)[idx_in_batch]; + if (handle.IsNull()) { + continue; + } + + (*statuses)[idx_in_batch] = + RetrieveBlock(nullptr, options, handle, uncompression_dict, + &(*results)[idx_in_batch], BlockType::kData, + mget_iter->get_context, &lookup_data_block_context, + /* for_compaction */ false, /* use_cache */ true); + } + return; + } + + autovector read_reqs; + size_t buf_offset = 0; + size_t idx_in_batch = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + const BlockHandle& handle = (*handles)[idx_in_batch]; + if (handle.IsNull()) { + continue; + } + + ReadRequest req; + req.len = block_size(handle); + if (scratch == nullptr) { + req.scratch = new char[req.len]; + } else { + req.scratch = scratch + buf_offset; + buf_offset += req.len; + } + req.offset = handle.offset(); + req.status = Status::OK(); + read_reqs.emplace_back(req); + } + + file->MultiRead(&read_reqs[0], read_reqs.size()); + + size_t read_req_idx = 0; + idx_in_batch = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + const BlockHandle& handle = (*handles)[idx_in_batch]; + + if (handle.IsNull()) { + continue; + } + + ReadRequest& req = read_reqs[read_req_idx++]; + Status s = req.status; + if (s.ok()) { + if (req.result.size() != req.len) { + s = Status::Corruption("truncated block read from " + + rep_->file->file_name() + " offset " + + ToString(handle.offset()) + ", expected " + + ToString(req.len) + + " bytes, got " + ToString(req.result.size())); + } + } + + BlockContents raw_block_contents; + if (s.ok()) { + if (scratch == nullptr) { + // We allocated a buffer for this block. 
Give ownership of it to + // BlockContents so it can free the memory + assert(req.result.data() == req.scratch); + std::unique_ptr raw_block(req.scratch); + raw_block_contents = BlockContents(std::move(raw_block), handle.size()); + } else { + // We used the scratch buffer, so no need to free anything + raw_block_contents = BlockContents(Slice(req.scratch, handle.size())); + } +#ifndef NDEBUG + raw_block_contents.is_raw_block = true; +#endif + if (options.verify_checksums) { + PERF_TIMER_GUARD(block_checksum_time); + const char* data = req.result.data(); + uint32_t expected = DecodeFixed32(data + handle.size() + 1); + s = rocksdb::VerifyChecksum(footer.checksum(), req.result.data(), + handle.size() + 1, expected); + } + } + if (s.ok()) { + if (options.fill_cache) { + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + CachableEntry* block_entry = &(*results)[idx_in_batch]; + // MaybeReadBlockAndLoadToCache will insert into the block caches if + // necessary. Since we're passing the raw block contents, it will + // avoid looking up the block cache + s = MaybeReadBlockAndLoadToCache( + nullptr, options, handle, uncompression_dict, block_entry, + BlockType::kData, mget_iter->get_context, + &lookup_data_block_context, &raw_block_contents); + + // block_entry value could be null if no block cache is present, i.e + // BlockBasedTableOptions::no_block_cache is true and no compressed + // block cache is configured. In that case, fall + // through and set up the block explicitly + if (block_entry->GetValue() != nullptr) { + continue; + } + } + + CompressionType compression_type = + raw_block_contents.get_compression_type(); + BlockContents contents; + if (compression_type != kNoCompression) { + UncompressionContext context(compression_type); + UncompressionInfo info(context, uncompression_dict, compression_type); + s = UncompressBlockContents(info, req.result.data(), handle.size(), + &contents, footer.version(), + rep_->ioptions, memory_allocator); + } else { + if (scratch != nullptr) { + // If we used the scratch buffer, then the contents need to be + // copied to heap + Slice raw = Slice(req.result.data(), handle.size()); + contents = BlockContents( + CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), raw), + handle.size()); + } else { + contents = std::move(raw_block_contents); + } + } + if (s.ok()) { + (*results)[idx_in_batch].SetOwnedValue( + new Block(std::move(contents), global_seqno, + read_amp_bytes_per_bit, ioptions.statistics)); + } + } + (*statuses)[idx_in_batch] = s; + } +} + +template +Status BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const { + assert(block_entry); + assert(block_entry->IsEmpty()); + + Status s; + if (use_cache) { + s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle, + uncompression_dict, block_entry, + block_type, get_context, lookup_context, + /*contents=*/nullptr); + + if (!s.ok()) { + return s; + } + + if (block_entry->GetValue() != nullptr) { + assert(s.ok()); + return s; + } + } + + assert(block_entry->IsEmpty()); + + const bool no_io = ro.read_tier == kBlockCacheTier; + if (no_io) { + return Status::Incomplete("no blocking io"); + } + + const bool maybe_compressed = + block_type != BlockType::kFilter && + block_type != 
BlockType::kCompressionDictionary && + rep_->blocks_maybe_compressed; + const bool do_uncompress = maybe_compressed; + std::unique_ptr block; + + { + StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics, + READ_BLOCK_GET_MICROS); + s = ReadBlockFromFile( + rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, + rep_->ioptions, do_uncompress, maybe_compressed, block_type, + uncompression_dict, rep_->persistent_cache_options, + rep_->get_global_seqno(block_type), + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0, + GetMemoryAllocator(rep_->table_options), for_compaction, + rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get()); + } + + if (!s.ok()) { + return s; + } + + block_entry->SetOwnedValue(block.release()); + + assert(s.ok()); + return s; +} + +// Explicitly instantiate templates for both "blocklike" types we use. +// This makes it possible to keep the template definitions in the .cc file. +template Status BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + +template Status BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + +template Status BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + +template Status BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + +BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( + const BlockBasedTable* table, + std::unordered_map>* block_map) + : table_(table), block_map_(block_map) {} + +InternalIteratorBase* +BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( + const BlockHandle& handle) { + // Return a block iterator on the index partition + auto block = block_map_->find(handle.offset()); + // This is a possible scenario since block cache might not have had space + // for the partition + if (block != block_map_->end()) { + const Rep* rep = table_->get_rep(); + assert(rep); + + Statistics* kNullStats = nullptr; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. 
+ return block->second.GetValue()->NewIndexIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + nullptr, kNullStats, true, rep->index_has_first_key, + rep->index_key_includes_seq, rep->index_value_is_full); + } + // Create an empty iterator + return new IndexBlockIter(); +} + +// This will be broken if the user specifies an unusual implementation +// of Options.comparator, or if the user specifies an unusual +// definition of prefixes in BlockBasedTableOptions.filter_policy. +// In particular, we require the following three properties: +// +// 1) key.starts_with(prefix(key)) +// 2) Compare(prefix(key), key) <= 0. +// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0 +// +// Otherwise, this method guarantees no I/O will be incurred. +// +// REQUIRES: this method shouldn't be called while the DB lock is held. +bool BlockBasedTable::PrefixMayMatch( + const Slice& internal_key, const ReadOptions& read_options, + const SliceTransform* options_prefix_extractor, + const bool need_upper_bound_check, + BlockCacheLookupContext* lookup_context) const { + if (!rep_->filter_policy) { + return true; + } + + const SliceTransform* prefix_extractor; + + if (rep_->table_prefix_extractor == nullptr) { + if (need_upper_bound_check) { + return true; + } + prefix_extractor = options_prefix_extractor; + } else { + prefix_extractor = rep_->table_prefix_extractor.get(); + } + auto user_key = ExtractUserKey(internal_key); + if (!prefix_extractor->InDomain(user_key)) { + return true; + } + + bool may_match = true; + Status s; + + // First, try check with full filter + FilterBlockReader* const filter = rep_->filter.get(); + bool filter_checked = true; + if (filter != nullptr) { + if (!filter->IsBlockBased()) { + const Slice* const const_ikey_ptr = &internal_key; + may_match = filter->RangeMayExist( + read_options.iterate_upper_bound, user_key, prefix_extractor, + rep_->internal_comparator.user_comparator(), const_ikey_ptr, + &filter_checked, need_upper_bound_check, lookup_context); + } else { + // if prefix_extractor changed for block based filter, skip filter + if (need_upper_bound_check) { + return true; + } + auto prefix = prefix_extractor->Transform(user_key); + InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue); + auto internal_prefix = internal_key_prefix.Encode(); + + // To prevent any io operation in this method, we set `read_tier` to make + // sure we always read index or filter only when they have already been + // loaded to memory. + ReadOptions no_io_read_options; + no_io_read_options.read_tier = kBlockCacheTier; + + // Then, try find it within each block + // we already know prefix_extractor and prefix_extractor_name must match + // because `CheckPrefixMayMatch` first checks `check_filter_ == true` + std::unique_ptr> iiter(NewIndexIterator( + no_io_read_options, + /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, + /*get_context=*/nullptr, lookup_context)); + iiter->Seek(internal_prefix); + + if (!iiter->Valid()) { + // we're past end of file + // if it's incomplete, it means that we avoided I/O + // and we're not really sure that we're past the end + // of the file + may_match = iiter->status().IsIncomplete(); + } else if ((rep_->index_key_includes_seq ? 
ExtractUserKey(iiter->key())
+                                           : iiter->key())
+                   .starts_with(ExtractUserKey(internal_prefix))) {
+      // we need to check for this subtle case because our only
+      // guarantee is that "the key is a string >= last key in that data
+      // block" according to the doc/table_format.txt spec.
+      //
+      // Suppose iiter->key() starts with the desired prefix; it is not
+      // necessarily the case that the corresponding data block will
+      // contain the prefix, since iiter->key() need not be in the
+      // block. However, the next data block may contain the prefix, so
+      // we return true to play it safe.
+      may_match = true;
+    } else if (filter->IsBlockBased()) {
+      // iiter->key() does NOT start with the desired prefix. Because
+      // Seek() finds the first key that is >= the seek target, this
+      // means that iiter->key() > prefix. Thus, any data blocks coming
+      // after the data block corresponding to iiter->key() cannot
+      // possibly contain the key. Thus, the corresponding data block
+      // is the only one that could potentially contain the prefix.
+      BlockHandle handle = iiter->value().handle;
+      may_match = filter->PrefixMayMatch(
+          prefix, prefix_extractor, handle.offset(), /*no_io=*/false,
+          /*const_key_ptr=*/nullptr, /*get_context=*/nullptr, lookup_context);
+    }
+  }
+  }
+
+  if (filter_checked) {
+    Statistics* statistics = rep_->ioptions.statistics;
+    RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
+    if (!may_match) {
+      RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
+    }
+  }
+
+  return may_match;
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) {
+  SeekImpl(&target);
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() {
+  SeekImpl(nullptr);
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::SeekImpl(
+    const Slice* target) {
+  is_out_of_bound_ = false;
+  is_at_first_key_from_index_ = false;
+  if (target && !CheckPrefixMayMatch(*target)) {
+    ResetDataIter();
+    return;
+  }
+
+  bool need_seek_index = true;
+  if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
+    // Reseek.
+    prev_block_offset_ = index_iter_->value().handle.offset();
+
+    if (target) {
+      // We can avoid an index seek if:
+      // 1. The new seek key is larger than the current key
+      // 2. The new seek key is within the upper bound of the block
+      // Since we don't necessarily know the internal key for either
+      // the current key or the upper bound, we check user keys and
+      // exclude the equality case. Considering internal keys can
+      // improve for the boundary cases, but it would complicate the
+      // code.
+      if (user_comparator_.Compare(ExtractUserKey(*target),
+                                   block_iter_.user_key()) > 0 &&
+          user_comparator_.Compare(ExtractUserKey(*target),
+                                   index_iter_->user_key()) < 0) {
+        need_seek_index = false;
+      }
+    }
+  }
+
+  if (need_seek_index) {
+    if (target) {
+      index_iter_->Seek(*target);
+    } else {
+      index_iter_->SeekToFirst();
+    }
+
+    if (!index_iter_->Valid()) {
+      ResetDataIter();
+      return;
+    }
+  }
+
+  IndexValue v = index_iter_->value();
+  const bool same_block = block_iter_points_to_real_block_ &&
+                          v.handle.offset() == prev_block_offset_;
+
+  // TODO(kolmike): Remove the != kBlockCacheTier condition.
+  if (!v.first_internal_key.empty() && !same_block &&
+      (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) &&
+      read_options_.read_tier != kBlockCacheTier) {
+    // Index contains the first key of the block, and it's >= target.
+    // We can defer reading the block.
+    is_at_first_key_from_index_ = true;
+    // ResetDataIter() will invalidate block_iter_.
+    // Thus, there is no need to call CheckDataBlockWithinUpperBound() to
+    // check for iterate_upper_bound, as that will be done later when the
+    // data block is actually read.
+    ResetDataIter();
+  } else {
+    // Need to use the data block.
+    if (!same_block) {
+      InitDataBlock();
+    } else {
+      // When the user does a reseek, the iterate_upper_bound might have
+      // changed. CheckDataBlockWithinUpperBound() needs to be called
+      // explicitly if the reseek ends up in the same data block.
+      // If the reseek ends up in a different block, InitDataBlock() will do
+      // the iterator upper bound check.
+      CheckDataBlockWithinUpperBound();
+    }
+
+    if (target) {
+      block_iter_.Seek(*target);
+    } else {
+      block_iter_.SeekToFirst();
+    }
+    FindKeyForward();
+  }
+
+  CheckOutOfBound();
+
+  if (target) {
+    assert(!Valid() || ((block_type_ == BlockType::kIndex &&
+                         !table_->get_rep()->index_key_includes_seq)
+                            ? (user_comparator_.Compare(ExtractUserKey(*target),
+                                                        key()) <= 0)
+                            : (icomp_.Compare(*target, key()) <= 0)));
+  }
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
+    const Slice& target) {
+  is_out_of_bound_ = false;
+  is_at_first_key_from_index_ = false;
+  if (!CheckPrefixMayMatch(target)) {
+    ResetDataIter();
+    return;
+  }
+
+  SavePrevIndexValue();
+
+  // Call Seek() rather than SeekForPrev() in the index block, because the
+  // target data block will likely contain the position for `target`, the
+  // same as Seek(), rather than the position before it.
+  // For example, if we have three data blocks, each containing two keys:
+  //   [2, 4]  [6, 8]  [10, 12]
+  // (the keys in the index block would be [4, 8, 12])
+  // and the user calls SeekForPrev(7), we need to go to the second block,
+  // just like if they call Seek(7).
+  // The only case where the block is different is when they seek to a
+  // position on the boundary. For example, if they SeekForPrev(5), we should
+  // go to the first block, rather than the second. However, we don't have
+  // the information to distinguish the two cases unless we read the second
+  // block. In this case, we'll end up reading two blocks.
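+  // Concretely, with the blocks above: SeekForPrev(7) seeks the index to the
+  // entry for [6, 8], and block_iter_.SeekForPrev(7) lands on 6.
+  // SeekForPrev(5) also seeks the index to [6, 8]; the in-block
+  // SeekForPrev(5) finds nothing, so FindKeyBackward() below steps back to
+  // [2, 4] and lands on 4.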
+ index_iter_->Seek(target); + + if (!index_iter_->Valid()) { + if (!index_iter_->status().ok()) { + ResetDataIter(); + return; + } + + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + } + + InitDataBlock(); + + block_iter_.SeekForPrev(target); + + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); + assert(!block_iter_.Valid() || + icomp_.Compare(target, block_iter_.key()) >= 0); +} + +template +void BlockBasedTableIterator::SeekToLast() { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + SavePrevIndexValue(); + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + InitDataBlock(); + block_iter_.SeekToLast(); + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); +} + +template +void BlockBasedTableIterator::Next() { + if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) { + return; + } + assert(block_iter_points_to_real_block_); + block_iter_.Next(); + FindKeyForward(); + CheckOutOfBound(); +} + +template +bool BlockBasedTableIterator::NextAndGetResult( + IterateResult* result) { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + } + return is_valid; +} + +template +void BlockBasedTableIterator::Prev() { + if (is_at_first_key_from_index_) { + is_at_first_key_from_index_ = false; + + index_iter_->Prev(); + if (!index_iter_->Valid()) { + return; + } + + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + assert(block_iter_points_to_real_block_); + block_iter_.Prev(); + } + + FindKeyBackward(); +} + +template +void BlockBasedTableIterator::InitDataBlock() { + BlockHandle data_block_handle = index_iter_->value().handle; + if (!block_iter_points_to_real_block_ || + data_block_handle.offset() != prev_block_offset_ || + // if previous attempt of reading the block missed cache, try again + block_iter_.status().IsIncomplete()) { + if (block_iter_points_to_real_block_) { + ResetDataIter(); + } + auto* rep = table_->get_rep(); + + // Prefetch additional data for range scans (iterators). Enabled only for + // user reads. + // Implicit auto readahead: + // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. + // Explicit user requested readahead: + // Enabled from the very first IO when ReadOptions.readahead_size is set. + if (lookup_context_.caller != TableReaderCaller::kCompaction) { + if (read_options_.readahead_size == 0) { + // Implicit auto readahead + num_file_reads_++; + if (num_file_reads_ > + BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) { + if (!rep->file->use_direct_io() && + (data_block_handle.offset() + + static_cast(block_size(data_block_handle)) > + readahead_limit_)) { + // Buffered I/O + // Discarding the return status of Prefetch calls intentionally, as + // we can fallback to reading from disk if Prefetch fails. + rep->file->Prefetch(data_block_handle.offset(), readahead_size_); + readahead_limit_ = static_cast(data_block_handle.offset() + + readahead_size_); + // Keep exponentially increasing readahead size until + // kMaxAutoReadaheadSize. + readahead_size_ = std::min(BlockBasedTable::kMaxAutoReadaheadSize, + readahead_size_ * 2); + } else if (rep->file->use_direct_io() && !prefetch_buffer_) { + // Direct I/O + // Let FilePrefetchBuffer take care of the readahead. 
+ prefetch_buffer_.reset(new FilePrefetchBuffer( + rep->file.get(), BlockBasedTable::kInitAutoReadaheadSize, + BlockBasedTable::kMaxAutoReadaheadSize)); + } + } + } else if (!prefetch_buffer_) { + // Explicit user requested readahead + // The actual condition is: + // if (read_options_.readahead_size != 0 && !prefetch_buffer_) + prefetch_buffer_.reset(new FilePrefetchBuffer( + rep->file.get(), read_options_.readahead_size, + read_options_.readahead_size)); + } + } else if (!prefetch_buffer_) { + prefetch_buffer_.reset( + new FilePrefetchBuffer(rep->file.get(), compaction_readahead_size_, + compaction_readahead_size_)); + } + + Status s; + table_->NewDataBlockIterator( + read_options_, data_block_handle, &block_iter_, block_type_, + /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(), + /*for_compaction=*/lookup_context_.caller == + TableReaderCaller::kCompaction); + block_iter_points_to_real_block_ = true; + CheckDataBlockWithinUpperBound(); + } +} + +template +bool BlockBasedTableIterator::MaterializeCurrentBlock() { + assert(is_at_first_key_from_index_); + assert(!block_iter_points_to_real_block_); + assert(index_iter_->Valid()); + + is_at_first_key_from_index_ = false; + InitDataBlock(); + assert(block_iter_points_to_real_block_); + block_iter_.SeekToFirst(); + + if (!block_iter_.Valid() || + icomp_.Compare(block_iter_.key(), + index_iter_->value().first_internal_key) != 0) { + // Uh oh. + block_iter_.Invalidate(Status::Corruption( + "first key in index doesn't match first key in block")); + return false; + } + + return true; +} + +template +void BlockBasedTableIterator::FindKeyForward() { + // This method's code is kept short to make it likely to be inlined. + + assert(!is_out_of_bound_); + assert(block_iter_points_to_real_block_); + + if (!block_iter_.Valid()) { + // This is the only call site of FindBlockForward(), but it's extracted into + // a separate method to keep FindKeyForward() short and likely to be + // inlined. When transitioning to a different block, we call + // FindBlockForward(), which is much longer and is probably not inlined. + FindBlockForward(); + } else { + // This is the fast path that avoids a function call. + } +} + +template +void BlockBasedTableIterator::FindBlockForward() { + // TODO the while loop inherits from two-level-iterator. We don't know + // whether a block can be empty so it can be replaced by an "if". + do { + if (!block_iter_.status().ok()) { + return; + } + // Whether next data block is out of upper bound, if there is one. + const bool next_block_is_out_of_bound = + read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_ && !data_block_within_upper_bound_; + assert(!next_block_is_out_of_bound || + user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) <= 0); + ResetDataIter(); + index_iter_->Next(); + if (next_block_is_out_of_bound) { + // The next block is out of bound. No need to read it. + TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr); + // We need to make sure this is not the last data block before setting + // is_out_of_bound_, since the index key for the last data block can be + // larger than smallest key of the next file on the same level. + if (index_iter_->Valid()) { + is_out_of_bound_ = true; + } + return; + } + + if (!index_iter_->Valid()) { + return; + } + + IndexValue v = index_iter_->value(); + + // TODO(kolmike): Remove the != kBlockCacheTier condition. 
+ if (!v.first_internal_key.empty() && + read_options_.read_tier != kBlockCacheTier) { + // Index contains the first key of the block. Defer reading the block. + is_at_first_key_from_index_ = true; + return; + } + + InitDataBlock(); + block_iter_.SeekToFirst(); + } while (!block_iter_.Valid()); +} + +template +void BlockBasedTableIterator::FindKeyBackward() { + while (!block_iter_.Valid()) { + if (!block_iter_.status().ok()) { + return; + } + + ResetDataIter(); + index_iter_->Prev(); + + if (index_iter_->Valid()) { + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + return; + } + } + + // We could have check lower bound here too, but we opt not to do it for + // code simplicity. +} + +template +void BlockBasedTableIterator::CheckOutOfBound() { + if (read_options_.iterate_upper_bound != nullptr && Valid()) { + is_out_of_bound_ = user_comparator_.Compare( + *read_options_.iterate_upper_bound, user_key()) <= 0; + } +} + +template +void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() { + if (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_) { + data_block_within_upper_bound_ = + (user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) > 0); + } +} + +InternalIterator* BlockBasedTable::NewIterator( + const ReadOptions& read_options, const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, TableReaderCaller caller, + size_t compaction_readahead_size) { + BlockCacheLookupContext lookup_context{caller}; + bool need_upper_bound_check = + PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); + if (arena == nullptr) { + return new BlockBasedTableIterator( + this, read_options, rep_->internal_comparator, + NewIndexIterator( + read_options, + need_upper_bound_check && + rep_->index_type == BlockBasedTableOptions::kHashSearch, + /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, BlockType::kData, caller, + compaction_readahead_size); + } else { + auto* mem = + arena->AllocateAligned(sizeof(BlockBasedTableIterator)); + return new (mem) BlockBasedTableIterator( + this, read_options, rep_->internal_comparator, + NewIndexIterator( + read_options, + need_upper_bound_check && + rep_->index_type == BlockBasedTableOptions::kHashSearch, + /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, BlockType::kData, caller, + compaction_readahead_size); + } +} + +FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( + const ReadOptions& read_options) { + if (rep_->fragmented_range_dels == nullptr) { + return nullptr; + } + SequenceNumber snapshot = kMaxSequenceNumber; + if (read_options.snapshot != nullptr) { + snapshot = read_options.snapshot->GetSequenceNumber(); + } + return new FragmentedRangeTombstoneIterator( + rep_->fragmented_range_dels, rep_->internal_comparator, snapshot); +} + +bool BlockBasedTable::FullFilterKeyMayMatch( + const ReadOptions& read_options, FilterBlockReader* filter, + const Slice& internal_key, const bool no_io, + const SliceTransform* prefix_extractor, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { + if (filter == nullptr || filter->IsBlockBased()) { + return true; + } + Slice user_key = ExtractUserKey(internal_key); + const Slice* const 
const_ikey_ptr = &internal_key; + bool may_match = true; + if (rep_->whole_key_filtering) { + size_t ts_sz = + rep_->internal_comparator.user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); + may_match = + filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid, + no_io, const_ikey_ptr, get_context, lookup_context); + } else if (!read_options.total_order_seek && prefix_extractor && + rep_->table_properties->prefix_extractor_name.compare( + prefix_extractor->Name()) == 0 && + prefix_extractor->InDomain(user_key) && + !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), + prefix_extractor, kNotValid, no_io, + const_ikey_ptr, get_context, + lookup_context)) { + may_match = false; + } + if (may_match) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); + } + return may_match; +} + +void BlockBasedTable::FullFilterKeysMayMatch( + const ReadOptions& read_options, FilterBlockReader* filter, + MultiGetRange* range, const bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const { + if (filter == nullptr || filter->IsBlockBased()) { + return; + } + if (rep_->whole_key_filtering) { + filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io, + lookup_context); + } else if (!read_options.total_order_seek && prefix_extractor && + rep_->table_properties->prefix_extractor_name.compare( + prefix_extractor->Name()) == 0) { + filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false, + lookup_context); + } +} + +Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, + GetContext* get_context, + const SliceTransform* prefix_extractor, + bool skip_filters) { + assert(key.size() >= 8); // key must be internal key + assert(get_context != nullptr); + Status s; + const bool no_io = read_options.read_tier == kBlockCacheTier; + + FilterBlockReader* const filter = + !skip_filters ? rep_->filter.get() : nullptr; + + // First check the full filter + // If full filter not useful, Then go into each block + uint64_t tracing_get_id = get_context->get_tracing_get_id(); + BlockCacheLookupContext lookup_context{ + TableReaderCaller::kUserGet, tracing_get_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Trace the key since it contains both user key and sequence number. + lookup_context.referenced_key = key.ToString(); + lookup_context.get_from_user_specified_snapshot = + read_options.snapshot != nullptr; + } + const bool may_match = + FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor, + get_context, &lookup_context); + if (!may_match) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); + } else { + IndexBlockIter iiter_on_stack; + // if prefix_extractor found in block differs from options, disable + // BlockPrefixIndex. Only do this check when index_type is kHashSearch. 
+ bool need_upper_bound_check = false;
+ if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
+ need_upper_bound_check = PrefixExtractorChanged(
+ rep_->table_properties.get(), prefix_extractor);
+ }
+ auto iiter =
+ NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
+ get_context, &lookup_context);
+ std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+ if (iiter != &iiter_on_stack) {
+ iiter_unique_ptr.reset(iiter);
+ }
+
+ size_t ts_sz =
+ rep_->internal_comparator.user_comparator()->timestamp_size();
+ bool matched = false; // if such user key matched a key in SST
+ bool done = false;
+ for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
+ IndexValue v = iiter->value();
+
+ bool not_exist_in_filter =
+ filter != nullptr && filter->IsBlockBased() == true &&
+ !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz),
+ prefix_extractor, v.handle.offset(), no_io,
+ /*const_ikey_ptr=*/nullptr, get_context,
+ &lookup_context);
+
+ if (not_exist_in_filter) {
+ // Not found
+ // TODO: think about interaction with Merge. If a user key cannot
+ // cross one data block, we should be fine.
+ RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
+ break;
+ }
+
+ if (!v.first_internal_key.empty() && !skip_filters &&
+ UserComparatorWrapper(rep_->internal_comparator.user_comparator())
+ .Compare(ExtractUserKey(key),
+ ExtractUserKey(v.first_internal_key)) < 0) {
+ // The requested key falls between highest key in previous block and
+ // lowest key in current block.
+ break;
+ }
+
+ BlockCacheLookupContext lookup_data_block_context{
+ TableReaderCaller::kUserGet, tracing_get_id,
+ /*get_from_user_specified_snapshot=*/read_options.snapshot !=
+ nullptr};
+ bool does_referenced_key_exist = false;
+ DataBlockIter biter;
+ uint64_t referenced_data_size = 0;
+ NewDataBlockIterator<DataBlockIter>(
+ read_options, v.handle, &biter, BlockType::kData, get_context,
+ &lookup_data_block_context,
+ /*s=*/Status(), /*prefetch_buffer*/ nullptr);
+
+ if (no_io && biter.status().IsIncomplete()) {
+ // couldn't get block from block_cache
+ // Update Saver.state to Found because we are only looking for
+ // whether we can guarantee the key is not there when "no_io" is set
+ get_context->MarkKeyMayExist();
+ break;
+ }
+ if (!biter.status().ok()) {
+ s = biter.status();
+ break;
+ }
+
+ bool may_exist = biter.SeekForGet(key);
+ // If user-specified timestamp is supported, we cannot end the search
+ // just because hash index lookup indicates the key+ts does not exist.
+ if (!may_exist && ts_sz == 0) {
+ // HashSeek cannot find the key in this block and the iter is not at
+ // the end of the block, i.e. cannot be in the following blocks
+ // either. In this case, the seek_key cannot be found, so we break
+ // from the top level for-loop.
+ done = true;
+ } else {
+ // Call the *saver function on each entry/block until it returns false
+ for (; biter.Valid(); biter.Next()) {
+ ParsedInternalKey parsed_key;
+ if (!ParseInternalKey(biter.key(), &parsed_key)) {
+ s = Status::Corruption(Slice());
+ }
+
+ if (!get_context->SaveValue(
+ parsed_key, biter.value(), &matched,
+ biter.IsValuePinned() ? &biter : nullptr)) {
+ if (get_context->State() == GetContext::GetState::kFound) {
+ does_referenced_key_exist = true;
+ referenced_data_size = biter.key().size() + biter.value().size();
+ }
+ done = true;
+ break;
+ }
+ }
+ s = biter.status();
+ }
+ // Write the block cache access record.
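The access record below is only emitted when a BlockCacheTracer has been attached to the table reader. As a rough end-to-end sketch, tracing is typically switched on at the DB level; the StartBlockCacheTrace()/EndBlockCacheTrace() entry points and the NewFileTraceWriter() helper assumed here accompany this tracer but exact signatures may differ by release, and db stands for an already opened rocksdb::DB*.

// Hypothetical usage sketch, not part of this diff.
std::unique_ptr<rocksdb::TraceWriter> trace_writer;
rocksdb::Status st = rocksdb::NewFileTraceWriter(
    rocksdb::Env::Default(), rocksdb::EnvOptions(),
    "/tmp/block_cache_trace", &trace_writer);
if (st.ok()) {
  rocksdb::TraceOptions trace_opts;
  trace_opts.sampling_frequency = 1;  // record every block access
  st = db->StartBlockCacheTrace(trace_opts, std::move(trace_writer));
}
// ... run the workload, then stop tracing and analyze the trace file offline:
// db->EndBlockCacheTrace();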
+ if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. + Slice referenced_key; + if (does_referenced_key_exist) { + referenced_key = biter.key(); + } else { + referenced_key = key; + } + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + lookup_data_block_context.get_id, + lookup_data_block_context.get_from_user_specified_snapshot, + /*referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + block_cache_tracer_->WriteBlockAccess( + access_record, lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), referenced_key); + } + + if (done) { + // Avoid the extra Next which is expensive in two-level indexes + break; + } + } + if (matched && filter != nullptr && !filter->IsBlockBased()) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, + rep_->level); + } + if (s.ok()) { + s = iiter->status(); + } + } + + return s; +} + +using MultiGetRange = MultiGetContext::Range; +void BlockBasedTable::MultiGet(const ReadOptions& read_options, + const MultiGetRange* mget_range, + const SliceTransform* prefix_extractor, + bool skip_filters) { + FilterBlockReader* const filter = + !skip_filters ? rep_->filter.get() : nullptr; + MultiGetRange sst_file_range(*mget_range, mget_range->begin(), + mget_range->end()); + + // First check the full filter + // If full filter not useful, Then go into each block + const bool no_io = read_options.read_tier == kBlockCacheTier; + uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; + if (!sst_file_range.empty() && sst_file_range.begin()->get_context) { + tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id(); + } + BlockCacheLookupContext lookup_context{ + TableReaderCaller::kUserMultiGet, tracing_mget_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; + FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, + prefix_extractor, &lookup_context); + + if (skip_filters || !sst_file_range.empty()) { + IndexBlockIter iiter_on_stack; + // if prefix_extractor found in block differs from options, disable + // BlockPrefixIndex. Only do this check when index_type is kHashSearch. 
+ bool need_upper_bound_check = false; + if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { + need_upper_bound_check = PrefixExtractorChanged( + rep_->table_properties.get(), prefix_extractor); + } + auto iiter = + NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, + sst_file_range.begin()->get_context, &lookup_context); + std::unique_ptr> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr.reset(iiter); + } + + uint64_t offset = std::numeric_limits::max(); + autovector block_handles; + autovector, MultiGetContext::MAX_BATCH_SIZE> results; + autovector statuses; + char stack_buf[kMultiGetReadStackBufSize]; + std::unique_ptr block_buf; + { + MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(), + sst_file_range.end()); + + CachableEntry uncompression_dict; + Status uncompression_dict_status; + if (rep_->uncompression_dict_reader) { + uncompression_dict_status = + rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, no_io, + sst_file_range.begin()->get_context, &lookup_context, + &uncompression_dict); + } + + const UncompressionDict& dict = uncompression_dict.GetValue() + ? *uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); + + size_t total_len = 0; + ReadOptions ro = read_options; + ro.read_tier = kBlockCacheTier; + + for (auto miter = data_block_range.begin(); + miter != data_block_range.end(); ++miter) { + const Slice& key = miter->ikey; + iiter->Seek(miter->ikey); + + IndexValue v; + if (iiter->Valid()) { + v = iiter->value(); + } + if (!iiter->Valid() || + (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0)) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + *(miter->s) = iiter->status(); + data_block_range.SkipKey(miter); + sst_file_range.SkipKey(miter); + continue; + } + + if (!uncompression_dict_status.ok()) { + *(miter->s) = uncompression_dict_status; + data_block_range.SkipKey(miter); + sst_file_range.SkipKey(miter); + continue; + } + + statuses.emplace_back(); + results.emplace_back(); + if (v.handle.offset() == offset) { + // We're going to reuse the block for this key later on. No need to + // look it up now. Place a null handle + block_handles.emplace_back(BlockHandle::NullBlockHandle()); + continue; + } + // Lookup the cache for the given data block referenced by an index + // iterator value (i.e BlockHandle). If it exists in the cache, + // initialize block to the contents of the data block. + offset = v.handle.offset(); + BlockHandle handle = v.handle; + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + Status s = RetrieveBlock( + nullptr, ro, handle, dict, &(results.back()), BlockType::kData, + miter->get_context, &lookup_data_block_context, + /* for_compaction */ false, /* use_cache */ true); + if (s.IsIncomplete()) { + s = Status::OK(); + } + if (s.ok() && !results.back().IsEmpty()) { + // Found it in the cache. 
Add NULL handle to indicate there is + // nothing to read from disk + block_handles.emplace_back(BlockHandle::NullBlockHandle()); + } else { + block_handles.emplace_back(handle); + total_len += block_size(handle); + } + } + + if (total_len) { + char* scratch = nullptr; + // If the blocks need to be uncompressed and we don't need the + // compressed blocks, then we can use a contiguous block of + // memory to read in all the blocks as it will be temporary + // storage + // 1. If blocks are compressed and compressed block cache is there, + // alloc heap bufs + // 2. If blocks are uncompressed, alloc heap bufs + // 3. If blocks are compressed and no compressed block cache, use + // stack buf + if (rep_->table_options.block_cache_compressed == nullptr && + rep_->blocks_maybe_compressed) { + if (total_len <= kMultiGetReadStackBufSize) { + scratch = stack_buf; + } else { + scratch = new char[total_len]; + block_buf.reset(scratch); + } + } + RetrieveMultipleBlocks(read_options, &data_block_range, &block_handles, + &statuses, &results, scratch, dict); + } + } + + DataBlockIter first_biter; + DataBlockIter next_biter; + size_t idx_in_batch = 0; + for (auto miter = sst_file_range.begin(); miter != sst_file_range.end(); + ++miter) { + Status s; + GetContext* get_context = miter->get_context; + const Slice& key = miter->ikey; + bool matched = false; // if such user key matched a key in SST + bool done = false; + bool first_block = true; + do { + DataBlockIter* biter = nullptr; + bool reusing_block = true; + uint64_t referenced_data_size = 0; + bool does_referenced_key_exist = false; + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet, tracing_mget_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != + nullptr); + if (first_block) { + if (!block_handles[idx_in_batch].IsNull() || + !results[idx_in_batch].IsEmpty()) { + first_biter.Invalidate(Status::OK()); + NewDataBlockIterator( + read_options, results[idx_in_batch], &first_biter, + statuses[idx_in_batch]); + reusing_block = false; + } + biter = &first_biter; + idx_in_batch++; + } else { + IndexValue v = iiter->value(); + if (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + break; + } + + next_biter.Invalidate(Status::OK()); + NewDataBlockIterator( + read_options, iiter->value().handle, &next_biter, + BlockType::kData, get_context, &lookup_data_block_context, + Status(), nullptr); + biter = &next_biter; + reusing_block = false; + } + + if (read_options.read_tier == kBlockCacheTier && + biter->status().IsIncomplete()) { + // couldn't get block from block_cache + // Update Saver.state to Found because we are only looking for + // whether we can guarantee the key is not there when "no_io" is set + get_context->MarkKeyMayExist(); + break; + } + if (!biter->status().ok()) { + s = biter->status(); + break; + } + + bool may_exist = biter->SeekForGet(key); + if (!may_exist) { + // HashSeek cannot find the key this block and the the iter is not + // the end of the block, i.e. cannot be in the following blocks + // either. In this case, the seek_key cannot be found, so we break + // from the top level for-loop. 
+ break; + } + + // Call the *saver function on each entry/block until it returns false + for (; biter->Valid(); biter->Next()) { + ParsedInternalKey parsed_key; + Cleanable dummy; + Cleanable* value_pinner = nullptr; + if (!ParseInternalKey(biter->key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + if (biter->IsValuePinned()) { + if (reusing_block) { + Cache* block_cache = rep_->table_options.block_cache.get(); + assert(biter->cache_handle() != nullptr); + block_cache->Ref(biter->cache_handle()); + dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache, + biter->cache_handle()); + value_pinner = &dummy; + } else { + value_pinner = biter; + } + } + if (!get_context->SaveValue(parsed_key, biter->value(), &matched, + value_pinner)) { + if (get_context->State() == GetContext::GetState::kFound) { + does_referenced_key_exist = true; + referenced_data_size = + biter->key().size() + biter->value().size(); + } + done = true; + break; + } + s = biter->status(); + } + // Write the block cache access. + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. + Slice referenced_key; + if (does_referenced_key_exist) { + referenced_key = biter->key(); + } else { + referenced_key = key; + } + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + lookup_data_block_context.get_id, + lookup_data_block_context.get_from_user_specified_snapshot, + /*referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + block_cache_tracer_->WriteBlockAccess( + access_record, lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), referenced_key); + } + s = biter->status(); + if (done) { + // Avoid the extra Next which is expensive in two-level indexes + break; + } + if (first_block) { + iiter->Seek(key); + } + first_block = false; + iiter->Next(); + } while (iiter->Valid()); + + if (matched && filter != nullptr && !filter->IsBlockBased()) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, + rep_->level); + } + if (s.ok()) { + s = iiter->status(); + } + *(miter->s) = s; + } + } +} + +Status BlockBasedTable::Prefetch(const Slice* const begin, + const Slice* const end) { + auto& comparator = rep_->internal_comparator; + UserComparatorWrapper user_comparator(comparator.user_comparator()); + // pre-condition + if (begin && end && comparator.Compare(*begin, *end) > 0) { + return Status::InvalidArgument(*begin, *end); + } + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + IndexBlockIter iiter_on_stack; + auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + &iiter_on_stack, /*get_context=*/nullptr, + &lookup_context); + std::unique_ptr> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr>(iiter); + } + + if (!iiter->status().ok()) { + // error opening index iterator + return iiter->status(); + } + + // indicates if we are on the last page that need to be pre-fetched + bool prefetching_boundary_page = false; + + for 
(begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); + iiter->Next()) { + BlockHandle block_handle = iiter->value().handle; + const bool is_user_key = !rep_->index_key_includes_seq; + if (end && + ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) || + (is_user_key && + user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) { + if (prefetching_boundary_page) { + break; + } + + // The index entry represents the last key in the data block. + // We should load this page into memory as well, but no more + prefetching_boundary_page = true; + } + + // Load the block specified by the block_handle into the block cache + DataBlockIter biter; + + NewDataBlockIterator( + ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData, + /*get_context=*/nullptr, &lookup_context, Status(), + /*prefetch_buffer=*/nullptr); + + if (!biter.status().ok()) { + // there was an unexpected error while pre-fetching + return biter.status(); + } + } + + return Status::OK(); +} + +Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options, + TableReaderCaller caller) { + Status s; + // Check Meta blocks + std::unique_ptr metaindex; + std::unique_ptr metaindex_iter; + s = ReadMetaIndexBlock(nullptr /* prefetch buffer */, &metaindex, + &metaindex_iter); + if (s.ok()) { + s = VerifyChecksumInMetaBlocks(metaindex_iter.get()); + if (!s.ok()) { + return s; + } + } else { + return s; + } + // Check Data blocks + IndexBlockIter iiter_on_stack; + BlockCacheLookupContext context{caller}; + InternalIteratorBase* iiter = NewIndexIterator( + read_options, /*disable_prefix_seek=*/false, &iiter_on_stack, + /*get_context=*/nullptr, &context); + std::unique_ptr> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr>(iiter); + } + if (!iiter->status().ok()) { + // error opening index iterator + return iiter->status(); + } + s = VerifyChecksumInBlocks(read_options, iiter); + return s; +} + +Status BlockBasedTable::VerifyChecksumInBlocks( + const ReadOptions& read_options, + InternalIteratorBase* index_iter) { + Status s; + // We are scanning the whole file, so no need to do exponential + // increasing of the buffer size. + size_t readahead_size = (read_options.readahead_size != 0) + ? read_options.readahead_size + : kMaxAutoReadaheadSize; + // FilePrefetchBuffer doesn't work in mmap mode and readahead is not + // needed there. 
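Since verification scans the whole file sequentially, the readahead chosen above comes either from ReadOptions::readahead_size or, when that field is left at zero, from the fixed kMaxAutoReadaheadSize cap. A caller-side sketch follows; it assumes a DB-level VerifyChecksum() overload that forwards these read options, which may not exist in every release.

// Hypothetical sketch: bound the sequential I/O used while verifying checksums.
rocksdb::ReadOptions verify_options;
verify_options.readahead_size = 4 * 1024 * 1024;  // 4 MB readahead per read
rocksdb::Status verify_status = db->VerifyChecksum(verify_options);  // assumed overload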
+ FilePrefetchBuffer prefetch_buffer( + rep_->file.get(), readahead_size /* readadhead_size */, + readahead_size /* max_readahead_size */, + !rep_->ioptions.allow_mmap_reads /* enable */); + + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); + if (!s.ok()) { + break; + } + BlockHandle handle = index_iter->value().handle; + BlockContents contents; + BlockFetcher block_fetcher( + rep_->file.get(), &prefetch_buffer, rep_->footer, ReadOptions(), handle, + &contents, rep_->ioptions, false /* decompress */, + false /*maybe_compressed*/, BlockType::kData, + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + break; + } + } + return s; +} + +BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName( + const Slice& meta_block_name) { + if (meta_block_name.starts_with(kFilterBlockPrefix) || + meta_block_name.starts_with(kFullFilterBlockPrefix) || + meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) { + return BlockType::kFilter; + } + + if (meta_block_name == kPropertiesBlock) { + return BlockType::kProperties; + } + + if (meta_block_name == kCompressionDictBlock) { + return BlockType::kCompressionDictionary; + } + + if (meta_block_name == kRangeDelBlock) { + return BlockType::kRangeDeletion; + } + + if (meta_block_name == kHashIndexPrefixesBlock) { + return BlockType::kHashIndexPrefixes; + } + + if (meta_block_name == kHashIndexPrefixesMetadataBlock) { + return BlockType::kHashIndexMetadata; + } + + assert(false); + return BlockType::kInvalid; +} + +Status BlockBasedTable::VerifyChecksumInMetaBlocks( + InternalIteratorBase* index_iter) { + Status s; + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); + if (!s.ok()) { + break; + } + BlockHandle handle; + Slice input = index_iter->value(); + s = handle.DecodeFrom(&input); + BlockContents contents; + const Slice meta_block_name = index_iter->key(); + BlockFetcher block_fetcher( + rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, + ReadOptions(), handle, &contents, rep_->ioptions, + false /* decompress */, false /*maybe_compressed*/, + GetBlockTypeForMetaBlockByName(meta_block_name), + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (s.IsCorruption() && meta_block_name == kPropertiesBlock) { + TableProperties* table_properties; + s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */, + index_iter->value(), + &table_properties); + delete table_properties; + } + if (!s.ok()) { + break; + } + } + return s; +} + +bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const { + assert(rep_ != nullptr); + + Cache* const cache = rep_->table_options.block_cache.get(); + if (cache == nullptr) { + return false; + } + + char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice cache_key = + GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle, + cache_key_storage); + + Cache::Handle* const cache_handle = cache->Lookup(cache_key); + if (cache_handle == nullptr) { + return false; + } + + cache->Release(cache_handle); + + return true; +} + +bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, + const Slice& key) { + std::unique_ptr> iiter(NewIndexIterator( + options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); + iiter->Seek(key); + assert(iiter->Valid()); 
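The handle produced by this index lookup is what TEST_BlockInCache() above turns into a block cache key before probing the cache. For illustration, that key is simply the per-table cache key prefix followed by the block's varint64-encoded file offset; the helper below is a hypothetical standalone restatement of that layout, not code from this change, and EncodeVarint64() is the usual util/coding.h routine.

// Sketch of the cache key layout used by the block cache probes here.
// Assumes <cstring>, "rocksdb/slice.h", "table/format.h" and "util/coding.h".
static rocksdb::Slice MakeBlockCacheKeySketch(const char* prefix,
                                              size_t prefix_size,
                                              const rocksdb::BlockHandle& handle,
                                              char* scratch) {
  memcpy(scratch, prefix, prefix_size);  // per-table prefix bytes
  // followed by the varint64-encoded offset of the block within the file
  char* end =
      rocksdb::EncodeVarint64(scratch + prefix_size, handle.offset());
  return rocksdb::Slice(scratch, static_cast<size_t>(end - scratch));
}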
+ + return TEST_BlockInCache(iiter->value().handle); +} + +// REQUIRES: The following fields of rep_ should have already been populated: +// 1. file +// 2. index_handle, +// 3. options +// 4. internal_comparator +// 5. index_type +Status BlockBasedTable::CreateIndexReader( + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + // kHashSearch requires non-empty prefix_extractor but bypass checking + // prefix_extractor here since we have no access to MutableCFOptions. + // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator. + // If prefix_extractor does not match prefix_extractor_name from table + // properties, turn off Hash Index by setting total_order_seek to true + + switch (rep_->index_type) { + case BlockBasedTableOptions::kTwoLevelIndexSearch: { + return PartitionIndexReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, lookup_context, + index_reader); + } + case BlockBasedTableOptions::kBinarySearch: + case BlockBasedTableOptions::kBinarySearchWithFirstKey: { + return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, lookup_context, + index_reader); + } + case BlockBasedTableOptions::kHashSearch: { + std::unique_ptr metaindex_guard; + std::unique_ptr metaindex_iter_guard; + auto meta_index_iter = preloaded_meta_index_iter; + if (meta_index_iter == nullptr) { + auto s = ReadMetaIndexBlock(prefetch_buffer, &metaindex_guard, + &metaindex_iter_guard); + if (!s.ok()) { + // we simply fall back to binary search in case there is any + // problem with prefix hash index loading. + ROCKS_LOG_WARN(rep_->ioptions.info_log, + "Unable to read the metaindex block." + " Fall back to binary search index."); + return BinarySearchIndexReader::Create(this, prefetch_buffer, + use_cache, prefetch, pin, + lookup_context, index_reader); + } + meta_index_iter = metaindex_iter_guard.get(); + } + + return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter, + use_cache, prefetch, pin, lookup_context, + index_reader); + } + default: { + std::string error_message = + "Unrecognized index type: " + ToString(rep_->index_type); + return Status::InvalidArgument(error_message.c_str()); + } + } +} + +uint64_t BlockBasedTable::ApproximateOffsetOf( + const InternalIteratorBase& index_iter) const { + uint64_t result = 0; + if (index_iter.Valid()) { + BlockHandle handle = index_iter.value().handle; + result = handle.offset(); + } else { + // The iterator is past the last key in the file. If table_properties is not + // available, approximate the offset by returning the offset of the + // metaindex block (which is right near the end of the file). + if (rep_->table_properties) { + result = rep_->table_properties->data_size; + } + // table_properties is not present in the table. 
+ if (result == 0) { + result = rep_->footer.metaindex_handle().offset(); + } + } + + return result; +} + +uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) { + BlockCacheLookupContext context(caller); + IndexBlockIter iiter_on_stack; + auto index_iter = + NewIndexIterator(ReadOptions(), /*disable_prefix_seek=*/false, + /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, + /*lookup_context=*/&context); + std::unique_ptr> iiter_unique_ptr; + if (index_iter != &iiter_on_stack) { + iiter_unique_ptr.reset(index_iter); + } + + index_iter->Seek(key); + return ApproximateOffsetOf(*index_iter); +} + +uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) { + assert(rep_->internal_comparator.Compare(start, end) <= 0); + + BlockCacheLookupContext context(caller); + IndexBlockIter iiter_on_stack; + auto index_iter = + NewIndexIterator(ReadOptions(), /*disable_prefix_seek=*/false, + /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, + /*lookup_context=*/&context); + std::unique_ptr> iiter_unique_ptr; + if (index_iter != &iiter_on_stack) { + iiter_unique_ptr.reset(index_iter); + } + + index_iter->Seek(start); + uint64_t start_offset = ApproximateOffsetOf(*index_iter); + index_iter->Seek(end); + uint64_t end_offset = ApproximateOffsetOf(*index_iter); + + assert(end_offset >= start_offset); + return end_offset - start_offset; +} + +bool BlockBasedTable::TEST_FilterBlockInCache() const { + assert(rep_ != nullptr); + return TEST_BlockInCache(rep_->filter_handle); +} + +bool BlockBasedTable::TEST_IndexBlockInCache() const { + assert(rep_ != nullptr); + + return TEST_BlockInCache(rep_->footer.index_handle()); +} + +Status BlockBasedTable::GetKVPairsFromDataBlocks( + std::vector* kv_pair_blocks) { + std::unique_ptr> blockhandles_iter( + NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + + Status s = blockhandles_iter->status(); + if (!s.ok()) { + // Cannot read Index Block + return s; + } + + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + + if (!s.ok()) { + break; + } + + std::unique_ptr datablock_iter; + datablock_iter.reset(NewDataBlockIterator( + ReadOptions(), blockhandles_iter->value().handle, + /*input_iter=*/nullptr, /*type=*/BlockType::kData, + /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(), + /*prefetch_buffer=*/nullptr)); + s = datablock_iter->status(); + + if (!s.ok()) { + // Error reading the block - Skipped + continue; + } + + KVPairBlock kv_pair_block; + for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); + datablock_iter->Next()) { + s = datablock_iter->status(); + if (!s.ok()) { + // Error reading the block - Skipped + break; + } + const Slice& key = datablock_iter->key(); + const Slice& value = datablock_iter->value(); + std::string key_copy = std::string(key.data(), key.size()); + std::string value_copy = std::string(value.data(), value.size()); + + kv_pair_block.push_back( + std::make_pair(std::move(key_copy), std::move(value_copy))); + } + kv_pair_blocks->push_back(std::move(kv_pair_block)); + } + return Status::OK(); +} + +Status BlockBasedTable::DumpTable(WritableFile* out_file) { + // Output Footer + out_file->Append( + "Footer Details:\n" + "--------------------------------------\n" + " "); + out_file->Append(rep_->footer.ToString().c_str()); + out_file->Append("\n"); + + // 
Output MetaIndex + out_file->Append( + "Metaindex Details:\n" + "--------------------------------------\n"); + std::unique_ptr metaindex; + std::unique_ptr metaindex_iter; + Status s = ReadMetaIndexBlock(nullptr /* prefetch_buffer */, &metaindex, + &metaindex_iter); + if (s.ok()) { + for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); + metaindex_iter->Next()) { + s = metaindex_iter->status(); + if (!s.ok()) { + return s; + } + if (metaindex_iter->key() == rocksdb::kPropertiesBlock) { + out_file->Append(" Properties block handle: "); + out_file->Append(metaindex_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } else if (metaindex_iter->key() == rocksdb::kCompressionDictBlock) { + out_file->Append(" Compression dictionary block handle: "); + out_file->Append(metaindex_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } else if (strstr(metaindex_iter->key().ToString().c_str(), + "filter.rocksdb.") != nullptr) { + out_file->Append(" Filter block handle: "); + out_file->Append(metaindex_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } else if (metaindex_iter->key() == rocksdb::kRangeDelBlock) { + out_file->Append(" Range deletion block handle: "); + out_file->Append(metaindex_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } + } + out_file->Append("\n"); + } else { + return s; + } + + // Output TableProperties + const rocksdb::TableProperties* table_properties; + table_properties = rep_->table_properties.get(); + + if (table_properties != nullptr) { + out_file->Append( + "Table Properties:\n" + "--------------------------------------\n" + " "); + out_file->Append(table_properties->ToString("\n ", ": ").c_str()); + out_file->Append("\n"); + } + + if (rep_->filter) { + out_file->Append( + "Filter Details:\n" + "--------------------------------------\n" + " "); + out_file->Append(rep_->filter->ToString().c_str()); + out_file->Append("\n"); + } + + // Output Index block + s = DumpIndexBlock(out_file); + if (!s.ok()) { + return s; + } + + // Output compression dictionary + if (rep_->uncompression_dict_reader) { + CachableEntry uncompression_dict; + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, false /* no_io */, + nullptr /* get_context */, nullptr /* lookup_context */, + &uncompression_dict); + if (!s.ok()) { + return s; + } + + assert(uncompression_dict.GetValue()); + + const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict(); + out_file->Append( + "Compression Dictionary:\n" + "--------------------------------------\n"); + out_file->Append(" size (bytes): "); + out_file->Append(rocksdb::ToString(raw_dict.size())); + out_file->Append("\n\n"); + out_file->Append(" HEX "); + out_file->Append(raw_dict.ToString(true).c_str()); + out_file->Append("\n\n"); + } + + // Output range deletions block + auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions()); + if (range_del_iter != nullptr) { + range_del_iter->SeekToFirst(); + if (range_del_iter->Valid()) { + out_file->Append( + "Range deletions:\n" + "--------------------------------------\n" + " "); + for (; range_del_iter->Valid(); range_del_iter->Next()) { + DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file); + } + out_file->Append("\n"); + } + delete range_del_iter; + } + // Output Data blocks + s = DumpDataBlocks(out_file); + + return s; +} + +Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { + out_file->Append( + "Index Details:\n" + 
"--------------------------------------\n"); + std::unique_ptr> blockhandles_iter( + NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_file->Append("Can not read Index Block \n\n"); + return s; + } + + out_file->Append(" Block key hex dump: Data block handle\n"); + out_file->Append(" Block key ascii\n\n"); + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + if (!s.ok()) { + break; + } + Slice key = blockhandles_iter->key(); + Slice user_key; + InternalKey ikey; + if (!rep_->index_key_includes_seq) { + user_key = key; + } else { + ikey.DecodeFrom(key); + user_key = ikey.user_key(); + } + + out_file->Append(" HEX "); + out_file->Append(user_key.ToString(true).c_str()); + out_file->Append(": "); + out_file->Append(blockhandles_iter->value() + .ToString(true, rep_->index_has_first_key) + .c_str()); + out_file->Append("\n"); + + std::string str_key = user_key.ToString(); + std::string res_key(""); + char cspace = ' '; + for (size_t i = 0; i < str_key.size(); i++) { + res_key.append(&str_key[i], 1); + res_key.append(1, cspace); + } + out_file->Append(" ASCII "); + out_file->Append(res_key.c_str()); + out_file->Append("\n ------\n"); + } + out_file->Append("\n"); + return Status::OK(); +} + +Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { + std::unique_ptr> blockhandles_iter( + NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_file->Append("Can not read Index Block \n\n"); + return s; + } + + uint64_t datablock_size_min = std::numeric_limits::max(); + uint64_t datablock_size_max = 0; + uint64_t datablock_size_sum = 0; + + size_t block_id = 1; + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + block_id++, blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + if (!s.ok()) { + break; + } + + BlockHandle bh = blockhandles_iter->value().handle; + uint64_t datablock_size = bh.size(); + datablock_size_min = std::min(datablock_size_min, datablock_size); + datablock_size_max = std::max(datablock_size_max, datablock_size); + datablock_size_sum += datablock_size; + + out_file->Append("Data Block # "); + out_file->Append(rocksdb::ToString(block_id)); + out_file->Append(" @ "); + out_file->Append(blockhandles_iter->value().handle.ToString(true).c_str()); + out_file->Append("\n"); + out_file->Append("--------------------------------------\n"); + + std::unique_ptr datablock_iter; + datablock_iter.reset(NewDataBlockIterator( + ReadOptions(), blockhandles_iter->value().handle, + /*input_iter=*/nullptr, /*type=*/BlockType::kData, + /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(), + /*prefetch_buffer=*/nullptr)); + s = datablock_iter->status(); + + if (!s.ok()) { + out_file->Append("Error reading the block - Skipped \n\n"); + continue; + } + + for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); + datablock_iter->Next()) { + s = datablock_iter->status(); + if (!s.ok()) { + out_file->Append("Error reading the block - Skipped \n"); + break; + } + DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file); + } + out_file->Append("\n"); + } + + uint64_t num_datablocks = block_id - 1; + if (num_datablocks) { + double datablock_size_avg 
= + static_cast(datablock_size_sum) / num_datablocks; + out_file->Append("Data Block Summary:\n"); + out_file->Append("--------------------------------------"); + out_file->Append("\n # data blocks: "); + out_file->Append(rocksdb::ToString(num_datablocks)); + out_file->Append("\n min data block size: "); + out_file->Append(rocksdb::ToString(datablock_size_min)); + out_file->Append("\n max data block size: "); + out_file->Append(rocksdb::ToString(datablock_size_max)); + out_file->Append("\n avg data block size: "); + out_file->Append(rocksdb::ToString(datablock_size_avg)); + out_file->Append("\n"); + } + + return Status::OK(); +} + +void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, + WritableFile* out_file) { + InternalKey ikey; + ikey.DecodeFrom(key); + + out_file->Append(" HEX "); + out_file->Append(ikey.user_key().ToString(true).c_str()); + out_file->Append(": "); + out_file->Append(value.ToString(true).c_str()); + out_file->Append("\n"); + + std::string str_key = ikey.user_key().ToString(); + std::string str_value = value.ToString(); + std::string res_key(""), res_value(""); + char cspace = ' '; + for (size_t i = 0; i < str_key.size(); i++) { + if (str_key[i] == '\0') { + res_key.append("\\0", 2); + } else { + res_key.append(&str_key[i], 1); + } + res_key.append(1, cspace); + } + for (size_t i = 0; i < str_value.size(); i++) { + if (str_value[i] == '\0') { + res_value.append("\\0", 2); + } else { + res_value.append(&str_value[i], 1); + } + res_value.append(1, cspace); + } + + out_file->Append(" ASCII "); + out_file->Append(res_key.c_str()); + out_file->Append(": "); + out_file->Append(res_value.c_str()); + out_file->Append("\n ------\n"); +} + +} // namespace rocksdb diff --git a/table/block_based_table_reader.h b/table/block_based/block_based_table_reader.h similarity index 55% rename from table/block_based_table_reader.h rename to table/block_based/block_based_table_reader.h index 1fcc8cbfa07..bcc7fd1a3da 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -17,15 +17,20 @@ #include #include "db/range_tombstone_fragmenter.h" +#include "file/filename.h" +#include "file/random_access_file_reader.h" #include "options/cf_options.h" #include "rocksdb/options.h" #include "rocksdb/persistent_cache.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_based_table_factory.h" -#include "table/filter_block.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_type.h" +#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/uncompression_dict_reader.h" #include "table/format.h" #include "table/get_context.h" #include "table/multiget_context.h" @@ -33,13 +38,12 @@ #include "table/table_properties_internal.h" #include "table/table_reader.h" #include "table/two_level_iterator.h" +#include "trace_replay/block_cache_tracer.h" #include "util/coding.h" -#include "util/file_reader_writer.h" #include "util/user_comparator_wrapper.h" namespace rocksdb { -class BlockHandle; class Cache; class FilterBlockReader; class BlockBasedFilterBlockReader; @@ -58,9 +62,17 @@ class GetContext; typedef std::vector> KVPairBlock; -// A Table is a sorted map from strings to strings. Tables are -// immutable and persistent. A Table may be safely accessed from -// multiple threads without external synchronization. 
+// Reader class for BlockBasedTable format.
+// For the format of BlockBasedTable refer to
+// https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
+// This is the default table type. Data is chunked into fixed-size blocks and
+// each block in turn stores entries. When storing data, we can compress and/or
+// encode data efficiently within a block, which often results in a much smaller
+// data size compared with the raw data size. To retrieve a record, we first
+// locate the block where the target record may reside, then read the block
+// into memory, and finally search for that record within the block. To avoid
+// frequent reads of the same block, the block cache keeps loaded blocks in
+// memory.
 class BlockBasedTable : public TableReader {
  public:
 static const std::string kFilterBlockPrefix;
@@ -70,6 +82,13 @@ class BlockBasedTable : public TableReader {
 // For Posix files the unique ID is three varints.
 static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1;
+ // All the below fields control iterator readahead
+ static const size_t kInitAutoReadaheadSize = 8 * 1024;
+ // Found that 256 KB readahead size provides the best performance, based on
+ // experiments, for auto readahead. Experiment data is in PR #3282.
+ static const size_t kMaxAutoReadaheadSize;
+ static const int kMinNumFileReadsToStartAutoReadahead = 2;
+
 // Attempt to open the table that is stored in bytes [0..file_size)
 // of "file", and read the metadata entries necessary to allow
 // retrieving data from the table.
@@ -98,22 +117,26 @@ class BlockBasedTable : public TableReader {
 bool skip_filters = false, int level = -1,
 const bool immortal_table = false,
 const SequenceNumber largest_seqno = 0,
- TailPrefetchStats* tail_prefetch_stats = nullptr);
+ TailPrefetchStats* tail_prefetch_stats = nullptr,
+ BlockCacheTracer* const block_cache_tracer = nullptr);
 bool PrefixMayMatch(const Slice& internal_key,
 const ReadOptions& read_options,
 const SliceTransform* options_prefix_extractor,
- const bool need_upper_bound_check);
+ const bool need_upper_bound_check,
+ BlockCacheLookupContext* lookup_context) const;
 // Returns a new iterator over the table contents.
 // The result of NewIterator() is initially invalid (caller must
 // call one of the Seek methods on the iterator before using it).
 // @param skip_filters Disables loading/accessing the filter block
+ // compaction_readahead_size: its value will only be used if caller =
+ // kCompaction.
 InternalIterator* NewIterator(const ReadOptions&,
 const SliceTransform* prefix_extractor,
- Arena* arena = nullptr,
- bool skip_filters = false,
- bool for_compaction = false) override;
+ Arena* arena, bool skip_filters,
+ TableReaderCaller caller,
+ size_t compaction_readahead_size = 0) override;
 FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
 const ReadOptions& read_options) override;
@@ -135,11 +158,21 @@ class BlockBasedTable : public TableReader {
 // Given a key, return an approximate byte offset in the file where
 // the data for that key begins (or would begin if the key were
- // present in the file).  The returned value is in terms of file
+ // present in the file). The returned value is in terms of file
 // bytes, and so includes effects like compression of the underlying data.
 // E.g., the approximate offset of the last key in the table will
+ // be close to the file length.
- uint64_t ApproximateOffsetOf(const Slice& key) override; + uint64_t ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) override; + + // Given start and end keys, return the approximate data size in the file + // between the keys. The returned value is in terms of file bytes, and so + // includes effects like compression of the underlying data. + // The start key must not be greater than the end key. + uint64_t ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) override; + + bool TEST_BlockInCache(const BlockHandle& handle) const; // Returns true if the block for the specified key is in cache. // REQUIRES: key is in this table && block cache enabled @@ -154,64 +187,44 @@ class BlockBasedTable : public TableReader { size_t ApproximateMemoryUsage() const override; // convert SST file to a human readable form - Status DumpTable(WritableFile* out_file, - const SliceTransform* prefix_extractor = nullptr) override; + Status DumpTable(WritableFile* out_file) override; - Status VerifyChecksum() override; - - void Close() override; + Status VerifyChecksum(const ReadOptions& readOptions, + TableReaderCaller caller) override; ~BlockBasedTable(); - bool TEST_filter_block_preloaded() const; - bool TEST_index_reader_preloaded() const; + bool TEST_FilterBlockInCache() const; + bool TEST_IndexBlockInCache() const; - // IndexReader is the interface that provide the functionality for index + // IndexReader is the interface that provides the functionality for index // access. class IndexReader { public: - explicit IndexReader(const InternalKeyComparator* icomparator, - Statistics* stats) - : icomparator_(icomparator), statistics_(stats) {} - - virtual ~IndexReader() {} - - // Create an iterator for index access. - // If iter is null then a new object is created on heap and the callee will - // have the ownership. If a non-null iter is passed in it will be used, and - // the returned value is either the same as iter or a new on-heap object - // that - // wrapps the passed iter. In the latter case the return value would point - // to - // a different object then iter and the callee has the ownership of the + virtual ~IndexReader() = default; + + // Create an iterator for index access. If iter is null, then a new object + // is created on the heap, and the callee will have the ownership. + // If a non-null iter is passed in, it will be used, and the returned value + // is either the same as iter or a new on-heap object that + // wraps the passed iter. In the latter case the return value points + // to a different object then iter, and the callee has the ownership of the // returned object. - virtual InternalIteratorBase* NewIterator( - IndexBlockIter* iter = nullptr, bool total_order_seek = true, - bool fill_cache = true) = 0; - - // The size of the index. - virtual size_t size() const = 0; - // Memory usage of the index block - virtual size_t usable_size() const = 0; - // return the statistics pointer - virtual Statistics* statistics() const { return statistics_; } + virtual InternalIteratorBase* NewIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; + // Report an approximation of how much memory has been used other than - // memory - // that was allocated in block cache. + // memory that was allocated in block cache. 
virtual size_t ApproximateMemoryUsage() const = 0; - - virtual void CacheDependencies(bool /* unused */) {} - - // Prefetch all the blocks referenced by this index to the buffer - void PrefetchBlocks(FilePrefetchBuffer* buf); - - protected: - const InternalKeyComparator* icomparator_; - - private: - Statistics* statistics_; + // Cache the dependencies of the index reader (e.g. the partitions + // of a partitioned index). + virtual void CacheDependencies(bool /* pin */) {} }; + class IndexReaderCommon; + static Slice GetCacheKey(const char* cache_key_prefix, size_t cache_key_prefix_size, const BlockHandle& handle, char* cache_key); @@ -220,39 +233,62 @@ class BlockBasedTable : public TableReader { // The key retrieved are internal keys. Status GetKVPairsFromDataBlocks(std::vector* kv_pair_blocks); - template - struct CachableEntry; struct Rep; Rep* get_rep() { return rep_; } + const Rep* get_rep() const { return rep_; } // input_iter: if it is not null, update this one and return it as Iterator template - static TBlockIter* NewDataBlockIterator( - Rep* rep, const ReadOptions& ro, const Slice& index_value, - TBlockIter* input_iter = nullptr, bool is_index = false, - bool key_includes_seq = true, bool index_key_is_full = true, - GetContext* get_context = nullptr, - FilePrefetchBuffer* prefetch_buffer = nullptr); + TBlockIter* NewDataBlockIterator( + const ReadOptions& ro, const BlockHandle& block_handle, + TBlockIter* input_iter, BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, Status s, + FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const; + + // input_iter: if it is not null, update this one and return it as Iterator template - static TBlockIter* NewDataBlockIterator( - Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde, - TBlockIter* input_iter = nullptr, bool is_index = false, - bool key_includes_seq = true, bool index_key_is_full = true, - GetContext* get_context = nullptr, Status s = Status(), - FilePrefetchBuffer* prefetch_buffer = nullptr); + TBlockIter* NewDataBlockIterator(const ReadOptions& ro, + CachableEntry& block, + TBlockIter* input_iter, Status s) const; class PartitionedIndexIteratorState; + template + friend class FilterBlockReaderCommon; + friend class PartitionIndexReader; + friend class UncompressionDictReader; + protected: Rep* rep_; - explicit BlockBasedTable(Rep* rep) : rep_(rep) {} + explicit BlockBasedTable(Rep* rep, BlockCacheTracer* const block_cache_tracer) + : rep_(rep), block_cache_tracer_(block_cache_tracer) {} + // No copying allowed + explicit BlockBasedTable(const TableReader&) = delete; + void operator=(const TableReader&) = delete; private: friend class MockedBlockBasedTable; static std::atomic next_cache_key_id_; + BlockCacheTracer* const block_cache_tracer_; + + void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context, + size_t usage) const; + void UpdateCacheMissMetrics(BlockType block_type, + GetContext* get_context) const; + void UpdateCacheInsertionMetrics(BlockType block_type, + GetContext* get_context, size_t usage) const; + Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, + BlockType block_type, + GetContext* get_context) const; + + // Either Block::NewDataIterator() or Block::NewIndexIterator(). 
+ template + static TBlockIter* InitBlockIterator(const Rep* rep, Block* block, + TBlockIter* input_iter, + bool block_contents_pinned); // If block cache enabled (compressed or uncompressed), looks for the block // identified by handle in (1) uncompressed cache, (2) compressed cache, and @@ -263,31 +299,40 @@ class BlockBasedTable : public TableReader { // @param block_entry value is set to the uncompressed block if found. If // in uncompressed block cache, also sets cache_handle to reference that // block. - static Status MaybeReadBlockAndLoadToCache( - FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro, + template + Status MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index = false, - GetContext* get_context = nullptr); - - // For the following two functions: - // if `no_io == true`, we will not try to read filter/index from sst file - // were they not present in cache yet. - CachableEntry GetFilter( - const SliceTransform* prefix_extractor = nullptr, - FilePrefetchBuffer* prefetch_buffer = nullptr, bool no_io = false, - GetContext* get_context = nullptr) const; - virtual CachableEntry GetFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, - const bool is_a_filter_partition, bool no_io, GetContext* get_context, - const SliceTransform* prefix_extractor = nullptr) const; - - static CachableEntry GetUncompressionDict( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, bool no_io, - GetContext* get_context); + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents) const; + + // Similar to the above, with one crucial difference: it will retrieve the + // block from the file even if there are no caches configured (assuming the + // read options allow I/O). + template + Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& ro, const BlockHandle& handle, + const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + + void RetrieveMultipleBlocks( + const ReadOptions& options, const MultiGetRange* batch, + const autovector* handles, + autovector* statuses, + autovector, MultiGetContext::MAX_BATCH_SIZE>* + results, + char* scratch, const UncompressionDict& uncompression_dict) const; // Get the iterator from the index reader. - // If input_iter is not set, return new Iterator - // If input_iter is set, update it and return it as Iterator + // + // If input_iter is not set, return a new Iterator. + // If input_iter is set, try to update it and return it as Iterator. + // However note that in some cases the returned iterator may be different + // from input_iter. In such case the returned iterator should be freed. // // Note: ErrorIterator with Status::Incomplete shall be returned if all the // following conditions are met: @@ -295,11 +340,10 @@ class BlockBasedTable : public TableReader { // 2. index is not present in block cache. // 3. 
We disallowed any io to be performed, that is, read_options == // kBlockCacheTier - InternalIteratorBase* NewIndexIterator( - const ReadOptions& read_options, bool need_upper_bound_check = false, - IndexBlockIter* input_iter = nullptr, - CachableEntry* index_entry = nullptr, - GetContext* get_context = nullptr); + InternalIteratorBase* NewIndexIterator( + const ReadOptions& read_options, bool need_upper_bound_check, + IndexBlockIter* input_iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; // Read block cache from block caches (if set): block_cache and // block_cache_compressed. @@ -307,14 +351,13 @@ class BlockBasedTable : public TableReader { // pointer to the block as well as its block handle. // @param uncompression_dict Data for presetting the compression library's // dictionary. - static Status GetDataBlockFromCache( + template + Status GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, Rep* rep, - const ReadOptions& read_options, - BlockBasedTable::CachableEntry* block, - const UncompressionDict& uncompression_dict, - size_t read_amp_bytes_per_bit, bool is_index = false, - GetContext* get_context = nullptr); + Cache* block_cache, Cache* block_cache_compressed, + const ReadOptions& read_options, CachableEntry* block, + const UncompressionDict& uncompression_dict, BlockType block_type, + GetContext* get_context) const; // Put a raw block (maybe compressed) to the corresponding block caches. // This method will perform decompression against raw_block if needed and then @@ -326,16 +369,15 @@ class BlockBasedTable : public TableReader { // PutDataBlockToCache(). After the call, the object will be invalid. // @param uncompression_dict Data for presetting the compression library's // dictionary. - static Status PutDataBlockToCache( + template + Status PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& read_options, const ImmutableCFOptions& ioptions, - CachableEntry* block, BlockContents* raw_block_contents, - CompressionType raw_block_comp_type, uint32_t format_version, + CachableEntry* cached_block, + BlockContents* raw_block_contents, CompressionType raw_block_comp_type, const UncompressionDict& uncompression_dict, SequenceNumber seq_no, - size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, - bool is_index = false, Cache::Priority pri = Cache::Priority::LOW, - GetContext* get_context = nullptr); + MemoryAllocator* memory_allocator, BlockType block_type, + GetContext* get_context) const; // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found // after a call to Seek(key), until handle_result returns false. @@ -343,69 +385,65 @@ class BlockBasedTable : public TableReader { friend class TableCache; friend class BlockBasedTableBuilder; - void ReadMeta(const Footer& footer); - - // Figure the index type, update it in rep_, and also return it. - BlockBasedTableOptions::IndexType UpdateIndexType(); - // Create a index reader based on the index type stored in the table. // Optionally, user can pass a preloaded meta_index_iter for the index that // need to access extra meta blocks for index construction. This parameter // helps avoid re-reading meta index block if caller already created one. 
- Status CreateIndexReader( - FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader, - InternalIterator* preloaded_meta_index_iter = nullptr, - const int level = -1); - - bool FullFilterKeyMayMatch( - const ReadOptions& read_options, FilterBlockReader* filter, - const Slice& user_key, const bool no_io, - const SliceTransform* prefix_extractor = nullptr) const; - - void FullFilterKeysMayMatch( - const ReadOptions& read_options, FilterBlockReader* filter, - MultiGetRange* range, const bool no_io, - const SliceTransform* prefix_extractor = nullptr) const; + Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer, + InternalIterator* preloaded_meta_index_iter, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader); + + bool FullFilterKeyMayMatch(const ReadOptions& read_options, + FilterBlockReader* filter, const Slice& user_key, + const bool no_io, + const SliceTransform* prefix_extractor, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; + + void FullFilterKeysMayMatch(const ReadOptions& read_options, + FilterBlockReader* filter, MultiGetRange* range, + const bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const; static Status PrefetchTail( RandomAccessFileReader* file, uint64_t file_size, TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, const bool preload_all, std::unique_ptr* prefetch_buffer); - static Status ReadMetaBlock(Rep* rep, FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* meta_block, - std::unique_ptr* iter); - static Status TryReadPropertiesWithGlobalSeqno( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, - TableProperties** table_properties); - static Status ReadPropertiesBlock(Rep* rep, - FilePrefetchBuffer* prefetch_buffer, - InternalIterator* meta_iter, - const SequenceNumber largest_seqno); - static Status ReadRangeDelBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, - InternalIterator* meta_iter, - const InternalKeyComparator& internal_comparator); - static Status ReadCompressionDictBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* compression_dict_block); - static Status PrefetchIndexAndFilterBlocks( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, - InternalIterator* meta_iter, BlockBasedTable* new_table, - const SliceTransform* prefix_extractor, bool prefetch_all, + Status ReadMetaIndexBlock(FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr* metaindex_block, + std::unique_ptr* iter); + Status TryReadPropertiesWithGlobalSeqno(FilePrefetchBuffer* prefetch_buffer, + const Slice& handle_value, + TableProperties** table_properties); + Status ReadPropertiesBlock(FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const SequenceNumber largest_seqno); + Status ReadRangeDelBlock(FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const InternalKeyComparator& internal_comparator, + BlockCacheLookupContext* lookup_context); + Status PrefetchIndexAndFilterBlocks( + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + BlockBasedTable* new_table, bool prefetch_all, const BlockBasedTableOptions& table_options, const int level, - const bool prefetch_index_and_filter_in_cache); + BlockCacheLookupContext* lookup_context); + + static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name); Status VerifyChecksumInMetaBlocks(InternalIteratorBase* index_iter); - Status 
VerifyChecksumInBlocks(InternalIteratorBase* index_iter); + Status VerifyChecksumInBlocks(const ReadOptions& read_options, + InternalIteratorBase* index_iter); // Create the filter from the filter block. - virtual FilterBlockReader* ReadFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle, - const bool is_a_filter_partition, - const SliceTransform* prefix_extractor = nullptr) const; + std::unique_ptr CreateFilterBlockReader( + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context); - static void SetupCacheKeyPrefix(Rep* rep, uint64_t file_size); + static void SetupCacheKeyPrefix(Rep* rep); // Generate a cache key prefix from the file static void GenerateCachePrefix(Cache* cc, RandomAccessFile* file, @@ -413,62 +451,43 @@ class BlockBasedTable : public TableReader { static void GenerateCachePrefix(Cache* cc, WritableFile* file, char* buffer, size_t* size); + // Given an iterator return its offset in file. + uint64_t ApproximateOffsetOf( + const InternalIteratorBase& index_iter) const; + // Helper functions for DumpTable() Status DumpIndexBlock(WritableFile* out_file); Status DumpDataBlocks(WritableFile* out_file); void DumpKeyValue(const Slice& key, const Slice& value, WritableFile* out_file); - // No copying allowed - explicit BlockBasedTable(const TableReader&) = delete; - void operator=(const TableReader&) = delete; + // A cumulative data block file read in MultiGet lower than this size will + // use a stack buffer + static constexpr size_t kMultiGetReadStackBufSize = 8192; friend class PartitionedFilterBlockReader; friend class PartitionedFilterBlockTest; + friend class DBBasicTest_MultiGetIOBufferOverrun_Test; }; -// Maitaning state of a two-level iteration on a partitioned index structure +// Maitaning state of a two-level iteration on a partitioned index structure. class BlockBasedTable::PartitionedIndexIteratorState : public TwoLevelIteratorState { public: PartitionedIndexIteratorState( - BlockBasedTable* table, - std::unordered_map>* block_map, - const bool index_key_includes_seq, const bool index_key_is_full); - InternalIteratorBase* NewSecondaryIterator( + const BlockBasedTable* table, + std::unordered_map>* block_map); + InternalIteratorBase* NewSecondaryIterator( const BlockHandle& index_value) override; private: // Don't own table_ - BlockBasedTable* table_; + const BlockBasedTable* table_; std::unordered_map>* block_map_; - bool index_key_includes_seq_; - bool index_key_is_full_; -}; - -// CachableEntry represents the entries that *may* be fetched from block cache. -// field `value` is the item we want to get. -// field `cache_handle` is the cache handle to the block cache. If the value -// was not read from cache, `cache_handle` will be nullptr. -template -struct BlockBasedTable::CachableEntry { - CachableEntry(TValue* _value, Cache::Handle* _cache_handle) - : value(_value), cache_handle(_cache_handle) {} - CachableEntry() : CachableEntry(nullptr, nullptr) {} - void Release(Cache* cache, bool force_erase = false) { - if (cache_handle) { - cache->Release(cache_handle, force_erase); - value = nullptr; - cache_handle = nullptr; - } - } - bool IsSet() const { return cache_handle != nullptr; } - - TValue* value = nullptr; - // if the entry is from the cache, cache_handle will be populated. - Cache::Handle* cache_handle = nullptr; }; +// Stores all the properties associated with a BlockBasedTable. +// These are immutable. 
struct BlockBasedTable::Rep { Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options, const BlockBasedTableOptions& _table_opt, @@ -501,21 +520,14 @@ struct BlockBasedTable::Rep { size_t persistent_cache_key_prefix_size = 0; char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize]; size_t compressed_cache_key_prefix_size = 0; - uint64_t dummy_index_reader_offset = - 0; // ID that is unique for the block cache. PersistentCacheOptions persistent_cache_options; // Footer contains the fixed table information Footer footer; - // `index_reader`, `filter`, and `uncompression_dict` will be populated (i.e., - // non-nullptr) and used only when options.block_cache is nullptr or when - // `cache_index_and_filter_blocks == false`. Otherwise, we will get the index, - // filter, and compression dictionary blocks via the block cache. In that case - // `dummy_index_reader_offset`, `filter_handle`, and `compression_dict_handle` - // are used to lookup these meta-blocks in block cache. + std::unique_ptr index_reader; std::unique_ptr filter; - std::unique_ptr uncompression_dict; + std::unique_ptr uncompression_dict_reader; enum class FilterType { kNoFilter, @@ -539,14 +551,6 @@ struct BlockBasedTable::Rep { std::unique_ptr internal_prefix_transform; std::shared_ptr table_prefix_extractor; - // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache is - // true or in all levels when pin_top_level_index_and_filter is set in - // combination with partitioned index/filters: then we do use the LRU cache, - // but we always keep the filter & index block's handle checked out here (=we - // don't call Release()), plus the parsed out objects the LRU cache will never - // push flush them out, hence they're pinned - CachableEntry filter_entry; - CachableEntry index_entry; std::shared_ptr fragmented_range_dels; // If global_seqno is used, all Keys in this file will have the same @@ -570,26 +574,52 @@ struct BlockBasedTable::Rep { // still work, just not as quickly. bool blocks_definitely_zstd_compressed = false; - bool closed = false; + // These describe how index is encoded. + bool index_has_first_key = false; + bool index_key_includes_seq = true; + bool index_value_is_full = true; + const bool immortal_table; - SequenceNumber get_global_seqno(bool is_index) const { - return is_index ? kDisableGlobalSequenceNumber : global_seqno; + SequenceNumber get_global_seqno(BlockType block_type) const { + return (block_type == BlockType::kFilter || + block_type == BlockType::kCompressionDictionary) + ? kDisableGlobalSequenceNumber + : global_seqno; + } + + uint64_t cf_id_for_tracing() const { + return table_properties ? table_properties->column_family_id + : rocksdb::TablePropertiesCollectorFactory:: + Context::kUnknownColumnFamily; + } + + Slice cf_name_for_tracing() const { + return table_properties ? table_properties->column_family_name + : BlockCacheTraceHelper::kUnknownColumnFamilyName; + } + + uint32_t level_for_tracing() const { return level >= 0 ? level : UINT32_MAX; } + + uint64_t sst_number_for_tracing() const { + return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX; } }; +// Iterates over the contents of BlockBasedTable. 
template class BlockBasedTableIterator : public InternalIteratorBase { + // compaction_readahead_size: its value will only be used if for_compaction = + // true public: - BlockBasedTableIterator(BlockBasedTable* table, + BlockBasedTableIterator(const BlockBasedTable* table, const ReadOptions& read_options, const InternalKeyComparator& icomp, - InternalIteratorBase* index_iter, + InternalIteratorBase* index_iter, bool check_filter, bool need_upper_bound_check, - const SliceTransform* prefix_extractor, bool is_index, - bool key_includes_seq = true, - bool index_key_is_full = true, - bool for_compaction = false) + const SliceTransform* prefix_extractor, + BlockType block_type, TableReaderCaller caller, + size_t compaction_readahead_size = 0) : table_(table), read_options_(read_options), icomp_(icomp), @@ -600,10 +630,9 @@ class BlockBasedTableIterator : public InternalIteratorBase { check_filter_(check_filter), need_upper_bound_check_(need_upper_bound_check), prefix_extractor_(prefix_extractor), - is_index_(is_index), - key_includes_seq_(key_includes_seq), - index_key_is_full_(index_key_is_full), - for_compaction_(for_compaction) {} + block_type_(block_type), + lookup_context_(caller), + compaction_readahead_size_(compaction_readahead_size) {} ~BlockBasedTableIterator() { delete index_iter_; } @@ -612,22 +641,41 @@ class BlockBasedTableIterator : public InternalIteratorBase { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(Slice* ret_key) override; + bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { - return !is_out_of_bound_ && block_iter_points_to_real_block_ && - block_iter_.Valid(); + return !is_out_of_bound_ && + (is_at_first_key_from_index_ || + (block_iter_points_to_real_block_ && block_iter_.Valid())); } Slice key() const override { assert(Valid()); - return block_iter_.key(); + if (is_at_first_key_from_index_) { + return index_iter_->value().first_internal_key; + } else { + return block_iter_.key(); + } } Slice user_key() const override { assert(Valid()); - return block_iter_.user_key(); + if (is_at_first_key_from_index_) { + return ExtractUserKey(index_iter_->value().first_internal_key); + } else { + return block_iter_.user_key(); + } } TValue value() const override { assert(Valid()); + + // Load current block if not loaded. + if (is_at_first_key_from_index_ && + !const_cast(this) + ->MaterializeCurrentBlock()) { + // Oops, index is not consistent with block contents, but we have + // no good way to report error at this point. Let's return empty value. + return TValue(); + } + return block_iter_.value(); } Status status() const override { @@ -643,14 +691,26 @@ class BlockBasedTableIterator : public InternalIteratorBase { // Whether iterator invalidated for being out of bound. bool IsOutOfBound() override { return is_out_of_bound_; } + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return !data_block_within_upper_bound_; + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; } bool IsKeyPinned() const override { + // Our key comes either from block_iter_'s current key + // or index_iter_'s current *value*. 
return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && - block_iter_points_to_real_block_ && block_iter_.IsKeyPinned(); + ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) || + (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned())); } bool IsValuePinned() const override { + // Load current block if not loaded. + if (is_at_first_key_from_index_) { + const_cast(this)->MaterializeCurrentBlock(); + } // BlockIter::IsValuePinned() is always true. No need to check return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && block_iter_points_to_real_block_; @@ -659,7 +719,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool CheckPrefixMayMatch(const Slice& ikey) { if (check_filter_ && !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_, - need_upper_bound_check_)) { + need_upper_bound_check_, &lookup_context_)) { // TODO remember the iterator is invalidated because of prefix // match. This can avoid the upper level file iterator to falsely // believe the position is the end of the SST file and move to @@ -684,49 +744,60 @@ class BlockBasedTableIterator : public InternalIteratorBase { if (block_iter_points_to_real_block_) { // Reseek. If they end up with the same data block, we shouldn't re-fetch // the same data block. - prev_index_value_ = index_iter_->value(); + prev_block_offset_ = index_iter_->value().handle.offset(); } } - void InitDataBlock(); - inline void FindKeyForward(); - void FindBlockForward(); - void FindKeyBackward(); - void CheckOutOfBound(); - private: - BlockBasedTable* table_; + const BlockBasedTable* table_; const ReadOptions read_options_; const InternalKeyComparator& icomp_; UserComparatorWrapper user_comparator_; - InternalIteratorBase* index_iter_; + InternalIteratorBase* index_iter_; PinnedIteratorsManager* pinned_iters_mgr_; TBlockIter block_iter_; + + // True if block_iter_ is initialized and points to the same block + // as index iterator. bool block_iter_points_to_real_block_; + // See InternalIteratorBase::IsOutOfBound(). bool is_out_of_bound_ = false; + // Whether current data block being fully within iterate upper bound. + bool data_block_within_upper_bound_ = false; + // True if we're standing at the first key of a block, and we haven't loaded + // that block yet. A call to value() will trigger loading the block. + bool is_at_first_key_from_index_ = false; bool check_filter_; // TODO(Zhongyi): pick a better name bool need_upper_bound_check_; const SliceTransform* prefix_extractor_; - // If the blocks over which we iterate are index blocks - bool is_index_; - // If the keys in the blocks over which we iterate include 8 byte sequence - bool key_includes_seq_; - bool index_key_is_full_; - // If this iterator is created for compaction - bool for_compaction_; - BlockHandle prev_index_value_; - - // All the below fields control iterator readahead - static const size_t kInitAutoReadaheadSize = 8 * 1024; - // Found that 256 KB readahead size provides the best performance, based on - // experiments, for auto readahead. Experiment data is in PR #3282. - static const size_t kMaxAutoReadaheadSize; - static const int kMinNumFileReadsToStartAutoReadahead = 2; - size_t readahead_size_ = kInitAutoReadaheadSize; + BlockType block_type_; + uint64_t prev_block_offset_ = std::numeric_limits::max(); + BlockCacheLookupContext lookup_context_; + // Readahead size used in compaction, its value is used only if + // lookup_context_.caller = kCompaction. 
+ size_t compaction_readahead_size_; + + size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize; size_t readahead_limit_ = 0; int64_t num_file_reads_ = 0; std::unique_ptr prefetch_buffer_; + + // If `target` is null, seek to first. + void SeekImpl(const Slice* target); + + void InitDataBlock(); + bool MaterializeCurrentBlock(); + void FindKeyForward(); + void FindBlockForward(); + void FindKeyBackward(); + void CheckOutOfBound(); + + // Check if data block is fully within iterate_upper_bound. + // + // Note MyRocks may update iterate bounds between seek. To workaround it, + // we need to check and update data_block_within_upper_bound_ accordingly. + void CheckDataBlockWithinUpperBound(); }; } // namespace rocksdb diff --git a/table/block_builder.cc b/table/block_based/block_builder.cc similarity index 98% rename from table/block_builder.cc rename to table/block_based/block_builder.cc index c14b4f6d3ee..a6a240c8e0a 100644 --- a/table/block_builder.cc +++ b/table/block_based/block_builder.cc @@ -31,13 +31,13 @@ // num_restarts: uint32 // restarts[i] contains the offset within the block of the ith restart point. -#include "table/block_builder.h" +#include "table/block_based/block_builder.h" #include #include #include "db/dbformat.h" #include "rocksdb/comparator.h" -#include "table/data_block_footer.h" +#include "table/block_based/data_block_footer.h" #include "util/coding.h" namespace rocksdb { diff --git a/table/block_builder.h b/table/block_based/block_builder.h similarity index 98% rename from table/block_builder.h rename to table/block_based/block_builder.h index 0576279f501..153e57569a2 100644 --- a/table/block_builder.h +++ b/table/block_based/block_builder.h @@ -13,7 +13,7 @@ #include #include "rocksdb/slice.h" #include "rocksdb/table.h" -#include "table/data_block_hash_index.h" +#include "table/block_based/data_block_hash_index.h" namespace rocksdb { diff --git a/table/block_prefix_index.cc b/table/block_based/block_prefix_index.cc similarity index 99% rename from table/block_prefix_index.cc rename to table/block_based/block_prefix_index.cc index 67c749d4c3a..6e24f17cf68 100644 --- a/table/block_prefix_index.cc +++ b/table/block_based/block_prefix_index.cc @@ -3,14 +3,14 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "table/block_prefix_index.h" +#include "table/block_based/block_prefix_index.h" #include +#include "memory/arena.h" #include "rocksdb/comparator.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "util/arena.h" #include "util/coding.h" #include "util/hash.h" diff --git a/table/block_prefix_index.h b/table/block_based/block_prefix_index.h similarity index 100% rename from table/block_prefix_index.h rename to table/block_based/block_prefix_index.h diff --git a/table/block_test.cc b/table/block_based/block_test.cc similarity index 78% rename from table/block_test.cc rename to table/block_based/block_test.cc index 3e0ff3eab59..38fa55089f1 100644 --- a/table/block_test.cc +++ b/table/block_based/block_test.cc @@ -3,6 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
// + #include #include #include @@ -19,12 +20,12 @@ #include "rocksdb/iterator.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_builder.h" +#include "table/block_based/block.h" +#include "table/block_based/block_builder.h" #include "table/format.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -68,37 +69,12 @@ void GenerateRandomKVs(std::vector *keys, } } -// Same as GenerateRandomKVs but the values are BlockHandle -void GenerateRandomKBHs(std::vector *keys, - std::vector *values, const int from, - const int len, const int step = 1, - const int padding_size = 0, - const int keys_share_prefix = 1) { - Random rnd(302); - uint64_t offset = 0; - - // generate different prefix - for (int i = from; i < from + len; i += step) { - // generate keys that shares the prefix - for (int j = 0; j < keys_share_prefix; ++j) { - keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); - - uint64_t size = rnd.Uniform(1024 * 16); - BlockHandle handle(offset, size); - offset += size + kBlockTrailerSize; - values->emplace_back(handle); - } - } -} - class BlockTest : public testing::Test {}; // block test TEST_F(BlockTest, SimpleTest) { Random rnd(301); Options options = Options(); - std::unique_ptr ic; - ic.reset(new test::PlainInternalKeyComparator(options.comparator)); std::vector keys; std::vector values; @@ -122,7 +98,7 @@ TEST_F(BlockTest, SimpleTest) { // read contents of block sequentially int count = 0; InternalIterator *iter = - reader.NewIterator(options.comparator, options.comparator); + reader.NewDataIterator(options.comparator, options.comparator); for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { // read kv from block Slice k = iter->key(); @@ -135,8 +111,7 @@ TEST_F(BlockTest, SimpleTest) { delete iter; // read block contents randomly - iter = - reader.NewIterator(options.comparator, options.comparator); + iter = reader.NewDataIterator(options.comparator, options.comparator); for (int i = 0; i < num_records; i++) { // find a random key in the lookaside array int index = rnd.Uniform(num_records); @@ -151,83 +126,6 @@ TEST_F(BlockTest, SimpleTest) { delete iter; } -TEST_F(BlockTest, ValueDeltaEncodingTest) { - Random rnd(301); - Options options = Options(); - std::unique_ptr ic; - ic.reset(new test::PlainInternalKeyComparator(options.comparator)); - - std::vector keys; - std::vector values; - const bool kUseDeltaEncoding = true; - const bool kUseValueDeltaEncoding = true; - BlockBuilder builder(16, kUseDeltaEncoding, kUseValueDeltaEncoding); - int num_records = 100; - - GenerateRandomKBHs(&keys, &values, 0, num_records); - // add a bunch of records to a block - BlockHandle last_encoded_handle; - for (int i = 0; i < num_records; i++) { - auto block_handle = values[i]; - std::string handle_encoding; - block_handle.EncodeTo(&handle_encoding); - std::string handle_delta_encoding; - PutVarsignedint64(&handle_delta_encoding, - block_handle.size() - last_encoded_handle.size()); - last_encoded_handle = block_handle; - const Slice handle_delta_encoding_slice(handle_delta_encoding); - builder.Add(keys[i], handle_encoding, &handle_delta_encoding_slice); - } - - // read serialized contents of the block - Slice rawblock = builder.Finish(); - - // create block reader - BlockContents contents; - contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); - - const bool 
kTotalOrderSeek = true; - const bool kIncludesSeq = true; - const bool kValueIsFull = !kUseValueDeltaEncoding; - IndexBlockIter *kNullIter = nullptr; - Statistics *kNullStats = nullptr; - // read contents of block sequentially - int count = 0; - InternalIteratorBase *iter = reader.NewIterator( - options.comparator, options.comparator, kNullIter, kNullStats, - kTotalOrderSeek, kIncludesSeq, kValueIsFull); - for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { - // read kv from block - Slice k = iter->key(); - BlockHandle handle = iter->value(); - - // compare with lookaside array - ASSERT_EQ(k.ToString().compare(keys[count]), 0); - - ASSERT_EQ(values[count].offset(), handle.offset()); - ASSERT_EQ(values[count].size(), handle.size()); - } - delete iter; - - // read block contents randomly - iter = reader.NewIterator( - options.comparator, options.comparator, kNullIter, kNullStats, - kTotalOrderSeek, kIncludesSeq, kValueIsFull); - for (int i = 0; i < num_records; i++) { - // find a random key in the lookaside array - int index = rnd.Uniform(num_records); - Slice k(keys[index]); - - // search in block for this key - iter->Seek(k); - ASSERT_TRUE(iter->Valid()); - BlockHandle handle = iter->value(); - ASSERT_EQ(values[index].offset(), handle.offset()); - ASSERT_EQ(values[index].size(), handle.size()); - } - delete iter; -} // return the block contents BlockContents GetBlockContents(std::unique_ptr *builder, const std::vector &keys, @@ -260,8 +158,7 @@ void CheckBlockContents(BlockContents contents, const int max_key, NewFixedPrefixTransform(prefix_size)); std::unique_ptr regular_iter( - reader2.NewIterator(BytewiseComparator(), - BytewiseComparator())); + reader2.NewDataIterator(BytewiseComparator(), BytewiseComparator())); // Seek existent keys for (size_t i = 0; i < keys.size(); i++) { @@ -456,8 +353,6 @@ TEST_F(BlockTest, BlockReadAmpBitmap) { TEST_F(BlockTest, BlockWithReadAmpBitmap) { Random rnd(301); Options options = Options(); - std::unique_ptr ic; - ic.reset(new test::PlainInternalKeyComparator(options.comparator)); std::vector keys; std::vector values; @@ -485,9 +380,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { // read contents of block sequentially size_t read_bytes = 0; - DataBlockIter *iter = - static_cast(reader.NewIterator( - options.comparator, options.comparator, nullptr, stats.get())); + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { iter->value(); read_bytes += iter->TEST_CurrentEntrySize(); @@ -518,9 +412,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { kBytesPerBit, stats.get()); size_t read_bytes = 0; - DataBlockIter *iter = - static_cast(reader.NewIterator( - options.comparator, options.comparator, nullptr, stats.get())); + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); for (int i = 0; i < num_records; i++) { Slice k(keys[i]); @@ -554,9 +447,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { kBytesPerBit, stats.get()); size_t read_bytes = 0; - DataBlockIter *iter = - static_cast(reader.NewIterator( - options.comparator, options.comparator, nullptr, stats.get())); + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); std::unordered_set read_keys; for (int i = 0; i < num_records; i++) { int index = rnd.Uniform(num_records); @@ -586,21 +478,147 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { TEST_F(BlockTest, 
ReadAmpBitmapPow2) { std::shared_ptr stats = rocksdb::CreateDBStatistics(); - ASSERT_EQ(BlockReadAmpBitmap(100, 1, stats.get()).GetBytesPerBit(), 1); - ASSERT_EQ(BlockReadAmpBitmap(100, 2, stats.get()).GetBytesPerBit(), 2); - ASSERT_EQ(BlockReadAmpBitmap(100, 4, stats.get()).GetBytesPerBit(), 4); - ASSERT_EQ(BlockReadAmpBitmap(100, 8, stats.get()).GetBytesPerBit(), 8); - ASSERT_EQ(BlockReadAmpBitmap(100, 16, stats.get()).GetBytesPerBit(), 16); - ASSERT_EQ(BlockReadAmpBitmap(100, 32, stats.get()).GetBytesPerBit(), 32); - - ASSERT_EQ(BlockReadAmpBitmap(100, 3, stats.get()).GetBytesPerBit(), 2); - ASSERT_EQ(BlockReadAmpBitmap(100, 7, stats.get()).GetBytesPerBit(), 4); - ASSERT_EQ(BlockReadAmpBitmap(100, 11, stats.get()).GetBytesPerBit(), 8); - ASSERT_EQ(BlockReadAmpBitmap(100, 17, stats.get()).GetBytesPerBit(), 16); - ASSERT_EQ(BlockReadAmpBitmap(100, 33, stats.get()).GetBytesPerBit(), 32); - ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32); + ASSERT_EQ(BlockReadAmpBitmap(100, 1, stats.get()).GetBytesPerBit(), 1u); + ASSERT_EQ(BlockReadAmpBitmap(100, 2, stats.get()).GetBytesPerBit(), 2u); + ASSERT_EQ(BlockReadAmpBitmap(100, 4, stats.get()).GetBytesPerBit(), 4u); + ASSERT_EQ(BlockReadAmpBitmap(100, 8, stats.get()).GetBytesPerBit(), 8u); + ASSERT_EQ(BlockReadAmpBitmap(100, 16, stats.get()).GetBytesPerBit(), 16u); + ASSERT_EQ(BlockReadAmpBitmap(100, 32, stats.get()).GetBytesPerBit(), 32u); + + ASSERT_EQ(BlockReadAmpBitmap(100, 3, stats.get()).GetBytesPerBit(), 2u); + ASSERT_EQ(BlockReadAmpBitmap(100, 7, stats.get()).GetBytesPerBit(), 4u); + ASSERT_EQ(BlockReadAmpBitmap(100, 11, stats.get()).GetBytesPerBit(), 8u); + ASSERT_EQ(BlockReadAmpBitmap(100, 17, stats.get()).GetBytesPerBit(), 16u); + ASSERT_EQ(BlockReadAmpBitmap(100, 33, stats.get()).GetBytesPerBit(), 32u); + ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32u); } +class IndexBlockTest + : public testing::Test, + public testing::WithParamInterface> { + public: + IndexBlockTest() = default; + + bool useValueDeltaEncoding() const { return std::get<0>(GetParam()); } + bool includeFirstKey() const { return std::get<1>(GetParam()); } +}; + +// Similar to GenerateRandomKVs but for index block contents. +void GenerateRandomIndexEntries(std::vector *separators, + std::vector *block_handles, + std::vector *first_keys, + const int len) { + Random rnd(42); + + // For each of `len` blocks, we need to generate a first and last key. + // Let's generate n*2 random keys, sort them, group into consecutive pairs. + std::set keys; + while ((int)keys.size() < len * 2) { + // Keys need to be at least 8 bytes long to look like internal keys. 
+ keys.insert(test::RandomKey(&rnd, 12)); + } + + uint64_t offset = 0; + for (auto it = keys.begin(); it != keys.end();) { + first_keys->emplace_back(*it++); + separators->emplace_back(*it++); + uint64_t size = rnd.Uniform(1024 * 16); + BlockHandle handle(offset, size); + offset += size + kBlockTrailerSize; + block_handles->emplace_back(handle); + } +} + +TEST_P(IndexBlockTest, IndexValueEncodingTest) { + Random rnd(301); + Options options = Options(); + + std::vector separators; + std::vector block_handles; + std::vector first_keys; + const bool kUseDeltaEncoding = true; + BlockBuilder builder(16, kUseDeltaEncoding, useValueDeltaEncoding()); + int num_records = 100; + + GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, + num_records); + BlockHandle last_encoded_handle; + for (int i = 0; i < num_records; i++) { + IndexValue entry(block_handles[i], first_keys[i]); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, includeFirstKey(), nullptr); + if (useValueDeltaEncoding() && i > 0) { + entry.EncodeTo(&delta_encoded_entry, includeFirstKey(), + &last_encoded_handle); + } + last_encoded_handle = entry.handle; + const Slice delta_encoded_entry_slice(delta_encoded_entry); + builder.Add(separators[i], encoded_entry, &delta_encoded_entry_slice); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + const bool kTotalOrderSeek = true; + const bool kIncludesSeq = true; + const bool kValueIsFull = !useValueDeltaEncoding(); + IndexBlockIter *kNullIter = nullptr; + Statistics *kNullStats = nullptr; + // read contents of block sequentially + InternalIteratorBase *iter = reader.NewIndexIterator( + options.comparator, options.comparator, kNullIter, kNullStats, + kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull); + iter->SeekToFirst(); + for (int index = 0; index < num_records; ++index) { + ASSERT_TRUE(iter->Valid()); + + Slice k = iter->key(); + IndexValue v = iter->value(); + + EXPECT_EQ(separators[index], k.ToString()); + EXPECT_EQ(block_handles[index].offset(), v.handle.offset()); + EXPECT_EQ(block_handles[index].size(), v.handle.size()); + EXPECT_EQ(includeFirstKey() ? first_keys[index] : "", + v.first_internal_key.ToString()); + + iter->Next(); + } + delete iter; + + // read block contents randomly + iter = reader.NewIndexIterator(options.comparator, options.comparator, + kNullIter, kNullStats, kTotalOrderSeek, + includeFirstKey(), kIncludesSeq, kValueIsFull); + for (int i = 0; i < num_records * 2; i++) { + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(separators[index]); + + // search in block for this key + iter->Seek(k); + ASSERT_TRUE(iter->Valid()); + IndexValue v = iter->value(); + EXPECT_EQ(separators[index], iter->key().ToString()); + EXPECT_EQ(block_handles[index].offset(), v.handle.offset()); + EXPECT_EQ(block_handles[index].size(), v.handle.size()); + EXPECT_EQ(includeFirstKey() ? 
first_keys[index] : "", + v.first_internal_key.ToString()); + } + delete iter; +} + +INSTANTIATE_TEST_CASE_P(P, IndexBlockTest, + ::testing::Values(std::make_tuple(false, false), + std::make_tuple(false, true), + std::make_tuple(true, false), + std::make_tuple(true, true))); + } // namespace rocksdb int main(int argc, char **argv) { diff --git a/table/block_based/block_type.h b/table/block_based/block_type.h new file mode 100644 index 00000000000..a60be2e6a70 --- /dev/null +++ b/table/block_based/block_type.h @@ -0,0 +1,30 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +namespace rocksdb { + +// Represents the types of blocks used in the block based table format. +// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format +// for details. + +enum class BlockType : uint8_t { + kData, + kFilter, + kProperties, + kCompressionDictionary, + kRangeDeletion, + kHashIndexPrefixes, + kHashIndexMetadata, + kMetaIndex, + kIndex, + // Note: keep kInvalid the last value when adding new enum values. + kInvalid +}; + +} // namespace rocksdb diff --git a/table/block_based/cachable_entry.h b/table/block_based/cachable_entry.h new file mode 100644 index 00000000000..b4cd6ec6757 --- /dev/null +++ b/table/block_based/cachable_entry.h @@ -0,0 +1,220 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include "port/likely.h" +#include "rocksdb/cache.h" +#include "rocksdb/cleanable.h" + +namespace rocksdb { + +// CachableEntry is a handle to an object that may or may not be in the block +// cache. It is used in a variety of ways: +// +// 1) It may refer to an object in the block cache. In this case, cache_ and +// cache_handle_ are not nullptr, and the cache handle has to be released when +// the CachableEntry is destroyed (the lifecycle of the cached object, on the +// other hand, is managed by the cache itself). +// 2) It may uniquely own the (non-cached) object it refers to (examples include +// a block read directly from file, or uncompressed blocks when there is a +// compressed block cache but no uncompressed block cache). In such cases, the +// object has to be destroyed when the CachableEntry is destroyed. +// 3) It may point to an object (cached or not) without owning it. In this case, +// no action is needed when the CachableEntry is destroyed. +// 4) Sometimes, management of a cached or owned object (see #1 and #2 above) +// is transferred to some other object. This is used for instance with iterators +// (where cleanup is performed using a chain of cleanup functions, +// see Cleanable). +// +// Because of #1 and #2 above, copying a CachableEntry is not safe (and thus not +// allowed); hence, this is a move-only type, where a move transfers the +// management responsibilities, and leaves the source object in an empty state. 
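To make the ownership cases above concrete, here is a minimal sketch (not from this patch, using int as a stand-in for a block type) of the owned, unowned, and moved-from states; the cached case (#1) behaves the same way except that destruction additionally releases the Cache::Handle back to the block cache:

    #include <cassert>
    #include <utility>
    #include "table/block_based/cachable_entry.h"

    void OwnershipSketch() {
      using rocksdb::CachableEntry;

      // Case 2: the entry uniquely owns a heap-allocated object; it is
      // deleted when the entry is destroyed or Reset().
      CachableEntry<int> owned;
      owned.SetOwnedValue(new int(42));
      assert(owned.GetOwnValue() && !owned.IsCached());

      // Case 3: the entry merely points at an object managed elsewhere;
      // nothing is cleaned up on destruction.
      int pinned_elsewhere = 7;
      CachableEntry<int> unowned;
      unowned.SetUnownedValue(&pinned_elsewhere);

      // Case 4: a move transfers the cleanup responsibility and leaves the
      // source empty.
      CachableEntry<int> moved = std::move(owned);
      assert(owned.IsEmpty() && moved.GetOwnValue());
    }
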
+ +template +class CachableEntry { +public: + CachableEntry() = default; + + CachableEntry(T* value, Cache* cache, Cache::Handle* cache_handle, + bool own_value) + : value_(value) + , cache_(cache) + , cache_handle_(cache_handle) + , own_value_(own_value) + { + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + } + + CachableEntry(const CachableEntry&) = delete; + CachableEntry& operator=(const CachableEntry&) = delete; + + CachableEntry(CachableEntry&& rhs) + : value_(rhs.value_) + , cache_(rhs.cache_) + , cache_handle_(rhs.cache_handle_) + , own_value_(rhs.own_value_) + { + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + + rhs.ResetFields(); + } + + CachableEntry& operator=(CachableEntry&& rhs) { + if (UNLIKELY(this == &rhs)) { + return *this; + } + + ReleaseResource(); + + value_ = rhs.value_; + cache_ = rhs.cache_; + cache_handle_ = rhs.cache_handle_; + own_value_ = rhs.own_value_; + + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + + rhs.ResetFields(); + + return *this; + } + + ~CachableEntry() { + ReleaseResource(); + } + + bool IsEmpty() const { + return value_ == nullptr && cache_ == nullptr && cache_handle_ == nullptr && + !own_value_; + } + + bool IsCached() const { + assert(!!cache_ == !!cache_handle_); + + return cache_handle_ != nullptr; + } + + T* GetValue() const { return value_; } + Cache* GetCache() const { return cache_; } + Cache::Handle* GetCacheHandle() const { return cache_handle_; } + bool GetOwnValue() const { return own_value_; } + + void Reset() { + ReleaseResource(); + ResetFields(); + } + + void TransferTo(Cleanable* cleanable) { + if (cleanable) { + if (cache_handle_ != nullptr) { + assert(cache_ != nullptr); + cleanable->RegisterCleanup(&ReleaseCacheHandle, cache_, cache_handle_); + } else if (own_value_) { + cleanable->RegisterCleanup(&DeleteValue, value_, nullptr); + } + } + + ResetFields(); + } + + void SetOwnedValue(T* value) { + assert(value != nullptr); + + if (UNLIKELY(value_ == value && own_value_)) { + assert(cache_ == nullptr && cache_handle_ == nullptr); + return; + } + + Reset(); + + value_ = value; + own_value_ = true; + } + + void SetUnownedValue(T* value) { + assert(value != nullptr); + + if (UNLIKELY(value_ == value && cache_ == nullptr && + cache_handle_ == nullptr && !own_value_)) { + return; + } + + Reset(); + + value_ = value; + assert(!own_value_); + } + + void SetCachedValue(T* value, Cache* cache, Cache::Handle* cache_handle) { + assert(value != nullptr); + assert(cache != nullptr); + assert(cache_handle != nullptr); + + if (UNLIKELY(value_ == value && cache_ == cache && + cache_handle_ == cache_handle && !own_value_)) { + return; + } + + Reset(); + + value_ = value; + cache_ = cache; + cache_handle_ = cache_handle; + assert(!own_value_); + } + +private: + void ReleaseResource() { + if (LIKELY(cache_handle_ != nullptr)) { + assert(cache_ != nullptr); + cache_->Release(cache_handle_); + } else if (own_value_) { + delete value_; + } + } + + void ResetFields() { + value_ = nullptr; + cache_ = nullptr; + cache_handle_ = nullptr; + own_value_ = false; + } + + static void ReleaseCacheHandle(void* arg1, void* arg2) { + Cache* const cache = static_cast(arg1); + 
assert(cache); + + Cache::Handle* const cache_handle = static_cast(arg2); + assert(cache_handle); + + cache->Release(cache_handle); + } + + static void DeleteValue(void* arg1, void* /* arg2 */) { + delete static_cast(arg1); + } + +private: + T* value_ = nullptr; + Cache* cache_ = nullptr; + Cache::Handle* cache_handle_ = nullptr; + bool own_value_ = false; +}; + +} // namespace rocksdb diff --git a/table/data_block_footer.cc b/table/block_based/data_block_footer.cc similarity index 97% rename from table/data_block_footer.cc rename to table/block_based/data_block_footer.cc index cb9e1438152..2cf31b4c5ef 100644 --- a/table/data_block_footer.cc +++ b/table/block_based/data_block_footer.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "data_block_footer.h" +#include "table/block_based/data_block_footer.h" #include "rocksdb/table.h" diff --git a/table/data_block_footer.h b/table/block_based/data_block_footer.h similarity index 100% rename from table/data_block_footer.h rename to table/block_based/data_block_footer.h diff --git a/table/data_block_hash_index.cc b/table/block_based/data_block_hash_index.cc similarity index 98% rename from table/data_block_hash_index.cc rename to table/block_based/data_block_hash_index.cc index adb1d7b8c26..7737a9491ee 100644 --- a/table/data_block_hash_index.cc +++ b/table/block_based/data_block_hash_index.cc @@ -6,7 +6,7 @@ #include #include "rocksdb/slice.h" -#include "table/data_block_hash_index.h" +#include "table/block_based/data_block_hash_index.h" #include "util/coding.h" #include "util/hash.h" diff --git a/table/data_block_hash_index.h b/table/block_based/data_block_hash_index.h similarity index 100% rename from table/data_block_hash_index.h rename to table/block_based/data_block_hash_index.h diff --git a/table/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc similarity index 97% rename from table/data_block_hash_index_test.cc rename to table/block_based/data_block_hash_index_test.cc index 11226648ef2..ae23f6ef2d3 100644 --- a/table/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -9,14 +9,14 @@ #include "db/table_properties_collector.h" #include "rocksdb/slice.h" -#include "table/block.h" -#include "table/block_based_table_reader.h" -#include "table/block_builder.h" -#include "table/data_block_hash_index.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/data_block_hash_index.h" #include "table/get_context.h" #include "table/table_builder.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { @@ -391,7 +391,7 @@ TEST(DataBlockHashIndex, BlockTestSingleKey) { Block reader(std::move(contents), kDisableGlobalSequenceNumber); const InternalKeyComparator icmp(BytewiseComparator()); - auto iter = reader.NewIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); bool may_exist; // search in block for the key just inserted { @@ -474,8 +474,7 @@ TEST(DataBlockHashIndex, BlockTestLarge) { // random seek existent keys for (int i = 0; i < num_records; i++) { - auto iter = - reader.NewIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); // find a 
random key in the lookaside array int index = rnd.Uniform(num_records); std::string ukey(keys[index] + "1" /* existing key marker */); @@ -512,8 +511,7 @@ TEST(DataBlockHashIndex, BlockTestLarge) { // C true false for (int i = 0; i < num_records; i++) { - auto iter = - reader.NewIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); // find a random key in the lookaside array int index = rnd.Uniform(num_records); std::string ukey(keys[index] + "0" /* non-existing key marker */); @@ -633,7 +631,7 @@ TEST(DataBlockHashIndex, BlockBoundary) { InternalKey seek_ikey(seek_ukey, 60, kTypeValue); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, seek_ukey, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); ASSERT_EQ(get_context.State(), GetContext::kFound); @@ -658,7 +656,7 @@ TEST(DataBlockHashIndex, BlockBoundary) { InternalKey seek_ikey(seek_ukey, 60, kTypeValue); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, seek_ukey, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); ASSERT_EQ(get_context.State(), GetContext::kFound); @@ -683,7 +681,7 @@ TEST(DataBlockHashIndex, BlockBoundary) { InternalKey seek_ikey(seek_ukey, 120, kTypeValue); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, seek_ukey, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); ASSERT_EQ(get_context.State(), GetContext::kFound); @@ -708,7 +706,7 @@ TEST(DataBlockHashIndex, BlockBoundary) { InternalKey seek_ikey(seek_ukey, 5, kTypeValue); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, seek_ukey, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); ASSERT_EQ(get_context.State(), GetContext::kNotFound); diff --git a/table/filter_block.h b/table/block_based/filter_block.h similarity index 67% rename from table/filter_block.h rename to table/block_based/filter_block.h index 8abb88e5f4f..90766b1140b 100644 --- a/table/filter_block.h +++ b/table/block_based/filter_block.h @@ -24,12 +24,13 @@ #include #include #include "db/dbformat.h" -#include "format.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "table/format.h" #include "table/multiget_context.h" +#include "trace_replay/block_cache_tracer.h" #include "util/hash.h" namespace rocksdb { @@ -37,6 +38,7 @@ namespace rocksdb { const uint64_t kNotValid = ULLONG_MAX; class FilterPolicy; +class GetContext; using MultiGetRange = MultiGetContext::Range; // A FilterBlockBuilder is used to construct all of the filters for a @@ -50,6 +52,10 @@ using MultiGetRange = MultiGetContext::Range; class FilterBlockBuilder { public: explicit FilterBlockBuilder() {} + // No copying allowed + FilterBlockBuilder(const FilterBlockBuilder&) = delete; + void operator=(const FilterBlockBuilder&) = delete; + virtual ~FilterBlockBuilder() {} virtual bool IsBlockBased() = 0; // If is blockbased filter @@ -64,11 +70,6 @@ class FilterBlockBuilder { return ret; } virtual Slice Finish(const BlockHandle& tmp, Status* status) = 0; 
- - private: - // No copying allowed - FilterBlockBuilder(const FilterBlockBuilder&); - void operator=(const FilterBlockBuilder&); }; // A FilterBlockReader is used to parse filter from SST table. @@ -77,16 +78,14 @@ class FilterBlockBuilder { // BlockBased/Full FilterBlock would be called in the same way. class FilterBlockReader { public: - explicit FilterBlockReader() - : whole_key_filtering_(true), size_(0), statistics_(nullptr) {} - explicit FilterBlockReader(size_t s, Statistics* stats, - bool _whole_key_filtering) - : whole_key_filtering_(_whole_key_filtering), - size_(s), - statistics_(stats) {} - virtual ~FilterBlockReader() {} + FilterBlockReader() = default; + virtual ~FilterBlockReader() = default; + + FilterBlockReader(const FilterBlockReader&) = delete; + FilterBlockReader& operator=(const FilterBlockReader&) = delete; virtual bool IsBlockBased() = 0; // If is blockbased filter + /** * If no_io is set, then it returns true if it cannot answer the query without * reading data from disk. This is used in PartitionedFilterBlockReader to @@ -99,18 +98,21 @@ class FilterBlockReader { */ virtual bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) = 0; + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; virtual void KeysMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false) { + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey = iter->ukey; const Slice ikey = iter->ikey; - if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey)) { + GetContext* const get_context = iter->get_context; + if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey, + get_context, lookup_context)) { range->SkipKey(iter); } } @@ -121,29 +123,29 @@ class FilterBlockReader { */ virtual bool PrefixMayMatch(const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) = 0; + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; virtual void PrefixesMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false) { + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey = iter->ukey; const Slice ikey = iter->ikey; - if (!KeyMayMatch(prefix_extractor->Transform(ukey), prefix_extractor, - block_offset, no_io, &ikey)) { + GetContext* const get_context = iter->get_context; + if (prefix_extractor->InDomain(ukey) && + !PrefixMayMatch(prefix_extractor->Transform(ukey), prefix_extractor, + block_offset, no_io, &ikey, get_context, + lookup_context)) { range->SkipKey(iter); } } } virtual size_t ApproximateMemoryUsage() const = 0; - virtual size_t size() const { return size_; } - virtual Statistics* statistics() const { return statistics_; } - - bool whole_key_filtering() const { return whole_key_filtering_; } // convert this object to a human readable form virtual std::string 
ToString() const { @@ -151,30 +153,22 @@ class FilterBlockReader { return error_msg; } - virtual void CacheDependencies(bool /*pin*/, - const SliceTransform* /*prefix_extractor*/) {} + virtual void CacheDependencies(bool /*pin*/) {} - virtual bool RangeMayExist( - const Slice* /*iterate_upper_bound*/, const Slice& user_key, - const SliceTransform* prefix_extractor, - const Comparator* /*comparator*/, const Slice* const const_ikey_ptr, - bool* filter_checked, bool /*need_upper_bound_check*/) { + virtual bool RangeMayExist(const Slice* /*iterate_upper_bound*/, + const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* /*comparator*/, + const Slice* const const_ikey_ptr, + bool* filter_checked, + bool /*need_upper_bound_check*/, + BlockCacheLookupContext* lookup_context) { *filter_checked = true; Slice prefix = prefix_extractor->Transform(user_key); return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, - const_ikey_ptr); + const_ikey_ptr, /* get_context */ nullptr, + lookup_context); } - - protected: - bool whole_key_filtering_; - - private: - // No copying allowed - FilterBlockReader(const FilterBlockReader&); - void operator=(const FilterBlockReader&); - size_t size_; - Statistics* statistics_; - int level_ = -1; }; } // namespace rocksdb diff --git a/table/block_based/filter_block_reader_common.cc b/table/block_based/filter_block_reader_common.cc new file mode 100644 index 00000000000..49a26882305 --- /dev/null +++ b/table/block_based/filter_block_reader_common.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "table/block_based/filter_block_reader_common.h" +#include "monitoring/perf_context_imp.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/parsed_full_filter_block.h" + +namespace rocksdb { + +template +Status FilterBlockReaderCommon::ReadFilterBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) { + PERF_TIMER_GUARD(read_filter_block_nanos); + + assert(table); + assert(filter_block); + assert(filter_block->IsEmpty()); + + const BlockBasedTable::Rep* const rep = table->get_rep(); + assert(rep); + + const Status s = + table->RetrieveBlock(prefetch_buffer, read_options, rep->filter_handle, + UncompressionDict::GetEmptyDict(), filter_block, + BlockType::kFilter, get_context, lookup_context, + /* for_compaction */ false, use_cache); + + return s; +} + +template +const SliceTransform* +FilterBlockReaderCommon::table_prefix_extractor() const { + assert(table_); + + const BlockBasedTable::Rep* const rep = table_->get_rep(); + assert(rep); + + return rep->prefix_filtering ? 
rep->table_prefix_extractor.get() : nullptr; +} + +template +bool FilterBlockReaderCommon::whole_key_filtering() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->whole_key_filtering; +} + +template +bool FilterBlockReaderCommon::cache_filter_blocks() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->table_options.cache_index_and_filter_blocks; +} + +template +Status FilterBlockReaderCommon::GetOrReadFilterBlock( + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const { + assert(filter_block); + + if (!filter_block_.IsEmpty()) { + filter_block->SetUnownedValue(filter_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadFilterBlock(table_, nullptr /* prefetch_buffer */, read_options, + cache_filter_blocks(), get_context, lookup_context, + filter_block); +} + +template +size_t FilterBlockReaderCommon::ApproximateFilterBlockMemoryUsage() + const { + assert(!filter_block_.GetOwnValue() || filter_block_.GetValue() != nullptr); + return filter_block_.GetOwnValue() + ? filter_block_.GetValue()->ApproximateMemoryUsage() + : 0; +} + +// Explicitly instantiate templates for both "blocklike" types we use. +// This makes it possible to keep the template definitions in the .cc file. +template class FilterBlockReaderCommon; +template class FilterBlockReaderCommon; +template class FilterBlockReaderCommon; + +} // namespace rocksdb diff --git a/table/block_based/filter_block_reader_common.h b/table/block_based/filter_block_reader_common.h new file mode 100644 index 00000000000..4e691e0d913 --- /dev/null +++ b/table/block_based/filter_block_reader_common.h @@ -0,0 +1,55 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include +#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block.h" + +namespace rocksdb { + +class BlockBasedTable; +class FilePrefetchBuffer; + +// Encapsulates common functionality for the various filter block reader +// implementations. Provides access to the filter block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. 
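As a rough illustration of how this base class is meant to be used (a hypothetical skeleton, not code from this patch; the remaining pure-virtual overrides and the actual bits-matching logic are omitted, so the class stays abstract), a concrete reader parameterized on ParsedFullFilterBlock could fetch its filter block on demand like so:

    #include "table/block_based/filter_block_reader_common.h"
    #include "table/block_based/parsed_full_filter_block.h"

    class SketchFilterReader
        : public rocksdb::FilterBlockReaderCommon<rocksdb::ParsedFullFilterBlock> {
     public:
      using Base = rocksdb::FilterBlockReaderCommon<rocksdb::ParsedFullFilterBlock>;
      using Base::Base;  // inherit the (table, filter block) constructor

      bool KeyMayMatch(const rocksdb::Slice& /*key*/,
                       const rocksdb::SliceTransform* /*prefix_extractor*/,
                       uint64_t /*block_offset*/, const bool no_io,
                       const rocksdb::Slice* const /*const_ikey_ptr*/,
                       rocksdb::GetContext* get_context,
                       rocksdb::BlockCacheLookupContext* lookup_context) override {
        rocksdb::CachableEntry<rocksdb::ParsedFullFilterBlock> block;
        // Serves the block from the pinned member, the block cache, or the
        // file, honoring `no_io` by restricting the read tier.
        if (!GetOrReadFilterBlock(no_io, get_context, lookup_context, &block)
                 .ok()) {
          return true;  // on error, do not filter anything out
        }
        // ... consult the parsed filter in block.GetValue() here ...
        return true;
      }
      // IsBlockBased(), PrefixMayMatch(), ApproximateMemoryUsage(), etc.
      // would be overridden in a real implementation.
    };
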
+template +class FilterBlockReaderCommon : public FilterBlockReader { + public: + FilterBlockReaderCommon(const BlockBasedTable* t, + CachableEntry&& filter_block) + : table_(t), filter_block_(std::move(filter_block)) { + assert(table_); + } + + protected: + static Status ReadFilterBlock(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block); + + const BlockBasedTable* table() const { return table_; } + const SliceTransform* table_prefix_extractor() const; + bool whole_key_filtering() const; + bool cache_filter_blocks() const; + + Status GetOrReadFilterBlock(bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const; + + size_t ApproximateFilterBlockMemoryUsage() const; + + private: + const BlockBasedTable* table_; + CachableEntry filter_block_; +}; + +} // namespace rocksdb diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc new file mode 100644 index 00000000000..38a6e9f83b2 --- /dev/null +++ b/table/block_based/filter_policy.cc @@ -0,0 +1,697 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include + +#include "rocksdb/filter_policy.h" + +#include "rocksdb/slice.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/filter_policy_internal.h" +#include "third-party/folly/folly/ConstexprMath.h" +#include "util/bloom_impl.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace rocksdb { + +namespace { + +// See description in FastLocalBloomImpl +class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { + public: + explicit FastLocalBloomBitsBuilder(const int millibits_per_key) + : millibits_per_key_(millibits_per_key), + num_probes_(FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_)) { + assert(millibits_per_key >= 1000); + } + + // No Copy allowed + FastLocalBloomBitsBuilder(const FastLocalBloomBitsBuilder&) = delete; + void operator=(const FastLocalBloomBitsBuilder&) = delete; + + ~FastLocalBloomBitsBuilder() override {} + + virtual void AddKey(const Slice& key) override { + uint64_t hash = GetSliceHash64(key); + if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + } + } + + virtual Slice Finish(std::unique_ptr* buf) override { + uint32_t len_with_metadata = + CalculateSpace(static_cast(hash_entries_.size())); + char* data = new char[len_with_metadata]; + memset(data, 0, len_with_metadata); + + assert(data); + assert(len_with_metadata >= 5); + + uint32_t len = len_with_metadata - 5; + if (len > 0) { + AddAllEntries(data, len); + } + + // See BloomFilterPolicy::GetBloomBitsReader re: metadata + // -1 = Marker for newer Bloom implementations + data[len] = static_cast(-1); + // 0 = Marker for this sub-implementation + data[len + 1] = static_cast(0); + // num_probes (and 0 in upper bits for 64-byte block size) + data[len + 2] = static_cast(num_probes_); 
+ // rest of metadata stays zero + + const char* const_data = data; + buf->reset(const_data); + hash_entries_.clear(); + + return Slice(data, len_with_metadata); + } + + int CalculateNumEntry(const uint32_t bytes) override { + uint32_t bytes_no_meta = bytes >= 5u ? bytes - 5u : 0; + return static_cast(uint64_t{8000} * bytes_no_meta / + millibits_per_key_); + } + + uint32_t CalculateSpace(const int num_entry) override { + uint32_t num_cache_lines = 0; + if (millibits_per_key_ > 0 && num_entry > 0) { + num_cache_lines = static_cast( + (int64_t{num_entry} * millibits_per_key_ + 511999) / 512000); + } + return num_cache_lines * 64 + /*metadata*/ 5; + } + + private: + void AddAllEntries(char* data, uint32_t len) const { + // Simple version without prefetching: + // + // for (auto h : hash_entries_) { + // FastLocalBloomImpl::AddHash(Lower32of64(h), Upper32of64(h), len, + // num_probes_, data); + // } + + const size_t num_entries = hash_entries_.size(); + constexpr size_t kBufferMask = 7; + static_assert(((kBufferMask + 1) & kBufferMask) == 0, + "Must be power of 2 minus 1"); + + std::array hashes; + std::array byte_offsets; + + // Prime the buffer + size_t i = 0; + for (; i <= kBufferMask && i < num_entries; ++i) { + uint64_t h = hash_entries_[i]; + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data, + /*out*/ &byte_offsets[i]); + hashes[i] = Upper32of64(h); + } + + // Process and buffer + for (; i < num_entries; ++i) { + uint32_t& hash_ref = hashes[i & kBufferMask]; + uint32_t& byte_offset_ref = byte_offsets[i & kBufferMask]; + // Process (add) + FastLocalBloomImpl::AddHashPrepared(hash_ref, num_probes_, + data + byte_offset_ref); + // And buffer + uint64_t h = hash_entries_[i]; + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data, + /*out*/ &byte_offset_ref); + hash_ref = Upper32of64(h); + } + + // Finish processing + for (i = 0; i <= kBufferMask && i < num_entries; ++i) { + FastLocalBloomImpl::AddHashPrepared(hashes[i], num_probes_, + data + byte_offsets[i]); + } + } + + int millibits_per_key_; + int num_probes_; + std::vector hash_entries_; +}; + +// See description in FastLocalBloomImpl +class FastLocalBloomBitsReader : public FilterBitsReader { + public: + FastLocalBloomBitsReader(const char* data, int num_probes, uint32_t len_bytes) + : data_(data), num_probes_(num_probes), len_bytes_(len_bytes) {} + + // No Copy allowed + FastLocalBloomBitsReader(const FastLocalBloomBitsReader&) = delete; + void operator=(const FastLocalBloomBitsReader&) = delete; + + ~FastLocalBloomBitsReader() override {} + + bool MayMatch(const Slice& key) override { + uint64_t h = GetSliceHash64(key); + uint32_t byte_offset; + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_, + /*out*/ &byte_offset); + return FastLocalBloomImpl::HashMayMatchPrepared(Upper32of64(h), num_probes_, + data_ + byte_offset); + } + + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + std::array hashes; + std::array byte_offsets; + for (int i = 0; i < num_keys; ++i) { + uint64_t h = GetSliceHash64(*keys[i]); + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_, + /*out*/ &byte_offsets[i]); + hashes[i] = Upper32of64(h); + } + for (int i = 0; i < num_keys; ++i) { + may_match[i] = FastLocalBloomImpl::HashMayMatchPrepared( + hashes[i], num_probes_, data_ + byte_offsets[i]); + } + } + + private: + const char* data_; + const int num_probes_; + const uint32_t len_bytes_; +}; + +using LegacyBloomImpl = LegacyLocalityBloomImpl; + +class LegacyBloomBitsBuilder : public 
BuiltinFilterBitsBuilder { + public: + explicit LegacyBloomBitsBuilder(const int bits_per_key); + + // No Copy allowed + LegacyBloomBitsBuilder(const LegacyBloomBitsBuilder&) = delete; + void operator=(const LegacyBloomBitsBuilder&) = delete; + + ~LegacyBloomBitsBuilder() override; + + void AddKey(const Slice& key) override; + + Slice Finish(std::unique_ptr* buf) override; + + int CalculateNumEntry(const uint32_t bytes) override; + + uint32_t CalculateSpace(const int num_entry) override { + uint32_t dont_care1; + uint32_t dont_care2; + return CalculateSpace(num_entry, &dont_care1, &dont_care2); + } + + private: + int bits_per_key_; + int num_probes_; + std::vector hash_entries_; + + // Get totalbits that optimized for cpu cache line + uint32_t GetTotalBitsForLocality(uint32_t total_bits); + + // Reserve space for new filter + char* ReserveSpace(const int num_entry, uint32_t* total_bits, + uint32_t* num_lines); + + // Implementation-specific variant of public CalculateSpace + uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits, + uint32_t* num_lines); + + // Assuming single threaded access to this function. + void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits); +}; + +LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key) + : bits_per_key_(bits_per_key), + num_probes_(LegacyNoLocalityBloomImpl::ChooseNumProbes(bits_per_key_)) { + assert(bits_per_key_); +} + +LegacyBloomBitsBuilder::~LegacyBloomBitsBuilder() {} + +void LegacyBloomBitsBuilder::AddKey(const Slice& key) { + uint32_t hash = BloomHash(key); + if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + } +} + +Slice LegacyBloomBitsBuilder::Finish(std::unique_ptr* buf) { + uint32_t total_bits, num_lines; + char* data = ReserveSpace(static_cast(hash_entries_.size()), &total_bits, + &num_lines); + assert(data); + + if (total_bits != 0 && num_lines != 0) { + for (auto h : hash_entries_) { + AddHash(h, data, num_lines, total_bits); + } + } + // See BloomFilterPolicy::GetFilterBitsReader for metadata + data[total_bits / 8] = static_cast(num_probes_); + EncodeFixed32(data + total_bits / 8 + 1, static_cast(num_lines)); + + const char* const_data = data; + buf->reset(const_data); + hash_entries_.clear(); + + return Slice(data, total_bits / 8 + 5); +} + +uint32_t LegacyBloomBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_lines = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + + // Make num_lines an odd number to make sure more bits are involved + // when determining which block. 
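// Worked example (assuming CACHE_LINE_SIZE == 64, i.e. 512 bits per line):
// 1000 keys at 10 bits/key -> 10000 bits -> ceil(10000 / 512) = 20 lines,
// which is even, so it is bumped to 21 lines = 21 * 512 = 10752 bits.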
+ if (num_lines % 2 == 0) { + num_lines++; + } + return num_lines * (CACHE_LINE_SIZE * 8); +} + +uint32_t LegacyBloomBitsBuilder::CalculateSpace(const int num_entry, + uint32_t* total_bits, + uint32_t* num_lines) { + assert(bits_per_key_); + if (num_entry != 0) { + uint32_t total_bits_tmp = static_cast(num_entry * bits_per_key_); + + *total_bits = GetTotalBitsForLocality(total_bits_tmp); + *num_lines = *total_bits / (CACHE_LINE_SIZE * 8); + assert(*total_bits > 0 && *total_bits % 8 == 0); + } else { + // filter is empty, just leave space for metadata + *total_bits = 0; + *num_lines = 0; + } + + // Reserve space for Filter + uint32_t sz = *total_bits / 8; + sz += 5; // 4 bytes for num_lines, 1 byte for num_probes + return sz; +} + +char* LegacyBloomBitsBuilder::ReserveSpace(const int num_entry, + uint32_t* total_bits, + uint32_t* num_lines) { + uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines); + char* data = new char[sz]; + memset(data, 0, sz); + return data; +} + +int LegacyBloomBitsBuilder::CalculateNumEntry(const uint32_t bytes) { + assert(bits_per_key_); + assert(bytes > 0); + int high = static_cast(bytes * 8 / bits_per_key_ + 1); + int low = 1; + int n = high; + for (; n >= low; n--) { + if (CalculateSpace(n) <= bytes) { + break; + } + } + assert(n < high); // High should be an overestimation + return n; +} + +inline void LegacyBloomBitsBuilder::AddHash(uint32_t h, char* data, + uint32_t num_lines, + uint32_t total_bits) { +#ifdef NDEBUG + static_cast(total_bits); +#endif + assert(num_lines > 0 && total_bits > 0); + + LegacyBloomImpl::AddHash(h, num_lines, num_probes_, data, + folly::constexpr_log2(CACHE_LINE_SIZE)); +} + +class LegacyBloomBitsReader : public FilterBitsReader { + public: + LegacyBloomBitsReader(const char* data, int num_probes, uint32_t num_lines, + uint32_t log2_cache_line_size) + : data_(data), + num_probes_(num_probes), + num_lines_(num_lines), + log2_cache_line_size_(log2_cache_line_size) {} + + // No Copy allowed + LegacyBloomBitsReader(const LegacyBloomBitsReader&) = delete; + void operator=(const LegacyBloomBitsReader&) = delete; + + ~LegacyBloomBitsReader() override {} + + // "contents" contains the data built by a preceding call to + // FilterBitsBuilder::Finish. MayMatch must return true if the key was + // passed to FilterBitsBuilder::AddKey. This method may return true or false + // if the key was not on the list, but it should aim to return false with a + // high probability. 
+ bool MayMatch(const Slice& key) override { + uint32_t hash = BloomHash(key); + uint32_t byte_offset; + LegacyBloomImpl::PrepareHashMayMatch( + hash, num_lines_, data_, /*out*/ &byte_offset, log2_cache_line_size_); + return LegacyBloomImpl::HashMayMatchPrepared( + hash, num_probes_, data_ + byte_offset, log2_cache_line_size_); + } + + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + std::array hashes; + std::array byte_offsets; + for (int i = 0; i < num_keys; ++i) { + hashes[i] = BloomHash(*keys[i]); + LegacyBloomImpl::PrepareHashMayMatch(hashes[i], num_lines_, data_, + /*out*/ &byte_offsets[i], + log2_cache_line_size_); + } + for (int i = 0; i < num_keys; ++i) { + may_match[i] = LegacyBloomImpl::HashMayMatchPrepared( + hashes[i], num_probes_, data_ + byte_offsets[i], + log2_cache_line_size_); + } + } + + private: + const char* data_; + const int num_probes_; + const uint32_t num_lines_; + const uint32_t log2_cache_line_size_; +}; + +class AlwaysTrueFilter : public FilterBitsReader { + public: + bool MayMatch(const Slice&) override { return true; } + using FilterBitsReader::MayMatch; // inherit overload +}; + +class AlwaysFalseFilter : public FilterBitsReader { + public: + bool MayMatch(const Slice&) override { return false; } + using FilterBitsReader::MayMatch; // inherit overload +}; + +} // namespace + +const std::vector BloomFilterPolicy::kAllFixedImpls = { + kLegacyBloom, + kDeprecatedBlock, + kFastLocalBloom, +}; + +const std::vector BloomFilterPolicy::kAllUserModes = { + kDeprecatedBlock, + kAuto, +}; + +BloomFilterPolicy::BloomFilterPolicy(double bits_per_key, Mode mode) + : mode_(mode) { + // Sanitize bits_per_key + if (bits_per_key < 1.0) { + bits_per_key = 1.0; + } else if (!(bits_per_key < 100.0)) { // including NaN + bits_per_key = 100.0; + } + + // Includes a nudge toward rounding up, to ensure on all platforms + // that doubles specified with three decimal digits after the decimal + // point are interpreted accurately. + millibits_per_key_ = static_cast(bits_per_key * 1000.0 + 0.500001); + + // For better or worse, this is a rounding up of a nudged rounding up, + // e.g. 7.4999999999999 will round up to 8, but that provides more + // predictability against small arithmetic errors in floating point. + whole_bits_per_key_ = (millibits_per_key_ + 500) / 1000; +} + +BloomFilterPolicy::~BloomFilterPolicy() {} + +const char* BloomFilterPolicy::Name() const { + return "rocksdb.BuiltinBloomFilter"; +} + +void BloomFilterPolicy::CreateFilter(const Slice* keys, int n, + std::string* dst) const { + // We should ideally only be using this deprecated interface for + // appropriately constructed BloomFilterPolicy + assert(mode_ == kDeprecatedBlock); + + // Compute bloom filter size (in both bits and bytes) + uint32_t bits = static_cast(n * whole_bits_per_key_); + + // For small n, we can see a very high false positive rate. Fix it + // by enforcing a minimum bloom filter length. 
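// e.g. with whole_bits_per_key_ == 10 and n == 1 the computed size would be
// only 10 bits, so clamp to a 64-bit (8-byte) minimum payload instead.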
+ if (bits < 64) bits = 64; + + uint32_t bytes = (bits + 7) / 8; + bits = bytes * 8; + + int num_probes = + LegacyNoLocalityBloomImpl::ChooseNumProbes(whole_bits_per_key_); + + const size_t init_size = dst->size(); + dst->resize(init_size + bytes, 0); + dst->push_back(static_cast(num_probes)); // Remember # of probes + char* array = &(*dst)[init_size]; + for (int i = 0; i < n; i++) { + LegacyNoLocalityBloomImpl::AddHash(BloomHash(keys[i]), bits, num_probes, + array); + } +} + +bool BloomFilterPolicy::KeyMayMatch(const Slice& key, + const Slice& bloom_filter) const { + const size_t len = bloom_filter.size(); + if (len < 2 || len > 0xffffffffU) { + return false; + } + + const char* array = bloom_filter.data(); + const uint32_t bits = static_cast(len - 1) * 8; + + // Use the encoded k so that we can read filters generated by + // bloom filters created using different parameters. + const int k = static_cast(array[len - 1]); + if (k > 30) { + // Reserved for potentially new encodings for short bloom filters. + // Consider it a match. + return true; + } + // NB: using stored k not num_probes for whole_bits_per_key_ + return LegacyNoLocalityBloomImpl::HashMayMatch(BloomHash(key), bits, k, + array); +} + +FilterBitsBuilder* BloomFilterPolicy::GetFilterBitsBuilder() const { + // This code path should no longer be used, for the built-in + // BloomFilterPolicy. Internal to RocksDB and outside + // BloomFilterPolicy, only get a FilterBitsBuilder with + // BloomFilterPolicy::GetBuilderFromContext(), which will call + // BloomFilterPolicy::GetBuilderWithContext(). RocksDB users have + // been warned (HISTORY.md) that they can no longer call this on + // the built-in BloomFilterPolicy (unlikely). + assert(false); + return GetBuilderWithContext(FilterBuildingContext(BlockBasedTableOptions())); +} + +FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( + const FilterBuildingContext& context) const { + Mode cur = mode_; + // Unusual code construction so that we can have just + // one exhaustive switch without (risky) recursion + for (int i = 0; i < 2; ++i) { + switch (cur) { + case kAuto: + if (context.table_options.format_version < 5) { + cur = kLegacyBloom; + } else { + cur = kFastLocalBloom; + } + break; + case kDeprecatedBlock: + return nullptr; + case kFastLocalBloom: + return new FastLocalBloomBitsBuilder(millibits_per_key_); + case kLegacyBloom: + return new LegacyBloomBitsBuilder(whole_bits_per_key_); + } + } + assert(false); + return nullptr; // something legal +} + +FilterBitsBuilder* BloomFilterPolicy::GetBuilderFromContext( + const FilterBuildingContext& context) { + if (context.table_options.filter_policy) { + return context.table_options.filter_policy->GetBuilderWithContext(context); + } else { + return nullptr; + } +} + +// Read metadata to determine what kind of FilterBitsReader is needed +// and return a new one. +FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( + const Slice& contents) const { + uint32_t len_with_meta = static_cast(contents.size()); + if (len_with_meta <= 5) { + // filter is empty or broken. Treat like zero keys added. + return new AlwaysFalseFilter(); + } + + // Legacy Bloom filter data: + // 0 +-----------------------------------+ + // | Raw Bloom filter data | + // | ... 
| + // len +-----------------------------------+ + // | byte for num_probes or | + // | marker for new implementations | + // len+1 +-----------------------------------+ + // | four bytes for number of cache | + // | lines | + // len_with_meta +-----------------------------------+ + + int8_t raw_num_probes = + static_cast(contents.data()[len_with_meta - 5]); + // NB: *num_probes > 30 and < 128 probably have not been used, because of + // BloomFilterPolicy::initialize, unless directly calling + // LegacyBloomBitsBuilder as an API, but we are leaving those cases in + // limbo with LegacyBloomBitsReader for now. + + if (raw_num_probes < 1) { + // Note: < 0 (or unsigned > 127) indicate special new implementations + // (or reserved for future use) + if (raw_num_probes == -1) { + // Marker for newer Bloom implementations + return GetBloomBitsReader(contents); + } + // otherwise + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } + // else attempt decode for LegacyBloomBitsReader + + int num_probes = raw_num_probes; + assert(num_probes >= 1); + assert(num_probes <= 127); + + uint32_t len = len_with_meta - 5; + assert(len > 0); + + uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4); + uint32_t log2_cache_line_size; + + if (num_lines * CACHE_LINE_SIZE == len) { + // Common case + log2_cache_line_size = folly::constexpr_log2(CACHE_LINE_SIZE); + } else if (num_lines == 0 || len % num_lines != 0) { + // Invalid (no solution to num_lines * x == len) + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } else { + // Determine the non-native cache line size (from another system) + log2_cache_line_size = 0; + while ((num_lines << log2_cache_line_size) < len) { + ++log2_cache_line_size; + } + if ((num_lines << log2_cache_line_size) != len) { + // Invalid (block size not a power of two) + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } + } + // if not early return + return new LegacyBloomBitsReader(contents.data(), num_probes, num_lines, + log2_cache_line_size); +} + +// For newer Bloom filter implementations +FilterBitsReader* BloomFilterPolicy::GetBloomBitsReader( + const Slice& contents) const { + uint32_t len_with_meta = static_cast(contents.size()); + uint32_t len = len_with_meta - 5; + + assert(len > 0); // precondition + + // New Bloom filter data: + // 0 +-----------------------------------+ + // | Raw Bloom filter data | + // | ... | + // len +-----------------------------------+ + // | char{-1} byte -> new Bloom filter | + // len+1 +-----------------------------------+ + // | byte for subimplementation | + // | 0: FastLocalBloom | + // | other: reserved | + // len+2 +-----------------------------------+ + // | byte for block_and_probes | + // | 0 in top 3 bits -> 6 -> 64-byte | + // | reserved: | + // | 1 in top 3 bits -> 7 -> 128-byte| + // | 2 in top 3 bits -> 8 -> 256-byte| + // | ... 
| + // | num_probes in bottom 5 bits, | + // | except 0 and 31 reserved | + // len+3 +-----------------------------------+ + // | two bytes reserved | + // | possibly for hash seed | + // len_with_meta +-----------------------------------+ + + // Read more metadata (see above) + char sub_impl_val = contents.data()[len_with_meta - 4]; + char block_and_probes = contents.data()[len_with_meta - 3]; + int log2_block_bytes = ((block_and_probes >> 5) & 7) + 6; + + int num_probes = (block_and_probes & 31); + if (num_probes < 1 || num_probes > 30) { + // Reserved / future safe + return new AlwaysTrueFilter(); + } + + uint16_t rest = DecodeFixed16(contents.data() + len_with_meta - 2); + if (rest != 0) { + // Reserved, possibly for hash seed + // Future safe + return new AlwaysTrueFilter(); + } + + if (sub_impl_val == 0) { // FastLocalBloom + if (log2_block_bytes == 6) { // Only block size supported for now + return new FastLocalBloomBitsReader(contents.data(), num_probes, len); + } + } + // otherwise + // Reserved / future safe + return new AlwaysTrueFilter(); +} + +const FilterPolicy* NewBloomFilterPolicy(double bits_per_key, + bool use_block_based_builder) { + BloomFilterPolicy::Mode m; + if (use_block_based_builder) { + m = BloomFilterPolicy::kDeprecatedBlock; + } else { + m = BloomFilterPolicy::kAuto; + } + assert(std::find(BloomFilterPolicy::kAllUserModes.begin(), + BloomFilterPolicy::kAllUserModes.end(), + m) != BloomFilterPolicy::kAllUserModes.end()); + return new BloomFilterPolicy(bits_per_key, m); +} + +FilterBuildingContext::FilterBuildingContext( + const BlockBasedTableOptions& _table_options) + : table_options(_table_options) {} + +FilterPolicy::~FilterPolicy() { } + +} // namespace rocksdb diff --git a/table/block_based/filter_policy_internal.h b/table/block_based/filter_policy_internal.h new file mode 100644 index 00000000000..b92980a546a --- /dev/null +++ b/table/block_based/filter_policy_internal.h @@ -0,0 +1,132 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include + +#include "rocksdb/filter_policy.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +class Slice; + +// Exposes any extra information needed for testing built-in +// FilterBitsBuilders +class BuiltinFilterBitsBuilder : public FilterBitsBuilder { + public: + // Calculate number of bytes needed for a new filter, including + // metadata. Passing the result to CalculateNumEntry should + // return >= the num_entry passed in. + virtual uint32_t CalculateSpace(const int num_entry) = 0; +}; + +// RocksDB built-in filter policy for Bloom or Bloom-like filters. +// This class is considered internal API and subject to change. +// See NewBloomFilterPolicy. +class BloomFilterPolicy : public FilterPolicy { + public: + // An internal marker for operating modes of BloomFilterPolicy, in terms + // of selecting an implementation. This makes it easier for tests to track + // or to walk over the built-in set of Bloom filter implementations. 
The + // only variance in BloomFilterPolicy by mode/implementation is in + // GetFilterBitsBuilder(), so an enum is practical here vs. subclasses. + // + // This enum is essentially the union of all the different kinds of return + // value from GetFilterBitsBuilder, or "underlying implementation", and + // higher-level modes that choose an underlying implementation based on + // context information. + enum Mode { + // Legacy implementation of Bloom filter for full and partitioned filters. + // Set to 0 in case of value confusion with bool use_block_based_builder + // NOTE: TESTING ONLY as this mode does not use best compatible + // implementation + kLegacyBloom = 0, + // Deprecated block-based Bloom filter implementation. + // Set to 1 in case of value confusion with bool use_block_based_builder + // NOTE: DEPRECATED but user exposed + kDeprecatedBlock = 1, + // A fast, cache-local Bloom filter implementation. See description in + // FastLocalBloomImpl. + // NOTE: TESTING ONLY as this mode does not check format_version + kFastLocalBloom = 2, + // Automatically choose from the above (except kDeprecatedBlock) based on + // context at build time, including compatibility with format_version. + // NOTE: This is currently the only recommended mode that is user exposed. + kAuto = 100, + }; + // All the different underlying implementations that a BloomFilterPolicy + // might use, as a mode that says "always use this implementation." + // Only appropriate for unit tests. + static const std::vector kAllFixedImpls; + + // All the different modes of BloomFilterPolicy that are exposed from + // user APIs. Only appropriate for higher-level unit tests. Integration + // tests should prefer using NewBloomFilterPolicy (user-exposed). + static const std::vector kAllUserModes; + + explicit BloomFilterPolicy(double bits_per_key, Mode mode); + + ~BloomFilterPolicy() override; + + const char* Name() const override; + + // Deprecated block-based filter only + void CreateFilter(const Slice* keys, int n, std::string* dst) const override; + + // Deprecated block-based filter only + bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override; + + FilterBitsBuilder* GetFilterBitsBuilder() const override; + + // To use this function, call GetBuilderFromContext(). + // + // Neither the context nor any objects therein should be saved beyond + // the call to this function, unless it's shared_ptr. + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext&) const override; + + // Returns a new FilterBitsBuilder from the filter_policy in + // table_options of a context, or nullptr if not applicable. + // (An internal convenience function to save boilerplate.) + static FilterBitsBuilder* GetBuilderFromContext(const FilterBuildingContext&); + + // Read metadata to determine what kind of FilterBitsReader is needed + // and return a new one. This must successfully process any filter data + // generated by a built-in FilterBitsBuilder, regardless of the impl + // chosen for this BloomFilterPolicy. Not compatible with CreateFilter. + FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override; + + // Essentially for testing only: configured millibits/key + int GetMillibitsPerKey() const { return millibits_per_key_; } + // Essentially for testing only: legacy whole bits/key + int GetWholeBitsPerKey() const { return whole_bits_per_key_; } + + private: + // Newer filters support fractional bits per key. 
For predictable behavior + // of 0.001-precision values across floating point implementations, we + // round to thousandths of a bit (on average) per key. + int millibits_per_key_; + + // Older filters round to whole number bits per key. (There *should* be no + // compatibility issue with fractional bits per key, but preserving old + // behavior with format_version < 5 just in case.) + int whole_bits_per_key_; + + // Selected mode (a specific implementation or way of selecting an + // implementation) for building new SST filters. + Mode mode_; + + // For newer Bloom filter implementation(s) + FilterBitsReader* GetBloomBitsReader(const Slice& contents) const; +}; + +} // namespace rocksdb diff --git a/table/flush_block_policy.cc b/table/block_based/flush_block_policy.cc similarity index 98% rename from table/flush_block_policy.cc rename to table/block_based/flush_block_policy.cc index 1b1675828da..31576848c07 100644 --- a/table/flush_block_policy.cc +++ b/table/block_based/flush_block_policy.cc @@ -6,7 +6,7 @@ #include "rocksdb/flush_block_policy.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" -#include "table/block_builder.h" +#include "table/block_based/block_builder.h" #include "table/format.h" #include diff --git a/table/flush_block_policy.h b/table/block_based/flush_block_policy.h similarity index 100% rename from table/flush_block_policy.h rename to table/block_based/flush_block_policy.h diff --git a/table/full_filter_block.cc b/table/block_based/full_filter_block.cc similarity index 52% rename from table/full_filter_block.cc rename to table/block_based/full_filter_block.cc index 9015e96d2ea..b3b2f58136b 100644 --- a/table/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -3,27 +3,22 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
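As a rough illustration of how the user-facing knobs above map onto an implementation, the sketch below distills the selection logic from NewBloomFilterPolicy and BloomFilterPolicy::GetBuilderWithContext shown earlier: use_block_based_builder pins the deprecated block-based filter, while the automatic mode resolves to the legacy full-filter builder for format_version < 5 and to the newer cache-local builder otherwise. The enum and helper here are standalone stand-ins for illustration, not actual RocksDB symbols.

#include <cstdint>
#include <iostream>

// Stand-in for the subset of BloomFilterPolicy::Mode reachable via user APIs.
enum class Impl { kDeprecatedBlock, kLegacyBloom, kFastLocalBloom };

// Mirrors the decision made by NewBloomFilterPolicy + GetBuilderWithContext:
// the block-based builder is a fixed (deprecated) choice; otherwise "auto"
// picks the legacy builder for format_version < 5 and the fast cache-local
// builder for format_version >= 5.
Impl ChooseImpl(bool use_block_based_builder, uint32_t format_version) {
  if (use_block_based_builder) {
    return Impl::kDeprecatedBlock;
  }
  return format_version < 5 ? Impl::kLegacyBloom : Impl::kFastLocalBloom;
}

int main() {
  std::cout << (ChooseImpl(false, 4) == Impl::kLegacyBloom) << "\n";     // 1
  std::cout << (ChooseImpl(false, 5) == Impl::kFastLocalBloom) << "\n";  // 1
  std::cout << (ChooseImpl(true, 5) == Impl::kDeprecatedBlock) << "\n";  // 1
  return 0;
}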
-#include "table/full_filter_block.h" - -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef OS_FREEBSD -#include -#else -#include -#endif -#endif +#include "table/block_based/full_filter_block.h" +#include #include "monitoring/perf_context_imp.h" +#include "port/malloc.h" #include "port/port.h" #include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" #include "util/coding.h" namespace rocksdb { FullFilterBlockBuilder::FullFilterBlockBuilder( - const SliceTransform* prefix_extractor, bool whole_key_filtering, + const SliceTransform* _prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder) - : prefix_extractor_(prefix_extractor), + : prefix_extractor_(_prefix_extractor), whole_key_filtering_(whole_key_filtering), last_whole_key_recorded_(false), last_prefix_recorded_(false), @@ -62,7 +57,7 @@ inline void FullFilterBlockBuilder::AddKey(const Slice& key) { } // Add prefix to filter if needed -inline void FullFilterBlockBuilder::AddPrefix(const Slice& key) { +void FullFilterBlockBuilder::AddPrefix(const Slice& key) { Slice prefix = prefix_extractor_->Transform(key); if (whole_key_filtering_) { // if both whole_key and prefix are added to bloom then we will have whole @@ -98,57 +93,87 @@ Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, } FullFilterBlockReader::FullFilterBlockReader( - const SliceTransform* prefix_extractor, bool _whole_key_filtering, - const Slice& contents, FilterBitsReader* filter_bits_reader, - Statistics* stats) - : FilterBlockReader(contents.size(), stats, _whole_key_filtering), - prefix_extractor_(prefix_extractor), - contents_(contents) { - assert(filter_bits_reader != nullptr); - filter_bits_reader_.reset(filter_bits_reader); - if (prefix_extractor_ != nullptr) { + const BlockBasedTable* t, + CachableEntry&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) { + const SliceTransform* const prefix_extractor = table_prefix_extractor(); + if (prefix_extractor) { full_length_enabled_ = - prefix_extractor_->FullLengthEnabled(&prefix_extractor_full_length_); + prefix_extractor->FullLengthEnabled(&prefix_extractor_full_length_); } } -FullFilterBlockReader::FullFilterBlockReader( - const SliceTransform* prefix_extractor, bool _whole_key_filtering, - BlockContents&& contents, FilterBitsReader* filter_bits_reader, - Statistics* stats) - : FullFilterBlockReader(prefix_extractor, _whole_key_filtering, - contents.data, filter_bits_reader, stats) { - block_contents_ = std::move(contents); -} - bool FullFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* /*prefix_extractor*/, - uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/) { + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)block_offset; #endif assert(block_offset == kNotValid); - if (!whole_key_filtering_) { + if (!whole_key_filtering()) { return true; } - return MayMatch(key); + return MayMatch(key, no_io, get_context, lookup_context); +} + +std::unique_ptr FullFilterBlockReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + use_cache, 
nullptr /* get_context */, + lookup_context, &filter_block); + if (!s.ok()) { + return std::unique_ptr(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr( + new FullFilterBlockReader(table, std::move(filter_block))); } bool FullFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* /* prefix_extractor */, - uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/) { + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)block_offset; #endif assert(block_offset == kNotValid); - return MayMatch(prefix); + return MayMatch(prefix, no_io, get_context, lookup_context); } -bool FullFilterBlockReader::MayMatch(const Slice& entry) { - if (contents_.size() != 0) { - if (filter_bits_reader_->MayMatch(entry)) { +bool FullFilterBlockReader::MayMatch( + const Slice& entry, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { + CachableEntry filter_block; + + const Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (!s.ok()) { + return true; + } + + assert(filter_block.GetValue()); + + FilterBitsReader* const filter_bits_reader = + filter_block.GetValue()->filter_bits_reader(); + + if (filter_bits_reader) { + if (filter_bits_reader->MayMatch(entry)) { PERF_COUNTER_ADD(bloom_sst_hit_count, 1); return true; } else { @@ -161,33 +186,50 @@ bool FullFilterBlockReader::MayMatch(const Slice& entry) { void FullFilterBlockReader::KeysMayMatch( MultiGetRange* range, const SliceTransform* /*prefix_extractor*/, - uint64_t block_offset, const bool /*no_io*/) { + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)range; (void)block_offset; #endif assert(block_offset == kNotValid); - if (!whole_key_filtering_) { + if (!whole_key_filtering()) { // Simply return. Don't skip any key - consider all keys as likely to be // present return; } - MayMatch(range); + MayMatch(range, no_io, nullptr, lookup_context); } void FullFilterBlockReader::PrefixesMayMatch( - MultiGetRange* range, const SliceTransform* /* prefix_extractor */, - uint64_t block_offset, const bool /*no_io*/) { + MultiGetRange* range, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)range; (void)block_offset; #endif assert(block_offset == kNotValid); - MayMatch(range); + MayMatch(range, no_io, prefix_extractor, lookup_context); } -void FullFilterBlockReader::MayMatch(MultiGetRange* range) { - if (contents_.size() == 0) { +void FullFilterBlockReader::MayMatch( + MultiGetRange* range, bool no_io, const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const { + CachableEntry filter_block; + + const Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context, + lookup_context, &filter_block); + if (!s.ok()) { + return; + } + + assert(filter_block.GetValue()); + + FilterBitsReader* const filter_bits_reader = + filter_block.GetValue()->filter_bits_reader(); + + if (!filter_bits_reader) { return; } @@ -195,39 +237,55 @@ void FullFilterBlockReader::MayMatch(MultiGetRange* range) { // &may_match[0] doesn't work for autovector (compiler error). 
So // declare both keys and may_match as arrays, which is also slightly less // expensive compared to autovector - Slice* keys[MultiGetContext::MAX_BATCH_SIZE]; - bool may_match[MultiGetContext::MAX_BATCH_SIZE] = {false}; + std::array keys; + std::array may_match = {{true}}; + autovector prefixes; int num_keys = 0; - for (auto iter = range->begin(); iter != range->end(); ++iter) { - keys[num_keys++] = &iter->ukey; + MultiGetRange filter_range(*range, range->begin(), range->end()); + for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) { + if (!prefix_extractor) { + keys[num_keys++] = &iter->ukey; + } else if (prefix_extractor->InDomain(iter->ukey)) { + prefixes.emplace_back(prefix_extractor->Transform(iter->ukey)); + keys[num_keys++] = &prefixes.back(); + } else { + filter_range.SkipKey(iter); + } } - filter_bits_reader_->MayMatch(num_keys, &keys[0], &may_match[0]); + + filter_bits_reader->MayMatch(num_keys, &keys[0], &may_match[0]); int i = 0; - for (auto iter = range->begin(); iter != range->end(); ++iter) { + for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) { if (!may_match[i]) { + // Update original MultiGet range to skip this key. The filter_range + // was temporarily used just to skip keys not in prefix_extractor domain range->SkipKey(iter); + PERF_COUNTER_ADD(bloom_sst_miss_count, 1); + } else { + // PERF_COUNTER_ADD(bloom_sst_hit_count, 1); + PerfContext* perf_ctx = get_perf_context(); + perf_ctx->bloom_sst_hit_count++; } ++i; } } size_t FullFilterBlockReader::ApproximateMemoryUsage() const { - size_t usage = block_contents_.usable_size(); + size_t usage = ApproximateFilterBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); - usage += malloc_usable_size(filter_bits_reader_.get()); + usage += malloc_usable_size(const_cast(this)); #else usage += sizeof(*this); - usage += sizeof(*filter_bits_reader_.get()); #endif // ROCKSDB_MALLOC_USABLE_SIZE return usage; } -bool FullFilterBlockReader::RangeMayExist(const Slice* iterate_upper_bound, - const Slice& user_key, const SliceTransform* prefix_extractor, - const Comparator* comparator, const Slice* const const_ikey_ptr, - bool* filter_checked, bool need_upper_bound_check) { +bool FullFilterBlockReader::RangeMayExist( + const Slice* iterate_upper_bound, const Slice& user_key, + const SliceTransform* prefix_extractor, const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check, BlockCacheLookupContext* lookup_context) { if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) { *filter_checked = false; return true; @@ -240,22 +298,23 @@ bool FullFilterBlockReader::RangeMayExist(const Slice* iterate_upper_bound, } else { *filter_checked = true; return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, - const_ikey_ptr); + const_ikey_ptr, /* get_context */ nullptr, + lookup_context); } } bool FullFilterBlockReader::IsFilterCompatible( const Slice* iterate_upper_bound, const Slice& prefix, - const Comparator* comparator) { + const Comparator* comparator) const { // Try to reuse the bloom filter in the SST table if prefix_extractor in // mutable_cf_options has changed. If range [user_key, upper_bound) all // share the same prefix then we may still be able to use the bloom filter. 
- if (iterate_upper_bound != nullptr && prefix_extractor_) { - if (!prefix_extractor_->InDomain(*iterate_upper_bound)) { + const SliceTransform* const prefix_extractor = table_prefix_extractor(); + if (iterate_upper_bound != nullptr && prefix_extractor) { + if (!prefix_extractor->InDomain(*iterate_upper_bound)) { return false; } - Slice upper_bound_xform = - prefix_extractor_->Transform(*iterate_upper_bound); + Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound); // first check if user_key and upper_bound all share the same prefix if (!comparator->Equal(prefix, upper_bound_xform)) { // second check if user_key's prefix is the immediate predecessor of diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h new file mode 100644 index 00000000000..04f1ec22849 --- /dev/null +++ b/table/block_based/full_filter_block.h @@ -0,0 +1,139 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/block_based/parsed_full_filter_block.h" +#include "util/hash.h" + +namespace rocksdb { + +class FilterPolicy; +class FilterBitsBuilder; +class FilterBitsReader; + +// A FullFilterBlockBuilder is used to construct a full filter for a +// particular Table. It generates a single string which is stored as +// a special block in the Table. +// The format of full filter block is: +// +----------------------------------------------------------------+ +// | full filter for all keys in sst file | +// +----------------------------------------------------------------+ +// The full filter can be very large. At the end of it, we put +// num_probes: how many hash functions are used in bloom filter +// +class FullFilterBlockBuilder : public FilterBlockBuilder { + public: + explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor, + bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder); + // No copying allowed + FullFilterBlockBuilder(const FullFilterBlockBuilder&) = delete; + void operator=(const FullFilterBlockBuilder&) = delete; + + // bits_builder is created in filter_policy, it should be passed in here + // directly. and be deleted here + ~FullFilterBlockBuilder() {} + + virtual bool IsBlockBased() override { return false; } + virtual void StartBlock(uint64_t /*block_offset*/) override {} + virtual void Add(const Slice& key) override; + virtual size_t NumAdded() const override { return num_added_; } + virtual Slice Finish(const BlockHandle& tmp, Status* status) override; + using FilterBlockBuilder::Finish; + + protected: + virtual void AddKey(const Slice& key); + std::unique_ptr filter_bits_builder_; + virtual void Reset(); + void AddPrefix(const Slice& key); + const SliceTransform* prefix_extractor() { return prefix_extractor_; } + + private: + // important: all of these might point to invalid addresses + // at the time of destruction of this filter block. destructor + // should NOT dereference them. 
+ const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + bool last_whole_key_recorded_; + std::string last_whole_key_str_; + bool last_prefix_recorded_; + std::string last_prefix_str_; + + uint32_t num_added_; + std::unique_ptr filter_data_; + +}; + +// A FilterBlockReader is used to parse filter from SST table. +// KeyMayMatch and PrefixMayMatch would trigger filter checking +class FullFilterBlockReader + : public FilterBlockReaderCommon { + public: + FullFilterBlockReader(const BlockBasedTable* t, + CachableEntry&& filter_block); + + static std::unique_ptr Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); + + bool IsBlockBased() override { return false; } + + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + void KeysMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) override; + + void PrefixesMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) override; + size_t ApproximateMemoryUsage() const override; + bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check, + BlockCacheLookupContext* lookup_context) override; + + private: + bool MayMatch(const Slice& entry, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; + void MayMatch(MultiGetRange* range, bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const; + bool IsFilterCompatible(const Slice* iterate_upper_bound, const Slice& prefix, + const Comparator* comparator) const; + + private: + bool full_length_enabled_; + size_t prefix_extractor_full_length_; +}; + +} // namespace rocksdb diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc new file mode 100644 index 00000000000..bde19121c95 --- /dev/null +++ b/table/block_based/full_filter_block_test.cc @@ -0,0 +1,333 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include + +#include "table/block_based/full_filter_block.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/mock_block_based_table.h" +#include "table/block_based/filter_policy_internal.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/string_util.h" + +namespace rocksdb { + +class TestFilterBitsBuilder : public FilterBitsBuilder { + public: + explicit TestFilterBitsBuilder() {} + + // Add Key to filter + void AddKey(const Slice& key) override { + hash_entries_.push_back(Hash(key.data(), key.size(), 1)); + } + + // Generate the filter using the keys that are added + Slice Finish(std::unique_ptr* buf) override { + uint32_t len = static_cast(hash_entries_.size()) * 4; + char* data = new char[len]; + for (size_t i = 0; i < hash_entries_.size(); i++) { + EncodeFixed32(data + i * 4, hash_entries_[i]); + } + const char* const_data = data; + buf->reset(const_data); + return Slice(data, len); + } + + private: + std::vector hash_entries_; +}; + +class MockBlockBasedTable : public BlockBasedTable { + public: + explicit MockBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {} +}; + +class TestFilterBitsReader : public FilterBitsReader { + public: + explicit TestFilterBitsReader(const Slice& contents) + : data_(contents.data()), len_(static_cast(contents.size())) {} + + // Silence compiler warning about overloaded virtual + using FilterBitsReader::MayMatch; + bool MayMatch(const Slice& entry) override { + uint32_t h = Hash(entry.data(), entry.size(), 1); + for (size_t i = 0; i + 4 <= len_; i += 4) { + if (h == DecodeFixed32(data_ + i)) { + return true; + } + } + return false; + } + + private: + const char* data_; + uint32_t len_; +}; + + +class TestHashFilter : public FilterPolicy { + public: + const char* Name() const override { return "TestHashFilter"; } + + void CreateFilter(const Slice* keys, int n, std::string* dst) const override { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + bool KeyMayMatch(const Slice& key, const Slice& filter) const override { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } + + FilterBitsBuilder* GetFilterBitsBuilder() const override { + return new TestFilterBitsBuilder(); + } + + FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override { + return new TestFilterBitsReader(contents); + } +}; + +class PluginFullFilterBlockTest : public mock::MockBlockBasedTableTester, + public testing::Test { + public: + PluginFullFilterBlockTest() + : mock::MockBlockBasedTableTester(new TestHashFilter) {} +}; + +TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) { + FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); + Slice slice = builder.Finish(); + ASSERT_EQ("", EscapeString(slice)); + + CachableEntry block( + new ParsedFullFilterBlock(table_options_.filter_policy.get(), + BlockContents(slice)), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + // Remain same symantic with blockbased filter + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + 
/*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { + FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.Add("box"); + builder.Add("hello"); + Slice slice = builder.Finish(); + + CachableEntry block( + new ParsedFullFilterBlock(table_options_.filter_policy.get(), + BlockContents(slice)), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +class FullFilterBlockTest : public mock::MockBlockBasedTableTester, + public testing::Test { + public: + FullFilterBlockTest() + : mock::MockBlockBasedTableTester(NewBloomFilterPolicy(10, false)) {} +}; + +TEST_F(FullFilterBlockTest, EmptyBuilder) { + FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); + Slice slice = builder.Finish(); + ASSERT_EQ("", EscapeString(slice)); + + CachableEntry block( + new ParsedFullFilterBlock(table_options_.filter_policy.get(), + BlockContents(slice)), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + // Remain same symantic with blockbased filter + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +class CountUniqueFilterBitsBuilderWrapper : public FilterBitsBuilder { + std::unique_ptr b_; + std::set uniq_; + + public: + explicit CountUniqueFilterBitsBuilderWrapper(FilterBitsBuilder* b) : b_(b) {} + + ~CountUniqueFilterBitsBuilderWrapper() override {} + + void AddKey(const Slice& key) override { + b_->AddKey(key); + uniq_.insert(key.ToString()); + } + + Slice Finish(std::unique_ptr* buf) override { + Slice rv = b_->Finish(buf); + uniq_.clear(); + return rv; + } + + int CalculateNumEntry(const uint32_t bytes) override { + return b_->CalculateNumEntry(bytes); + } + + size_t CountUnique() { return 
uniq_.size(); } +}; + +TEST_F(FullFilterBlockTest, DuplicateEntries) { + { // empty prefixes + std::unique_ptr prefix_extractor( + NewFixedPrefixTransform(0)); + auto bits_builder = new CountUniqueFilterBitsBuilderWrapper(GetBuilder()); + const bool WHOLE_KEY = true; + FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, + bits_builder); + ASSERT_EQ(0, builder.NumAdded()); + ASSERT_EQ(0, bits_builder->CountUnique()); + // adds key and empty prefix; both abstractions count them + builder.Add("key1"); + ASSERT_EQ(2, builder.NumAdded()); + ASSERT_EQ(2, bits_builder->CountUnique()); + // Add different key (unique) and also empty prefix (not unique). + // From here in this test, it's immaterial whether the block builder + // can count unique keys. + builder.Add("key2"); + ASSERT_EQ(3, bits_builder->CountUnique()); + // Empty key -> nothing unique + builder.Add(""); + ASSERT_EQ(3, bits_builder->CountUnique()); + } + + // mix of empty and non-empty + std::unique_ptr prefix_extractor( + NewFixedPrefixTransform(7)); + auto bits_builder = new CountUniqueFilterBitsBuilderWrapper(GetBuilder()); + const bool WHOLE_KEY = true; + FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, + bits_builder); + ASSERT_EQ(0, builder.NumAdded()); + builder.Add(""); // test with empty key too + builder.Add("prefix1key1"); + builder.Add("prefix1key1"); + builder.Add("prefix1key2"); + builder.Add("prefix1key3"); + builder.Add("prefix2key4"); + // 1 empty, 2 non-empty prefixes, and 4 non-empty keys + ASSERT_EQ(1 + 2 + 4, bits_builder->CountUnique()); +} + +TEST_F(FullFilterBlockTest, SingleChunk) { + FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); + ASSERT_EQ(0, builder.NumAdded()); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.Add("box"); + builder.Add("hello"); + ASSERT_EQ(5, builder.NumAdded()); + Slice slice = builder.Finish(); + + CachableEntry block( + new ParsedFullFilterBlock(table_options_.filter_policy.get(), + BlockContents(slice)), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, 
/*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/table/index_builder.cc b/table/block_based/index_builder.cc similarity index 93% rename from table/index_builder.cc rename to table/block_based/index_builder.cc index 63cb80598fe..f3a4b10e01e 100644 --- a/table/index_builder.cc +++ b/table/block_based/index_builder.cc @@ -7,17 +7,18 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "table/index_builder.h" +#include "table/block_based/index_builder.h" + #include -#include +#include #include #include #include "rocksdb/comparator.h" #include "rocksdb/flush_block_policy.h" +#include "table/block_based/partitioned_filter_block.h" #include "table/format.h" -#include "table/partitioned_filter_block.h" // Without anonymous namespace here, we fail the warning -Wmissing-prototypes namespace rocksdb { @@ -35,7 +36,7 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( result = new ShortenedIndexBuilder( comparator, table_opt.index_block_restart_interval, table_opt.format_version, use_value_delta_encoding, - table_opt.index_shortening); + table_opt.index_shortening, /* include_first_key */ false); } break; case BlockBasedTableOptions::kHashSearch: { result = new HashIndexBuilder( @@ -47,6 +48,12 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( result = PartitionedIndexBuilder::CreateIndexBuilder( comparator, use_value_delta_encoding, table_opt); } break; + case BlockBasedTableOptions::kBinarySearchWithFirstKey: { + result = new ShortenedIndexBuilder( + comparator, table_opt.index_block_restart_interval, + table_opt.format_version, use_value_delta_encoding, + table_opt.index_shortening, /* include_first_key */ true); + } break; default: { assert(!"Do not recognize the index type "); } break; @@ -93,7 +100,7 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { sub_index_builder_ = new ShortenedIndexBuilder( comparator_, table_opt_.index_block_restart_interval, table_opt_.format_version, use_value_delta_encoding_, - table_opt_.index_shortening); + table_opt_.index_shortening, /* include_first_key */ false); flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_opt_.metadata_block_size, table_opt_.block_size_deviation, // Note: this is sub-optimal since sub_index_builder_ could later reset diff --git a/table/index_builder.h b/table/block_based/index_builder.h similarity index 90% rename from table/index_builder.h rename to table/block_based/index_builder.h index 2f349fc5471..47348b31f78 100644 --- a/table/index_builder.h +++ b/table/block_based/index_builder.h @@ -10,15 +10,15 @@ #pragma once #include -#include +#include #include #include #include #include "rocksdb/comparator.h" -#include "table/block_based_table_factory.h" -#include "table/block_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_builder.h" #include "table/format.h" namespace rocksdb { @@ -58,6 +58,7 @@ class IndexBuilder { // To allow further optimization, we provide `last_key_in_current_block` and // `first_key_in_next_block`, based on which the specific implementation can // determine the best index key to be used for the index block. + // Called before the OnKeyAdded() call for first_key_in_next_block. 
// @last_key_in_current_block: this parameter maybe overridden with the value // "substitute key". // @first_key_in_next_block: it will be nullptr if the entry being added is @@ -123,7 +124,8 @@ class ShortenedIndexBuilder : public IndexBuilder { const InternalKeyComparator* comparator, const int index_block_restart_interval, const uint32_t format_version, const bool use_value_delta_encoding, - BlockBasedTableOptions::IndexShorteningMode shortening_mode) + BlockBasedTableOptions::IndexShorteningMode shortening_mode, + bool include_first_key) : IndexBuilder(comparator), index_block_builder_(index_block_restart_interval, true /*use_delta_encoding*/, @@ -131,11 +133,19 @@ class ShortenedIndexBuilder : public IndexBuilder { index_block_builder_without_seq_(index_block_restart_interval, true /*use_delta_encoding*/, use_value_delta_encoding), + use_value_delta_encoding_(use_value_delta_encoding), + include_first_key_(include_first_key), shortening_mode_(shortening_mode) { // Making the default true will disable the feature for old versions seperator_is_key_plus_seq_ = (format_version <= 2); } + virtual void OnKeyAdded(const Slice& key) override { + if (include_first_key_ && current_block_first_internal_key_.empty()) { + current_block_first_internal_key_.assign(key.data(), key.size()); + } + } + virtual void AddIndexEntry(std::string* last_key_in_current_block, const Slice* first_key_in_next_block, const BlockHandle& block_handle) override { @@ -159,20 +169,27 @@ class ShortenedIndexBuilder : public IndexBuilder { } auto sep = Slice(*last_key_in_current_block); - std::string handle_encoding; - block_handle.EncodeTo(&handle_encoding); - std::string handle_delta_encoding; - PutVarsignedint64(&handle_delta_encoding, - block_handle.size() - last_encoded_handle_.size()); - assert(handle_delta_encoding.size() != 0); + assert(!include_first_key_ || !current_block_first_internal_key_.empty()); + IndexValue entry(block_handle, current_block_first_internal_key_); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, include_first_key_, nullptr); + if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) { + entry.EncodeTo(&delta_encoded_entry, include_first_key_, + &last_encoded_handle_); + } else { + // If it's the first block, or delta encoding is disabled, + // BlockBuilder::Add() below won't use delta-encoded slice. 
+ } last_encoded_handle_ = block_handle; - const Slice handle_delta_encoding_slice(handle_delta_encoding); - index_block_builder_.Add(sep, handle_encoding, - &handle_delta_encoding_slice); + const Slice delta_encoded_entry_slice(delta_encoded_entry); + index_block_builder_.Add(sep, encoded_entry, &delta_encoded_entry_slice); if (!seperator_is_key_plus_seq_) { - index_block_builder_without_seq_.Add(ExtractUserKey(sep), handle_encoding, - &handle_delta_encoding_slice); + index_block_builder_without_seq_.Add(ExtractUserKey(sep), encoded_entry, + &delta_encoded_entry_slice); } + + current_block_first_internal_key_.clear(); } using IndexBuilder::Finish; @@ -200,9 +217,12 @@ class ShortenedIndexBuilder : public IndexBuilder { private: BlockBuilder index_block_builder_; BlockBuilder index_block_builder_without_seq_; + const bool use_value_delta_encoding_; bool seperator_is_key_plus_seq_; + const bool include_first_key_; BlockBasedTableOptions::IndexShorteningMode shortening_mode_; - BlockHandle last_encoded_handle_; + BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle(); + std::string current_block_first_internal_key_; }; // HashIndexBuilder contains a binary-searchable primary index and the @@ -243,7 +263,7 @@ class HashIndexBuilder : public IndexBuilder { : IndexBuilder(comparator), primary_index_builder_(comparator, index_block_restart_interval, format_version, use_value_delta_encoding, - shortening_mode), + shortening_mode, /* include_first_key */ false), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, diff --git a/table/block_based/mock_block_based_table.h b/table/block_based/mock_block_based_table.h new file mode 100644 index 00000000000..52891b1bdad --- /dev/null +++ b/table/block_based/mock_block_based_table.h @@ -0,0 +1,55 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
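The kBinarySearchWithFirstKey index type wired up in IndexBuilder::CreateIndexBuilder above (which makes ShortenedIndexBuilder store each data block's first internal key in the index entry) is selected through BlockBasedTableOptions. A minimal, hypothetical configuration sketch follows; the database path is made up and error handling is elided.

#include <rocksdb/db.h>
#include <rocksdb/options.h>
#include <rocksdb/table.h>

int main() {
  rocksdb::BlockBasedTableOptions table_options;
  // Opt into the index format that also records each block's first key.
  table_options.index_type =
      rocksdb::BlockBasedTableOptions::kBinarySearchWithFirstKey;

  rocksdb::Options options;
  options.create_if_missing = true;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/first_key_index_demo", &db);
  if (s.ok()) {
    delete db;
  }
  return s.ok() ? 0 : 1;
}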
+#pragma once + +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/filter_policy_internal.h" + +namespace rocksdb { +namespace mock { + +class MockBlockBasedTable : public BlockBasedTable { + public: + explicit MockBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {} +}; + +class MockBlockBasedTableTester { + static constexpr int kMockLevel = 0; + + public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; + BlockBasedTableOptions table_options_; + InternalKeyComparator icomp_; + std::unique_ptr table_; + + MockBlockBasedTableTester(const FilterPolicy *filter_policy) + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator) { + table_options_.filter_policy.reset(filter_policy); + + constexpr bool skip_filters = false; + constexpr bool immortal_table = false; + table_.reset(new MockBlockBasedTable(new BlockBasedTable::Rep( + ioptions_, env_options_, table_options_, icomp_, skip_filters, + kMockLevel, immortal_table))); + } + + FilterBitsBuilder* GetBuilder() const { + FilterBuildingContext context(table_options_); + context.column_family_name = "mock_cf"; + context.compaction_style = ioptions_.compaction_style; + context.level_at_creation = kMockLevel; + return BloomFilterPolicy::GetBuilderFromContext(context); + } +}; + +} // namespace mock +} // namespace rocksdb diff --git a/table/block_based/parsed_full_filter_block.cc b/table/block_based/parsed_full_filter_block.cc new file mode 100644 index 00000000000..5cc259d1906 --- /dev/null +++ b/table/block_based/parsed_full_filter_block.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "table/block_based/parsed_full_filter_block.h" +#include "rocksdb/filter_policy.h" + +namespace rocksdb { + +ParsedFullFilterBlock::ParsedFullFilterBlock(const FilterPolicy* filter_policy, + BlockContents&& contents) + : block_contents_(std::move(contents)), + filter_bits_reader_( + !block_contents_.data.empty() + ? filter_policy->GetFilterBitsReader(block_contents_.data) + : nullptr) {} + +ParsedFullFilterBlock::~ParsedFullFilterBlock() = default; + +} // namespace rocksdb diff --git a/table/block_based/parsed_full_filter_block.h b/table/block_based/parsed_full_filter_block.h new file mode 100644 index 00000000000..8f15f935058 --- /dev/null +++ b/table/block_based/parsed_full_filter_block.h @@ -0,0 +1,40 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "table/format.h" + +namespace rocksdb { + +class FilterBitsReader; +class FilterPolicy; + +// The sharable/cachable part of the full filter. 
+class ParsedFullFilterBlock { + public: + ParsedFullFilterBlock(const FilterPolicy* filter_policy, + BlockContents&& contents); + ~ParsedFullFilterBlock(); + + FilterBitsReader* filter_bits_reader() const { + return filter_bits_reader_.get(); + } + + // TODO: consider memory usage of the FilterBitsReader + size_t ApproximateMemoryUsage() const { + return block_contents_.ApproximateMemoryUsage(); + } + + bool own_bytes() const { return block_contents_.own_bytes(); } + + private: + BlockContents block_contents_; + std::unique_ptr filter_bits_reader_; +}; + +} // namespace rocksdb diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc new file mode 100644 index 00000000000..b9b96989fde --- /dev/null +++ b/table/block_based/partitioned_filter_block.cc @@ -0,0 +1,390 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/block_based/partitioned_filter_block.h" + +#include + +#include "monitoring/perf_context_imp.h" +#include "port/malloc.h" +#include "port/port.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/coding.h" + +namespace rocksdb { + +PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( + const SliceTransform* _prefix_extractor, bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, + PartitionedIndexBuilder* const p_index_builder, + const uint32_t partition_size) + : FullFilterBlockBuilder(_prefix_extractor, whole_key_filtering, + filter_bits_builder), + index_on_filter_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_on_filter_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + p_index_builder_(p_index_builder), + filters_in_partition_(0), + num_added_(0) { + filters_per_partition_ = + filter_bits_builder_->CalculateNumEntry(partition_size); +} + +PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {} + +void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock( + const Slice* next_key) { + // Use == to send the request only once + if (filters_in_partition_ == filters_per_partition_) { + // Currently only index builder is in charge of cutting a partition. We keep + // requesting until it is granted. + p_index_builder_->RequestPartitionCut(); + } + if (!p_index_builder_->ShouldCutFilterBlock()) { + return; + } + filter_gc.push_back(std::unique_ptr(nullptr)); + + // Add the prefix of the next key before finishing the partition. This hack, + // fixes a bug with format_verison=3 where seeking for the prefix would lead + // us to the previous partition. 
+ const bool add_prefix = + next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key); + if (add_prefix) { + FullFilterBlockBuilder::AddPrefix(*next_key); + } + + Slice filter = filter_bits_builder_->Finish(&filter_gc.back()); + std::string& index_key = p_index_builder_->GetPartitionKey(); + filters.push_back({index_key, filter}); + filters_in_partition_ = 0; + Reset(); +} + +void PartitionedFilterBlockBuilder::Add(const Slice& key) { + MaybeCutAFilterBlock(&key); + FullFilterBlockBuilder::Add(key); +} + +void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { + filter_bits_builder_->AddKey(key); + filters_in_partition_++; + num_added_++; +} + +Slice PartitionedFilterBlockBuilder::Finish( + const BlockHandle& last_partition_block_handle, Status* status) { + if (finishing_filters == true) { + // Record the handle of the last written filter block in the index + FilterEntry& last_entry = filters.front(); + std::string handle_encoding; + last_partition_block_handle.EncodeTo(&handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64( + &handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_on_filter_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); + if (!p_index_builder_->seperator_is_key_plus_seq()) { + index_on_filter_block_builder_without_seq_.Add( + ExtractUserKey(last_entry.key), handle_encoding, + &handle_delta_encoding_slice); + } + filters.pop_front(); + } else { + MaybeCutAFilterBlock(nullptr); + } + // If there is no filter partition left, then return the index on filter + // partitions + if (UNLIKELY(filters.empty())) { + *status = Status::OK(); + if (finishing_filters) { + if (p_index_builder_->seperator_is_key_plus_seq()) { + return index_on_filter_block_builder_.Finish(); + } else { + return index_on_filter_block_builder_without_seq_.Finish(); + } + } else { + // This is the rare case where no key was added to the filter + return Slice(); + } + } else { + // Return the next filter partition in line and set Incomplete() status to + // indicate we expect more calls to Finish + *status = Status::Incomplete(); + finishing_filters = true; + return filters.front().filter; + } +} + +PartitionedFilterBlockReader::PartitionedFilterBlockReader( + const BlockBasedTable* t, CachableEntry&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) {} + +std::unique_ptr PartitionedFilterBlockReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + use_cache, nullptr /* get_context */, + lookup_context, &filter_block); + if (!s.ok()) { + return std::unique_ptr(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr( + new PartitionedFilterBlockReader(table, std::move(filter_block))); +} + +bool PartitionedFilterBlockReader::KeyMayMatch( + const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context) { + assert(const_ikey_ptr != nullptr); + 
assert(block_offset == kNotValid); + if (!whole_key_filtering()) { + return true; + } + + return MayMatch(key, prefix_extractor, block_offset, no_io, const_ikey_ptr, + get_context, lookup_context, + &FullFilterBlockReader::KeyMayMatch); +} + +bool PartitionedFilterBlockReader::PrefixMayMatch( + const Slice& prefix, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context) { +#ifdef NDEBUG + (void)block_offset; +#endif + assert(const_ikey_ptr != nullptr); + assert(block_offset == kNotValid); + if (!table_prefix_extractor() && !prefix_extractor) { + return true; + } + + return MayMatch(prefix, prefix_extractor, block_offset, no_io, const_ikey_ptr, + get_context, lookup_context, + &FullFilterBlockReader::PrefixMayMatch); +} + +BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( + const CachableEntry& filter_block, const Slice& entry) const { + IndexBlockIter iter; + const InternalKeyComparator* const comparator = internal_comparator(); + Statistics* kNullStats = nullptr; + filter_block.GetValue()->NewIndexIterator( + comparator, comparator->user_comparator(), &iter, kNullStats, + true /* total_order_seek */, false /* have_first_key */, + index_key_includes_seq(), index_value_is_full()); + iter.Seek(entry); + if (UNLIKELY(!iter.Valid())) { + // entry is larger than all the keys. However its prefix might still be + // present in the last partition. If this is called by PrefixMayMatch this + // is necessary for correct behavior. Otherwise it is unnecessary but safe. + // Assuming this is an unlikely case for full key search, the performance + // overhead should be negligible. + iter.SeekToLast(); + } + assert(iter.Valid()); + BlockHandle fltr_blk_handle = iter.value().handle; + return fltr_blk_handle; +} + +Status PartitionedFilterBlockReader::GetFilterPartitionBlock( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& fltr_blk_handle, + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const { + assert(table()); + assert(filter_block); + assert(filter_block->IsEmpty()); + + if (!filter_map_.empty()) { + auto iter = filter_map_.find(fltr_blk_handle.offset()); + // This is a possible scenario since block cache might not have had space + // for the partition + if (iter != filter_map_.end()) { + filter_block->SetUnownedValue(iter->second.GetValue()); + return Status::OK(); + } + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + const Status s = + table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle, + UncompressionDict::GetEmptyDict(), filter_block, + BlockType::kFilter, get_context, lookup_context, + /* for_compaction */ false, /* use_cache */ true); + + return s; +} + +bool PartitionedFilterBlockReader::MayMatch( + const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + FilterFunction filter_function) const { + CachableEntry filter_block; + Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (UNLIKELY(!s.ok())) { + return true; + } + + if (UNLIKELY(filter_block.GetValue()->size() == 0)) { + return true; + } + + auto filter_handle = GetFilterPartitionHandle(filter_block, *const_ikey_ptr); + if (UNLIKELY(filter_handle.size() == 0)) { // key is out of 
range + return false; + } + + CachableEntry filter_partition_block; + s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle, + no_io, get_context, lookup_context, + &filter_partition_block); + if (UNLIKELY(!s.ok())) { + return true; + } + + FullFilterBlockReader filter_partition(table(), + std::move(filter_partition_block)); + return (filter_partition.*filter_function)( + slice, prefix_extractor, block_offset, no_io, const_ikey_ptr, get_context, + lookup_context); +} + +size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { + size_t usage = ApproximateFilterBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + // TODO(myabandeh): better estimation for filter_map_ size +} + +// TODO(myabandeh): merge this with the same function in IndexReader +void PartitionedFilterBlockReader::CacheDependencies(bool pin) { + assert(table()); + + const BlockBasedTable::Rep* const rep = table()->get_rep(); + assert(rep); + + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + + CachableEntry filter_block; + + Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, + &lookup_context, &filter_block); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Error retrieving top-level filter block while trying to " + "cache filter partitions: %s", + s.ToString().c_str()); + return; + } + + // Before read partitions, prefetch them to avoid lots of IOs + assert(filter_block.GetValue()); + + IndexBlockIter biter; + const InternalKeyComparator* const comparator = internal_comparator(); + Statistics* kNullStats = nullptr; + filter_block.GetValue()->NewIndexIterator( + comparator, comparator->user_comparator(), &biter, kNullStats, + true /* total_order_seek */, false /* have_first_key */, + index_key_includes_seq(), index_value_is_full()); + // Index partitions are assumed to be consecuitive. Prefetch them all. 
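A hedged, standalone sketch of the prefetch-range computation performed next: because the filter partitions are written consecutively, the first and last partition handles bound one contiguous byte range, so a single prefetch of that range warms every partition. Handle, PartitionPrefetchRange and the trailer value are illustrative names only, not the RocksDB types.

#include <cstdint>
#include <iostream>
#include <vector>

struct Handle {
  uint64_t offset;
  uint64_t size;
};

// Compute the single contiguous range covering all partitions, assuming they
// were written back to back in file order.
bool PartitionPrefetchRange(const std::vector<Handle>& partitions,
                            uint64_t block_trailer_size, uint64_t* off,
                            uint64_t* len) {
  if (partitions.empty()) {
    return false;
  }
  const Handle& first = partitions.front();
  const Handle& last = partitions.back();
  *off = first.offset;
  *len = last.offset + last.size + block_trailer_size - first.offset;
  return true;
}

int main() {
  std::vector<Handle> parts = {{1000, 200}, {1205, 180}, {1390, 150}};
  uint64_t off = 0, len = 0;
  if (PartitionPrefetchRange(parts, /*block_trailer_size=*/5, &off, &len)) {
    std::cout << "prefetch offset=" << off << " len=" << len << "\n";
  }
}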
+ // Read the first block offset + biter.SeekToFirst(); + BlockHandle handle = biter.value().handle; + uint64_t prefetch_off = handle.offset(); + + // Read the last block's offset + biter.SeekToLast(); + handle = biter.value().handle; + uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; + uint64_t prefetch_len = last_off - prefetch_off; + std::unique_ptr prefetch_buffer; + + prefetch_buffer.reset(new FilePrefetchBuffer()); + s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off, + static_cast(prefetch_len)); + + // After prefetch, read the partitions one by one + ReadOptions read_options; + for (biter.SeekToFirst(); biter.Valid(); biter.Next()) { + handle = biter.value().handle; + + CachableEntry block; + // TODO: Support counter batch update for partitioned index and + // filter blocks + s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), read_options, handle, + UncompressionDict::GetEmptyDict(), &block, BlockType::kFilter, + nullptr /* get_context */, &lookup_context, nullptr /* contents */); + + assert(s.ok() || block.GetValue() == nullptr); + if (s.ok() && block.GetValue() != nullptr) { + if (block.IsCached()) { + if (pin) { + filter_map_[handle.offset()] = std::move(block); + } + } + } + } +} + +const InternalKeyComparator* PartitionedFilterBlockReader::internal_comparator() + const { + assert(table()); + assert(table()->get_rep()); + + return &table()->get_rep()->internal_comparator; +} + +bool PartitionedFilterBlockReader::index_key_includes_seq() const { + assert(table()); + assert(table()->get_rep()); + + return table()->get_rep()->index_key_includes_seq; +} + +bool PartitionedFilterBlockReader::index_value_is_full() const { + assert(table()); + assert(table()->get_rep()); + + return table()->get_rep()->index_value_is_full; +} + +} // namespace rocksdb diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h new file mode 100644 index 00000000000..089773d4751 --- /dev/null +++ b/table/block_based/partitioned_filter_block.h @@ -0,0 +1,126 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include +#include +#include +#include "db/dbformat.h" +#include "index_builder.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/block_based/block.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/block_based/full_filter_block.h" +#include "util/autovector.h" + +namespace rocksdb { + +class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { + public: + explicit PartitionedFilterBlockBuilder( + const SliceTransform* prefix_extractor, bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, + PartitionedIndexBuilder* const p_index_builder, + const uint32_t partition_size); + + virtual ~PartitionedFilterBlockBuilder(); + + void AddKey(const Slice& key) override; + void Add(const Slice& key) override; + + size_t NumAdded() const override { return num_added_; } + + virtual Slice Finish(const BlockHandle& last_partition_block_handle, + Status* status) override; + + private: + // Filter data + BlockBuilder index_on_filter_block_builder_; // top-level index builder + BlockBuilder + index_on_filter_block_builder_without_seq_; // same for user keys + struct FilterEntry { + std::string key; + Slice filter; + }; + std::list filters; // list of partitioned indexes and their keys + std::unique_ptr value; + std::vector> filter_gc; + bool finishing_filters = + false; // true if Finish is called once but not complete yet. + // The policy of when cut a filter block and Finish it + void MaybeCutAFilterBlock(const Slice* next_key); + // Currently we keep the same number of partitions for filters and indexes. + // This would allow for some potentioal optimizations in future. 
If such + // optimizations did not realize we can use different number of partitions and + // eliminate p_index_builder_ + PartitionedIndexBuilder* const p_index_builder_; + // The desired number of filters per partition + uint32_t filters_per_partition_; + // The current number of filters in the last partition + uint32_t filters_in_partition_; + // Number of keys added + size_t num_added_; + BlockHandle last_encoded_handle_; +}; + +class PartitionedFilterBlockReader : public FilterBlockReaderCommon { + public: + PartitionedFilterBlockReader(const BlockBasedTable* t, + CachableEntry&& filter_block); + + static std::unique_ptr Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); + + bool IsBlockBased() override { return false; } + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + size_t ApproximateMemoryUsage() const override; + + private: + BlockHandle GetFilterPartitionHandle(const CachableEntry& filter_block, + const Slice& entry) const; + Status GetFilterPartitionBlock( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& handle, + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const; + + using FilterFunction = bool (FullFilterBlockReader::*)( + const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context); + bool MayMatch(const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + FilterFunction filter_function) const; + void CacheDependencies(bool pin) override; + + const InternalKeyComparator* internal_comparator() const; + bool index_key_includes_seq() const; + bool index_value_is_full() const; + + protected: + std::unordered_map> + filter_map_; +}; + +} // namespace rocksdb diff --git a/table/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc similarity index 60% rename from table/partitioned_filter_block_test.cc rename to table/block_based/partitioned_filter_block_test.cc index 8068f14d815..315789f55b7 100644 --- a/table/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -7,46 +7,50 @@ #include "rocksdb/filter_policy.h" -#include "table/full_filter_bits_builder.h" -#include "table/index_builder.h" -#include "table/partitioned_filter_block.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/partitioned_filter_block.h" +#include "table/block_based/filter_policy_internal.h" + +#include "index_builder.h" +#include "logging/logging.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/hash.h" -#include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { -std::map slices; +std::map blooms; class 
MockedBlockBasedTable : public BlockBasedTable { public: - explicit MockedBlockBasedTable(Rep* rep) : BlockBasedTable(rep) { + MockedBlockBasedTable(Rep* rep, PartitionedIndexBuilder* pib) + : BlockBasedTable(rep, /*block_cache_tracer=*/nullptr) { // Initialize what Open normally does as much as necessary for the test - rep->cache_key_prefix_size = 10; - } - - CachableEntry GetFilter( - FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, - const bool /* unused */, bool /* unused */, GetContext* /* unused */, - const SliceTransform* prefix_extractor) const override { - Slice slice = slices[filter_blk_handle.offset()]; - auto obj = new FullFilterBlockReader( - prefix_extractor, true, BlockContents(slice), - rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); - return {obj, nullptr}; + rep->index_key_includes_seq = pib->seperator_is_key_plus_seq(); + rep->index_value_is_full = !pib->get_use_value_delta_encoding(); } +}; - FilterBlockReader* ReadFilter( - FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, - const bool /* unused */, - const SliceTransform* prefix_extractor) const override { - Slice slice = slices[filter_blk_handle.offset()]; - auto obj = new FullFilterBlockReader( - prefix_extractor, true, BlockContents(slice), - rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); - return obj; +class MyPartitionedFilterBlockReader : public PartitionedFilterBlockReader { + public: + MyPartitionedFilterBlockReader(BlockBasedTable* t, + CachableEntry&& filter_block) + : PartitionedFilterBlockReader(t, std::move(filter_block)) { + for (const auto& pair : blooms) { + const uint64_t offset = pair.first; + const std::string& bloom = pair.second; + + assert(t); + assert(t->get_rep()); + CachableEntry block( + new ParsedFullFilterBlock( + t->get_rep()->table_options.filter_policy.get(), + BlockContents(Slice(bloom))), + nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + filter_map_[offset] = std::move(block); + } } }; @@ -54,19 +58,26 @@ class PartitionedFilterBlockTest : public testing::Test, virtual public ::testing::WithParamInterface { public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; BlockBasedTableOptions table_options_; - InternalKeyComparator icomp = InternalKeyComparator(BytewiseComparator()); - - PartitionedFilterBlockTest() { - table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); - table_options_.no_block_cache = true; // Otherwise BlockBasedTable::Close - // will access variable that are not - // initialized in our mocked version + InternalKeyComparator icomp_; + std::unique_ptr table_; + std::shared_ptr cache_; + int bits_per_key_; + + PartitionedFilterBlockTest() + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator), + bits_per_key_(10) { + table_options_.filter_policy.reset( + NewBloomFilterPolicy(bits_per_key_, false)); table_options_.format_version = GetParam(); table_options_.index_block_restart_interval = 3; } - std::shared_ptr cache_; ~PartitionedFilterBlockTest() override {} const std::string keys[4] = {"afoo", "bar", "box", "hello"}; @@ -83,22 +94,15 @@ class PartitionedFilterBlockTest } uint64_t MaxFilterSize() { - uint32_t dont_care1, dont_care2; int num_keys = sizeof(keys) / sizeof(*keys); - auto filter_bits_reader = dynamic_cast( - table_options_.filter_policy->GetFilterBitsBuilder()); - assert(filter_bits_reader); - auto partition_size = - filter_bits_reader->CalculateSpace(num_keys, &dont_care1, &dont_care2); - 
delete filter_bits_reader; - return partition_size + - partition_size * table_options_.block_size_deviation / 100; + // General, rough over-approximation + return num_keys * bits_per_key_ + (CACHE_LINE_SIZE * 8 + /*metadata*/ 5); } - int last_offset = 10; + uint64_t last_offset = 10; BlockHandle Write(const Slice& slice) { BlockHandle bh(last_offset + 1, slice.size()); - slices[bh.offset()] = slice; + blooms[bh.offset()] = slice.ToString(); last_offset += bh.size(); return bh; } @@ -106,7 +110,7 @@ class PartitionedFilterBlockTest PartitionedIndexBuilder* NewIndexBuilder() { const bool kValueDeltaEncoded = true; return PartitionedIndexBuilder::CreateIndexBuilder( - &icomp, !kValueDeltaEncoded, table_options_); + &icomp_, !kValueDeltaEncoded, table_options_); } PartitionedFilterBlockBuilder* NewBuilder( @@ -122,16 +126,14 @@ class PartitionedFilterBlockTest const bool kValueDeltaEncoded = true; return new PartitionedFilterBlockBuilder( prefix_extractor, table_options_.whole_key_filtering, - table_options_.filter_policy->GetFilterBitsBuilder(), + BloomFilterPolicy::GetBuilderFromContext( + FilterBuildingContext(table_options_)), table_options_.index_block_restart_interval, !kValueDeltaEncoded, p_index_builder, partition_size); } - std::unique_ptr table; - PartitionedFilterBlockReader* NewReader( - PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib, - const SliceTransform* prefix_extractor) { + PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib) { BlockHandle bh; Status status; Slice slice; @@ -139,19 +141,21 @@ class PartitionedFilterBlockTest slice = builder->Finish(bh, &status); bh = Write(slice); } while (status.IsIncomplete()); - const Options options; - const ImmutableCFOptions ioptions(options); - const MutableCFOptions moptions(options); - const EnvOptions env_options; - const bool kSkipFilters = true; - const bool kImmortal = true; - table.reset(new MockedBlockBasedTable( - new BlockBasedTable::Rep(ioptions, env_options, table_options_, icomp, - !kSkipFilters, 0, !kImmortal))); - auto reader = new PartitionedFilterBlockReader( - prefix_extractor, true, BlockContents(slice), nullptr, nullptr, icomp, - table.get(), pib->seperator_is_key_plus_seq(), - !pib->get_use_value_delta_encoding()); + + constexpr bool skip_filters = false; + constexpr int level = 0; + constexpr bool immortal_table = false; + table_.reset(new MockedBlockBasedTable( + new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, + icomp_, skip_filters, level, immortal_table), + pib)); + BlockContents contents(slice); + CachableEntry block( + new Block(std::move(contents), kDisableGlobalSequenceNumber, + 0 /* read_amp_bytes_per_bit */, nullptr), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + auto reader = + new MyPartitionedFilterBlockReader(table_.get(), std::move(block)); return reader; } @@ -159,33 +163,37 @@ class PartitionedFilterBlockTest PartitionedIndexBuilder* pib, bool empty = false, const SliceTransform* prefix_extractor = nullptr) { std::unique_ptr reader( - NewReader(builder, pib, prefix_extractor)); + NewReader(builder, pib)); // Querying added keys const bool no_io = true; for (auto key : keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, !no_io, - &ikey_slice)); + &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } { // querying a key twice auto ikey = InternalKey(keys[0], 0, 
ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); - ASSERT_TRUE(reader->KeyMayMatch(keys[0], prefix_extractor, kNotValid, - !no_io, &ikey_slice)); + ASSERT_TRUE(reader->KeyMayMatch( + keys[0], prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); } // querying missing keys for (auto key : missing_keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); if (empty) { - ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, - !no_io, &ikey_slice)); + ASSERT_TRUE(reader->KeyMayMatch( + key, prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); } else { // assuming a good hash function - ASSERT_FALSE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, - !no_io, &ikey_slice)); + ASSERT_FALSE(reader->KeyMayMatch( + key, prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); } } } @@ -321,7 +329,7 @@ TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { std::unique_ptr pib(NewIndexBuilder()); std::unique_ptr builder( NewBuilder(pib.get(), prefix_extractor.get())); - const std::string pkeys[3] = {"p-key1", "p-key2", "p-key3"}; + const std::string pkeys[3] = {"p-key10", "p-key20", "p-key30"}; builder->Add(pkeys[0]); CutABlock(pib.get(), pkeys[0], pkeys[1]); builder->Add(pkeys[1]); @@ -329,13 +337,62 @@ TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { builder->Add(pkeys[2]); CutABlock(pib.get(), pkeys[2]); std::unique_ptr reader( - NewReader(builder.get(), pib.get(), prefix_extractor.get())); + NewReader(builder.get(), pib.get())); for (auto key : pkeys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); - ASSERT_TRUE(reader->PrefixMayMatch(prefix_extractor->Transform(key), - prefix_extractor.get(), kNotValid, - false /*no_io*/, &ikey_slice)); + ASSERT_TRUE(reader->PrefixMayMatch( + prefix_extractor->Transform(key), prefix_extractor.get(), kNotValid, + /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + } + // Non-existent keys but with the same prefix + const std::string pnonkeys[4] = {"p-key9", "p-key11", "p-key21", "p-key31"}; + for (auto key : pnonkeys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->PrefixMayMatch( + prefix_extractor->Transform(key), prefix_extractor.get(), kNotValid, + /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + } +} + +// This reproduces the bug in format_version=3 that the seeking the prefix will +// lead us to the partition before the one that has filter for the prefix. +TEST_P(PartitionedFilterBlockTest, PrefixInWrongPartitionBug) { + // some small number to cause partition cuts + table_options_.metadata_block_size = 1; + std::unique_ptr prefix_extractor( + rocksdb::NewFixedPrefixTransform(2)); + std::unique_ptr pib(NewIndexBuilder()); + std::unique_ptr builder( + NewBuilder(pib.get(), prefix_extractor.get())); + // In the bug, searching for prefix "p3" on an index with format version 3, + // will give the key "p3" and the partition of the keys that are <= p3, i.e., + // p2-keys, where the filter for prefix "p3" does not exist. 
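A toy, self-contained model of the bug this test reproduces and of the fix in MaybeCutAFilterBlock: seeking the bare prefix selects the partition whose separator is greater than or equal to the prefix, which can be the partition before the one that holds that prefix's keys; the fix is to also add the next key's prefix to the partition being cut. The map/set stand-ins below are illustrative only, not the real filter or index structures.

#include <iostream>
#include <iterator>
#include <map>
#include <set>
#include <string>

// Each partition is keyed in a top-level index by its separator (>= every key
// in it) and holds a set of keys/prefixes standing in for its Bloom filter.
using Partition = std::set<std::string>;

bool PrefixMayMatch(const std::map<std::string, Partition>& index,
                    const std::string& prefix) {
  // Seek: the first separator >= prefix selects the partition to consult.
  auto it = index.lower_bound(prefix);
  if (it == index.end()) {
    it = std::prev(index.end());  // past the last separator: check last one
  }
  return it->second.count(prefix) > 0;
}

int main() {
  std::map<std::string, Partition> index;
  // Partition ending at "p2-key2" gets separator "p3" (shortened toward the
  // next key "p3-key3"). Without the fix it only knows p1/p2 prefixes.
  index["p3"] = {"p1", "p2", "p1-key1", "p2-key2"};
  index["p5-key3"] = {"p3", "p4", "p5", "p3-key3", "p4-key3", "p5-key3"};

  std::cout << std::boolalpha
            << PrefixMayMatch(index, "p3") << "\n";  // false: wrong partition

  // The fix: before cutting a partition, also add the next key's prefix, so
  // the partition the seek lands on can answer for that prefix as well.
  index["p3"].insert("p3");
  std::cout << PrefixMayMatch(index, "p3") << "\n";  // true
}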
+ const std::string pkeys[] = {"p1-key1", "p2-key2", "p3-key3", "p4-key3", + "p5-key3"}; + builder->Add(pkeys[0]); + CutABlock(pib.get(), pkeys[0], pkeys[1]); + builder->Add(pkeys[1]); + CutABlock(pib.get(), pkeys[1], pkeys[2]); + builder->Add(pkeys[2]); + CutABlock(pib.get(), pkeys[2], pkeys[3]); + builder->Add(pkeys[3]); + CutABlock(pib.get(), pkeys[3], pkeys[4]); + builder->Add(pkeys[4]); + CutABlock(pib.get(), pkeys[4]); + std::unique_ptr reader( + NewReader(builder.get(), pib.get())); + for (auto key : pkeys) { + auto prefix = prefix_extractor->Transform(key); + auto ikey = InternalKey(prefix, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->PrefixMayMatch( + prefix, prefix_extractor.get(), kNotValid, + /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } } diff --git a/table/block_based/uncompression_dict_reader.cc b/table/block_based/uncompression_dict_reader.cc new file mode 100644 index 00000000000..10ceab16f22 --- /dev/null +++ b/table/block_based/uncompression_dict_reader.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "table/block_based/uncompression_dict_reader.h" +#include "monitoring/perf_context_imp.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/compression.h" + +namespace rocksdb { + +Status UncompressionDictReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* uncompression_dict_reader) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(uncompression_dict_reader); + + CachableEntry uncompression_dict; + if (prefetch || !use_cache) { + const Status s = ReadUncompressionDictionary( + table, prefetch_buffer, ReadOptions(), use_cache, + nullptr /* get_context */, lookup_context, &uncompression_dict); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + uncompression_dict.Reset(); + } + } + + uncompression_dict_reader->reset( + new UncompressionDictReader(table, std::move(uncompression_dict))); + + return Status::OK(); +} + +Status UncompressionDictReader::ReadUncompressionDictionary( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* uncompression_dict) { + // TODO: add perf counter for compression dictionary read time + + assert(table); + assert(uncompression_dict); + assert(uncompression_dict->IsEmpty()); + + const BlockBasedTable::Rep* const rep = table->get_rep(); + assert(rep); + assert(!rep->compression_dict_handle.IsNull()); + + const Status s = table->RetrieveBlock( + prefetch_buffer, read_options, rep->compression_dict_handle, + UncompressionDict::GetEmptyDict(), uncompression_dict, + BlockType::kCompressionDictionary, get_context, lookup_context, + /* for_compaction */ false, use_cache); + + if (!s.ok()) { + ROCKS_LOG_WARN( + rep->ioptions.info_log, + "Encountered error while reading data from compression dictionary " + "block %s", + s.ToString().c_str()); + } + + return s; +} + +Status UncompressionDictReader::GetOrReadUncompressionDictionary( + FilePrefetchBuffer* prefetch_buffer, 
bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* uncompression_dict) const { + assert(uncompression_dict); + + if (!uncompression_dict_.IsEmpty()) { + uncompression_dict->SetUnownedValue(uncompression_dict_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadUncompressionDictionary(table_, prefetch_buffer, read_options, + cache_dictionary_blocks(), get_context, + lookup_context, uncompression_dict); +} + +size_t UncompressionDictReader::ApproximateMemoryUsage() const { + assert(!uncompression_dict_.GetOwnValue() || + uncompression_dict_.GetValue() != nullptr); + size_t usage = uncompression_dict_.GetOwnValue() + ? uncompression_dict_.GetValue()->ApproximateMemoryUsage() + : 0; + +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + + return usage; +} + +bool UncompressionDictReader::cache_dictionary_blocks() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->table_options.cache_index_and_filter_blocks; +} + +} // namespace rocksdb diff --git a/table/block_based/uncompression_dict_reader.h b/table/block_based/uncompression_dict_reader.h new file mode 100644 index 00000000000..bfaf0b4bc70 --- /dev/null +++ b/table/block_based/uncompression_dict_reader.h @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include +#include "table/block_based/cachable_entry.h" +#include "table/format.h" + +namespace rocksdb { + +class BlockBasedTable; +struct BlockCacheLookupContext; +class FilePrefetchBuffer; +class GetContext; +struct ReadOptions; +struct UncompressionDict; + +// Provides access to the uncompression dictionary regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. 
+class UncompressionDictReader { + public: + static Status Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* uncompression_dict_reader); + + Status GetOrReadUncompressionDictionary( + FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* uncompression_dict) const; + + size_t ApproximateMemoryUsage() const; + + private: + UncompressionDictReader(const BlockBasedTable* t, + CachableEntry&& uncompression_dict) + : table_(t), uncompression_dict_(std::move(uncompression_dict)) { + assert(table_); + } + + bool cache_dictionary_blocks() const; + + static Status ReadUncompressionDictionary( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* uncompression_dict); + + const BlockBasedTable* table_; + CachableEntry uncompression_dict_; +}; + +} // namespace rocksdb diff --git a/table/block_based_filter_block_test.cc b/table/block_based_filter_block_test.cc deleted file mode 100644 index 6b352b2f6b0..00000000000 --- a/table/block_based_filter_block_test.cc +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// -// Copyright (c) 2012 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#include "table/block_based_filter_block.h" - -#include "rocksdb/filter_policy.h" -#include "util/coding.h" -#include "util/hash.h" -#include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace rocksdb { - -// For testing: emit an array with one hash value per key -class TestHashFilter : public FilterPolicy { - public: - const char* Name() const override { return "TestHashFilter"; } - - void CreateFilter(const Slice* keys, int n, std::string* dst) const override { - for (int i = 0; i < n; i++) { - uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); - PutFixed32(dst, h); - } - } - - bool KeyMayMatch(const Slice& key, const Slice& filter) const override { - uint32_t h = Hash(key.data(), key.size(), 1); - for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { - if (h == DecodeFixed32(filter.data() + i)) { - return true; - } - } - return false; - } -}; - -class FilterBlockTest : public testing::Test { - public: - TestHashFilter policy_; - BlockBasedTableOptions table_options_; - - FilterBlockTest() { - table_options_.filter_policy.reset(new TestHashFilter()); - } -}; - -TEST_F(FilterBlockTest, EmptyBuilder) { - BlockBasedFilterBlockBuilder builder(nullptr, table_options_); - BlockContents block(builder.Finish()); - ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); - BlockBasedFilterBlockReader reader(nullptr, table_options_, true, - std::move(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, uint64_t{0})); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100000)); -} - -TEST_F(FilterBlockTest, SingleChunk) { - BlockBasedFilterBlockBuilder builder(nullptr, table_options_); - ASSERT_EQ(0, builder.NumAdded()); - builder.StartBlock(100); - builder.Add("foo"); - builder.Add("bar"); - builder.Add("box"); - builder.StartBlock(200); - builder.Add("box"); - builder.StartBlock(300); - builder.Add("hello"); - ASSERT_EQ(5, builder.NumAdded()); - BlockContents block(builder.Finish()); - BlockBasedFilterBlockReader reader(nullptr, table_options_, true, - std::move(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100)); - ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr, 100)); - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 100)); - ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr, 100)); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100)); - ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr, 100)); - ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr, 100)); -} - -TEST_F(FilterBlockTest, MultiChunk) { - BlockBasedFilterBlockBuilder builder(nullptr, table_options_); - - // First filter - builder.StartBlock(0); - builder.Add("foo"); - builder.StartBlock(2000); - builder.Add("bar"); - - // Second filter - builder.StartBlock(3100); - builder.Add("box"); - - // Third filter is empty - - // Last filter - builder.StartBlock(9000); - builder.Add("box"); - builder.Add("hello"); - - BlockContents block(builder.Finish()); - BlockBasedFilterBlockReader reader(nullptr, table_options_, true, - std::move(block), nullptr); - - // Check first filter - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, uint64_t{0})); - ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr, 2000)); - ASSERT_TRUE(!reader.KeyMayMatch("box", nullptr, uint64_t{0})); - ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, uint64_t{0})); - - // Check second filter - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 3100)); - ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 3100)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 3100)); - 
ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, 3100)); - - // Check third filter (empty) - ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("box", nullptr, 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, 4100)); - - // Check last filter - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 9000)); - ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr, 9000)); - ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 9000)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 9000)); -} - -// Test for block based filter block -// use new interface in FilterPolicy to create filter builder/reader -class BlockBasedFilterBlockTest : public testing::Test { - public: - BlockBasedTableOptions table_options_; - - BlockBasedFilterBlockTest() { - table_options_.filter_policy.reset(NewBloomFilterPolicy(10)); - } - - ~BlockBasedFilterBlockTest() override {} -}; - -TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { - FilterBlockBuilder* builder = - new BlockBasedFilterBlockBuilder(nullptr, table_options_); - BlockContents block(builder->Finish()); - ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); - FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, true, std::move(block), nullptr); - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, uint64_t{0})); - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100000)); - - delete builder; - delete reader; -} - -TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { - FilterBlockBuilder* builder = - new BlockBasedFilterBlockBuilder(nullptr, table_options_); - builder->StartBlock(100); - builder->Add("foo"); - builder->Add("bar"); - builder->Add("box"); - builder->StartBlock(200); - builder->Add("box"); - builder->StartBlock(300); - builder->Add("hello"); - BlockContents block(builder->Finish()); - FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, true, std::move(block), nullptr); - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100)); - ASSERT_TRUE(reader->KeyMayMatch("bar", nullptr, 100)); - ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 100)); - ASSERT_TRUE(reader->KeyMayMatch("hello", nullptr, 100)); - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100)); - ASSERT_TRUE(!reader->KeyMayMatch("missing", nullptr, 100)); - ASSERT_TRUE(!reader->KeyMayMatch("other", nullptr, 100)); - - delete builder; - delete reader; -} - -TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { - FilterBlockBuilder* builder = - new BlockBasedFilterBlockBuilder(nullptr, table_options_); - - // First filter - builder->StartBlock(0); - builder->Add("foo"); - builder->StartBlock(2000); - builder->Add("bar"); - - // Second filter - builder->StartBlock(3100); - builder->Add("box"); - - // Third filter is empty - - // Last filter - builder->StartBlock(9000); - builder->Add("box"); - builder->Add("hello"); - - BlockContents block(builder->Finish()); - FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, true, std::move(block), nullptr); - - // Check first filter - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, uint64_t{0})); - ASSERT_TRUE(reader->KeyMayMatch("bar", nullptr, 2000)); - ASSERT_TRUE(!reader->KeyMayMatch("box", nullptr, uint64_t{0})); - ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, uint64_t{0})); - - // Check second filter - ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 3100)); - ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 3100)); - 
ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 3100)); - ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, 3100)); - - // Check third filter (empty) - ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("box", nullptr, 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, 4100)); - - // Check last filter - ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 9000)); - ASSERT_TRUE(reader->KeyMayMatch("hello", nullptr, 9000)); - ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 9000)); - ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 9000)); - - delete builder; - delete reader; -} - -} // namespace rocksdb - -int main(int argc, char** argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc deleted file mode 100644 index ad088337a19..00000000000 --- a/table/block_based_table_reader.cc +++ /dev/null @@ -1,3700 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "table/block_based_table_reader.h" - -#include -#include -#include -#include -#include -#include - -#include "db/dbformat.h" -#include "db/pinned_iterators_manager.h" - -#include "rocksdb/cache.h" -#include "rocksdb/comparator.h" -#include "rocksdb/env.h" -#include "rocksdb/filter_policy.h" -#include "rocksdb/iterator.h" -#include "rocksdb/options.h" -#include "rocksdb/statistics.h" -#include "rocksdb/table.h" -#include "rocksdb/table_properties.h" - -#include "table/block.h" -#include "table/block_based_filter_block.h" -#include "table/block_based_table_factory.h" -#include "table/block_fetcher.h" -#include "table/block_prefix_index.h" -#include "table/filter_block.h" -#include "table/format.h" -#include "table/full_filter_block.h" -#include "table/get_context.h" -#include "table/internal_iterator.h" -#include "table/meta_blocks.h" -#include "table/multiget_context.h" -#include "table/partitioned_filter_block.h" -#include "table/persistent_cache_helper.h" -#include "table/sst_file_writer_collectors.h" -#include "table/two_level_iterator.h" - -#include "monitoring/perf_context_imp.h" -#include "util/coding.h" -#include "util/crc32c.h" -#include "util/file_reader_writer.h" -#include "util/stop_watch.h" -#include "util/string_util.h" -#include "util/sync_point.h" -#include "util/xxhash.h" - -namespace rocksdb { - -extern const uint64_t kBlockBasedTableMagicNumber; -extern const std::string kHashIndexPrefixesBlock; -extern const std::string kHashIndexPrefixesMetadataBlock; - -typedef BlockBasedTable::IndexReader IndexReader; - -BlockBasedTable::~BlockBasedTable() { - Close(); - delete rep_; -} - -std::atomic BlockBasedTable::next_cache_key_id_(0); - -namespace { -// Read the block identified by "handle" from "file". -// The only relevant option is options.verify_checksums for now. -// On failure return non-OK. -// On success fill *result and return OK - caller owns *result -// @param uncompression_dict Data for presetting the compression library's -// dictionary. 
-Status ReadBlockFromFile( - RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, - const Footer& footer, const ReadOptions& options, const BlockHandle& handle, - std::unique_ptr* result, const ImmutableCFOptions& ioptions, - bool do_uncompress, bool maybe_compressed, - const UncompressionDict& uncompression_dict, - const PersistentCacheOptions& cache_options, SequenceNumber global_seqno, - size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator) { - BlockContents contents; - BlockFetcher block_fetcher(file, prefetch_buffer, footer, options, handle, - &contents, ioptions, do_uncompress, - maybe_compressed, uncompression_dict, - cache_options, memory_allocator); - Status s = block_fetcher.ReadBlockContents(); - if (s.ok()) { - result->reset(new Block(std::move(contents), global_seqno, - read_amp_bytes_per_bit, ioptions.statistics)); - } - - return s; -} - -inline MemoryAllocator* GetMemoryAllocator( - const BlockBasedTableOptions& table_options) { - return table_options.block_cache.get() - ? table_options.block_cache->memory_allocator() - : nullptr; -} - -inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( - const BlockBasedTableOptions& table_options) { - return table_options.block_cache_compressed.get() - ? table_options.block_cache_compressed->memory_allocator() - : nullptr; -} - -// Delete the resource that is held by the iterator. -template -void DeleteHeldResource(void* arg, void* /*ignored*/) { - delete reinterpret_cast(arg); -} - -// Delete the entry resided in the cache. -template -void DeleteCachedEntry(const Slice& /*key*/, void* value) { - auto entry = reinterpret_cast(value); - delete entry; -} - -void DeleteCachedFilterEntry(const Slice& key, void* value); -void DeleteCachedIndexEntry(const Slice& key, void* value); -void DeleteCachedUncompressionDictEntry(const Slice& key, void* value); - -// Release the cached entry and decrement its ref count. -void ReleaseCachedEntry(void* arg, void* h) { - Cache* cache = reinterpret_cast(arg); - Cache::Handle* handle = reinterpret_cast(h); - cache->Release(handle); -} - -// Release the cached entry and decrement its ref count. 
-void ForceReleaseCachedEntry(void* arg, void* h) { - Cache* cache = reinterpret_cast(arg); - Cache::Handle* handle = reinterpret_cast(h); - cache->Release(handle, true /* force_erase */); -} - -Slice GetCacheKeyFromOffset(const char* cache_key_prefix, - size_t cache_key_prefix_size, uint64_t offset, - char* cache_key) { - assert(cache_key != nullptr); - assert(cache_key_prefix_size != 0); - assert(cache_key_prefix_size <= BlockBasedTable::kMaxCacheKeyPrefixSize); - memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); - char* end = EncodeVarint64(cache_key + cache_key_prefix_size, offset); - return Slice(cache_key, static_cast(end - cache_key)); -} - -Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, - int level, Tickers block_cache_miss_ticker, - Tickers block_cache_hit_ticker, - uint64_t* block_cache_miss_stats, - uint64_t* block_cache_hit_stats, - Statistics* statistics, - GetContext* get_context) { - auto cache_handle = block_cache->Lookup(key, statistics); - if (cache_handle != nullptr) { - PERF_COUNTER_ADD(block_cache_hit_count, 1); - PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, - static_cast(level)); - if (get_context != nullptr) { - // overall cache hit - get_context->get_context_stats_.num_cache_hit++; - // total bytes read from cache - get_context->get_context_stats_.num_cache_bytes_read += - block_cache->GetUsage(cache_handle); - // block-type specific cache hit - (*block_cache_hit_stats)++; - } else { - // overall cache hit - RecordTick(statistics, BLOCK_CACHE_HIT); - // total bytes read from cache - RecordTick(statistics, BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(cache_handle)); - RecordTick(statistics, block_cache_hit_ticker); - } - } else { - PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, - static_cast(level)); - if (get_context != nullptr) { - // overall cache miss - get_context->get_context_stats_.num_cache_miss++; - // block-type specific cache miss - (*block_cache_miss_stats)++; - } else { - RecordTick(statistics, BLOCK_CACHE_MISS); - RecordTick(statistics, block_cache_miss_ticker); - } - } - - return cache_handle; -} - -// For hash based index, return true if prefix_extractor and -// prefix_extractor_block mismatch, false otherwise. This flag will be used -// as total_order_seek via NewIndexIterator -bool PrefixExtractorChanged(const TableProperties* table_properties, - const SliceTransform* prefix_extractor) { - // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set. - // Turn off hash index in prefix_extractor is not set; if prefix_extractor - // is set but prefix_extractor_block is not set, also disable hash index - if (prefix_extractor == nullptr || table_properties == nullptr || - table_properties->prefix_extractor_name.empty()) { - return true; - } - - // prefix_extractor and prefix_extractor_block are both non-empty - if (table_properties->prefix_extractor_name.compare( - prefix_extractor->Name()) != 0) { - return true; - } else { - return false; - } -} - -} // namespace - -// Index that allows binary search lookup in a two-level index structure. -class PartitionIndexReader : public IndexReader, public Cleanable { - public: - // Read the partition index from the file and create an instance for - // `PartitionIndexReader`. - // On success, index_reader will be populated; otherwise it will remain - // unmodified. 
- static Status Create(BlockBasedTable* table, RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, - const Footer& footer, const BlockHandle& index_handle, - const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icomparator, - IndexReader** index_reader, - const PersistentCacheOptions& cache_options, - const int level, const bool index_key_includes_seq, - const bool index_value_is_full, - MemoryAllocator* memory_allocator) { - std::unique_ptr index_block; - auto s = ReadBlockFromFile( - file, prefetch_buffer, footer, ReadOptions(), index_handle, - &index_block, ioptions, true /* decompress */, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, memory_allocator); - - if (s.ok()) { - *index_reader = new PartitionIndexReader( - table, icomparator, std::move(index_block), ioptions.statistics, - level, index_key_includes_seq, index_value_is_full); - } - - return s; - } - - // return a two-level iterator: first level is on the partition index - InternalIteratorBase* NewIterator( - IndexBlockIter* /*iter*/ = nullptr, bool /*dont_care*/ = true, - bool fill_cache = true) override { - Statistics* kNullStats = nullptr; - // Filters are already checked before seeking the index - if (!partition_map_.empty()) { - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - return NewTwoLevelIterator( - new BlockBasedTable::PartitionedIndexIteratorState( - table_, &partition_map_, index_key_includes_seq_, - index_value_is_full_), - index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), nullptr, - kNullStats, true, index_key_includes_seq_, index_value_is_full_)); - } else { - auto ro = ReadOptions(); - ro.fill_cache = fill_cache; - bool kIsIndex = true; - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - return new BlockBasedTableIterator( - table_, ro, *icomparator_, - index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), nullptr, - kNullStats, true, index_key_includes_seq_, index_value_is_full_), - false, true, /* prefix_extractor */ nullptr, kIsIndex, - index_key_includes_seq_, index_value_is_full_); - } - // TODO(myabandeh): Update TwoLevelIterator to be able to make use of - // on-stack BlockIter while the state is on heap. Currentlly it assumes - // the first level iter is always on heap and will attempt to delete it - // in its destructor. - } - - void CacheDependencies(bool pin) override { - // Before read partitions, prefetch them to avoid lots of IOs - auto rep = table_->rep_; - IndexBlockIter biter; - BlockHandle handle; - Statistics* kNullStats = nullptr; - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); - // Index partitions are assumed to be consecuitive. Prefetch them all. - // Read the first block offset - biter.SeekToFirst(); - if (!biter.Valid()) { - // Empty index. - return; - } - handle = biter.value(); - uint64_t prefetch_off = handle.offset(); - - // Read the last block's offset - biter.SeekToLast(); - if (!biter.Valid()) { - // Empty index. 
- return; - } - handle = biter.value(); - uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; - uint64_t prefetch_len = last_off - prefetch_off; - std::unique_ptr prefetch_buffer; - auto& file = table_->rep_->file; - prefetch_buffer.reset(new FilePrefetchBuffer()); - Status s = prefetch_buffer->Prefetch(file.get(), prefetch_off, - static_cast(prefetch_len)); - - // After prefetch, read the partitions one by one - biter.SeekToFirst(); - auto ro = ReadOptions(); - Cache* block_cache = rep->table_options.block_cache.get(); - for (; biter.Valid(); biter.Next()) { - handle = biter.value(); - BlockBasedTable::CachableEntry block; - const bool is_index = true; - // TODO: Support counter batch update for partitioned index and - // filter blocks - s = table_->MaybeReadBlockAndLoadToCache( - prefetch_buffer.get(), rep, ro, handle, - UncompressionDict::GetEmptyDict(), &block, is_index, - nullptr /* get_context */); - - assert(s.ok() || block.value == nullptr); - if (s.ok() && block.value != nullptr) { - if (block.cache_handle != nullptr) { - if (pin) { - partition_map_[handle.offset()] = block; - RegisterCleanup(&ReleaseCachedEntry, block_cache, - block.cache_handle); - } else { - block_cache->Release(block.cache_handle); - } - } else { - delete block.value; - } - } - } - } - - size_t size() const override { return index_block_->size(); } - size_t usable_size() const override { return index_block_->usable_size(); } - - size_t ApproximateMemoryUsage() const override { - assert(index_block_); - size_t usage = index_block_->ApproximateMemoryUsage(); -#ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); -#else - usage += sizeof(*this); -#endif // ROCKSDB_MALLOC_USABLE_SIZE - // TODO(myabandeh): more accurate estimate of partition_map_ mem usage - return usage; - } - - private: - PartitionIndexReader(BlockBasedTable* table, - const InternalKeyComparator* icomparator, - std::unique_ptr&& index_block, Statistics* stats, - const int /*level*/, const bool index_key_includes_seq, - const bool index_value_is_full) - : IndexReader(icomparator, stats), - table_(table), - index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq), - index_value_is_full_(index_value_is_full) { - assert(index_block_ != nullptr); - } - BlockBasedTable* table_; - std::unique_ptr index_block_; - std::unordered_map> - partition_map_; - const bool index_key_includes_seq_; - const bool index_value_is_full_; -}; - -// Index that allows binary search lookup for the first key of each block. -// This class can be viewed as a thin wrapper for `Block` class which already -// supports binary search. -class BinarySearchIndexReader : public IndexReader { - public: - // Read index from the file and create an intance for - // `BinarySearchIndexReader`. - // On success, index_reader will be populated; otherwise it will remain - // unmodified. 
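// Sketch: the prefetch-range arithmetic used by CacheDependencies() above.
// Index partitions are assumed to be laid out consecutively, so one read
// covering [first partition offset, end of last partition] warms them all.
// BlockHandleLite and kBlockTrailerBytes are illustrative stand-ins.
#include <cstdint>
#include <utility>

struct BlockHandleLite {
  uint64_t offset;
  uint64_t size;
};

// 1-byte compression type + 4-byte checksum, mirroring kBlockTrailerSize.
constexpr uint64_t kBlockTrailerBytes = 5;

// Returns {prefetch_offset, prefetch_length} covering every partition from
// `first` through `last`, assuming they are stored back to back in the file.
inline std::pair<uint64_t, uint64_t> PartitionPrefetchRange(
    const BlockHandleLite& first, const BlockHandleLite& last) {
  const uint64_t prefetch_off = first.offset;
  const uint64_t last_end = last.offset + last.size + kBlockTrailerBytes;
  return {prefetch_off, last_end - prefetch_off};
}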
- static Status Create(RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, - const Footer& footer, const BlockHandle& index_handle, - const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icomparator, - IndexReader** index_reader, - const PersistentCacheOptions& cache_options, - const bool index_key_includes_seq, - const bool index_value_is_full, - MemoryAllocator* memory_allocator) { - std::unique_ptr index_block; - auto s = ReadBlockFromFile( - file, prefetch_buffer, footer, ReadOptions(), index_handle, - &index_block, ioptions, true /* decompress */, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, memory_allocator); - - if (s.ok()) { - *index_reader = new BinarySearchIndexReader( - icomparator, std::move(index_block), ioptions.statistics, - index_key_includes_seq, index_value_is_full); - } - - return s; - } - - InternalIteratorBase* NewIterator( - IndexBlockIter* iter = nullptr, bool /*dont_care*/ = true, - bool /*dont_care*/ = true) override { - Statistics* kNullStats = nullptr; - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - return index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), iter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); - } - - size_t size() const override { return index_block_->size(); } - size_t usable_size() const override { return index_block_->usable_size(); } - - size_t ApproximateMemoryUsage() const override { - assert(index_block_); - size_t usage = index_block_->ApproximateMemoryUsage(); -#ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); -#else - usage += sizeof(*this); -#endif // ROCKSDB_MALLOC_USABLE_SIZE - return usage; - } - - private: - BinarySearchIndexReader(const InternalKeyComparator* icomparator, - std::unique_ptr&& index_block, - Statistics* stats, const bool index_key_includes_seq, - const bool index_value_is_full) - : IndexReader(icomparator, stats), - index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq), - index_value_is_full_(index_value_is_full) { - assert(index_block_ != nullptr); - } - std::unique_ptr index_block_; - const bool index_key_includes_seq_; - const bool index_value_is_full_; -}; - -// Index that leverages an internal hash table to quicken the lookup for a given -// key. -class HashIndexReader : public IndexReader { - public: - static Status Create( - const SliceTransform* hash_key_extractor, const Footer& footer, - RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, - const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icomparator, const BlockHandle& index_handle, - InternalIterator* meta_index_iter, IndexReader** index_reader, - bool /*hash_index_allow_collision*/, - const PersistentCacheOptions& cache_options, - const bool index_key_includes_seq, const bool index_value_is_full, - MemoryAllocator* memory_allocator) { - std::unique_ptr index_block; - auto s = ReadBlockFromFile( - file, prefetch_buffer, footer, ReadOptions(), index_handle, - &index_block, ioptions, true /* decompress */, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, memory_allocator); - - if (!s.ok()) { - return s; - } - - // Note, failure to create prefix hash index does not need to be a - // hard error. 
We can still fall back to the original binary search index. - // So, Create will succeed regardless, from this point on. - - auto new_index_reader = new HashIndexReader( - icomparator, std::move(index_block), ioptions.statistics, - index_key_includes_seq, index_value_is_full); - *index_reader = new_index_reader; - - // Get prefixes block - BlockHandle prefixes_handle; - s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, - &prefixes_handle); - if (!s.ok()) { - // TODO: log error - return Status::OK(); - } - - // Get index metadata block - BlockHandle prefixes_meta_handle; - s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock, - &prefixes_meta_handle); - if (!s.ok()) { - // TODO: log error - return Status::OK(); - } - - // Read contents for the blocks - BlockContents prefixes_contents; - BlockFetcher prefixes_block_fetcher( - file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, - &prefixes_contents, ioptions, true /*decompress*/, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, memory_allocator); - s = prefixes_block_fetcher.ReadBlockContents(); - if (!s.ok()) { - return s; - } - BlockContents prefixes_meta_contents; - BlockFetcher prefixes_meta_block_fetcher( - file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, - &prefixes_meta_contents, ioptions, true /*decompress*/, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, memory_allocator); - s = prefixes_meta_block_fetcher.ReadBlockContents(); - if (!s.ok()) { - // TODO: log error - return Status::OK(); - } - - BlockPrefixIndex* prefix_index = nullptr; - s = BlockPrefixIndex::Create(hash_key_extractor, prefixes_contents.data, - prefixes_meta_contents.data, &prefix_index); - // TODO: log error - if (s.ok()) { - new_index_reader->prefix_index_.reset(prefix_index); - } - - return Status::OK(); - } - - InternalIteratorBase* NewIterator( - IndexBlockIter* iter = nullptr, bool total_order_seek = true, - bool /*dont_care*/ = true) override { - Statistics* kNullStats = nullptr; - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. 
- return index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), iter, kNullStats, - total_order_seek, index_key_includes_seq_, index_value_is_full_, - false /* block_contents_pinned */, prefix_index_.get()); - } - - size_t size() const override { return index_block_->size(); } - size_t usable_size() const override { return index_block_->usable_size(); } - - size_t ApproximateMemoryUsage() const override { - assert(index_block_); - size_t usage = index_block_->ApproximateMemoryUsage(); - usage += prefixes_contents_.usable_size(); -#ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); -#else - if (prefix_index_) { - usage += prefix_index_->ApproximateMemoryUsage(); - } - usage += sizeof(*this); -#endif // ROCKSDB_MALLOC_USABLE_SIZE - return usage; - } - - private: - HashIndexReader(const InternalKeyComparator* icomparator, - std::unique_ptr&& index_block, Statistics* stats, - const bool index_key_includes_seq, - const bool index_value_is_full) - : IndexReader(icomparator, stats), - index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq), - index_value_is_full_(index_value_is_full) { - assert(index_block_ != nullptr); - } - - ~HashIndexReader() override {} - - std::unique_ptr index_block_; - std::unique_ptr prefix_index_; - BlockContents prefixes_contents_; - const bool index_key_includes_seq_; - const bool index_value_is_full_; -}; - -// Helper function to setup the cache key's prefix for the Table. -void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep, uint64_t file_size) { - assert(kMaxCacheKeyPrefixSize >= 10); - rep->cache_key_prefix_size = 0; - rep->compressed_cache_key_prefix_size = 0; - if (rep->table_options.block_cache != nullptr) { - GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(), - &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); - // Create dummy offset of index reader which is beyond the file size. - rep->dummy_index_reader_offset = - file_size + rep->table_options.block_cache->NewId(); - } - if (rep->table_options.persistent_cache != nullptr) { - GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(), - &rep->persistent_cache_key_prefix[0], - &rep->persistent_cache_key_prefix_size); - } - if (rep->table_options.block_cache_compressed != nullptr) { - GenerateCachePrefix(rep->table_options.block_cache_compressed.get(), - rep->file->file(), &rep->compressed_cache_key_prefix[0], - &rep->compressed_cache_key_prefix_size); - } -} - -void BlockBasedTable::GenerateCachePrefix(Cache* cc, RandomAccessFile* file, - char* buffer, size_t* size) { - // generate an id from the file - *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); - - // If the prefix wasn't generated or was too long, - // create one from the cache. - if (cc && *size == 0) { - char* end = EncodeVarint64(buffer, cc->NewId()); - *size = static_cast(end - buffer); - } -} - -void BlockBasedTable::GenerateCachePrefix(Cache* cc, WritableFile* file, - char* buffer, size_t* size) { - // generate an id from the file - *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); - - // If the prefix wasn't generated or was too long, - // create one from the cache. - if (*size == 0) { - char* end = EncodeVarint64(buffer, cc->NewId()); - *size = static_cast(end - buffer); - } -} - -namespace { -// Return True if table_properties has `user_prop_name` has a `true` value -// or it doesn't contain this property (for backward compatible). 
-bool IsFeatureSupported(const TableProperties& table_properties, - const std::string& user_prop_name, Logger* info_log) { - auto& props = table_properties.user_collected_properties; - auto pos = props.find(user_prop_name); - // Older version doesn't have this value set. Skip this check. - if (pos != props.end()) { - if (pos->second == kPropFalse) { - return false; - } else if (pos->second != kPropTrue) { - ROCKS_LOG_WARN(info_log, "Property %s has invalidate value %s", - user_prop_name.c_str(), pos->second.c_str()); - } - } - return true; -} - -// Caller has to ensure seqno is not nullptr. -Status GetGlobalSequenceNumber(const TableProperties& table_properties, - SequenceNumber largest_seqno, - SequenceNumber* seqno) { - const auto& props = table_properties.user_collected_properties; - const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion); - const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno); - - *seqno = kDisableGlobalSequenceNumber; - if (version_pos == props.end()) { - if (seqno_pos != props.end()) { - std::array msg_buf; - // This is not an external sst file, global_seqno is not supported. - snprintf( - msg_buf.data(), msg_buf.max_size(), - "A non-external sst file have global seqno property with value %s", - seqno_pos->second.c_str()); - return Status::Corruption(msg_buf.data()); - } - return Status::OK(); - } - - uint32_t version = DecodeFixed32(version_pos->second.c_str()); - if (version < 2) { - if (seqno_pos != props.end() || version != 1) { - std::array msg_buf; - // This is a v1 external sst file, global_seqno is not supported. - snprintf(msg_buf.data(), msg_buf.max_size(), - "An external sst file with version %u have global seqno " - "property with value %s", - version, seqno_pos->second.c_str()); - return Status::Corruption(msg_buf.data()); - } - return Status::OK(); - } - - // Since we have a plan to deprecate global_seqno, we do not return failure - // if seqno_pos == props.end(). We rely on version_pos to detect whether the - // SST is external. - SequenceNumber global_seqno(0); - if (seqno_pos != props.end()) { - global_seqno = DecodeFixed64(seqno_pos->second.c_str()); - } - // SstTableReader open table reader with kMaxSequenceNumber as largest_seqno - // to denote it is unknown. 
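// Sketch: a condensed version of the resolution rules in
// GetGlobalSequenceNumber() (above and continued below). Returns false for
// the corruption cases; kUnknownSeqnoSketch stands in for
// kDisableGlobalSequenceNumber. Purely illustrative, not RocksDB API.
#include <cstdint>

constexpr uint64_t kUnknownSeqnoSketch = UINT64_MAX;

inline bool ResolveGlobalSeqnoSketch(bool has_version_prop, uint32_t version,
                                     bool has_seqno_prop, uint64_t stored_seqno,
                                     uint64_t largest_seqno, uint64_t* out) {
  *out = kUnknownSeqnoSketch;          // default: no global seqno in effect
  if (!has_version_prop) {             // not an external SST file
    return !has_seqno_prop;            // a stray seqno property is corruption
  }
  if (version < 2) {                   // v1 external files: unsupported
    return version == 1 && !has_seqno_prop;
  }
  uint64_t seqno = has_seqno_prop ? stored_seqno : 0;
  if (largest_seqno < kUnknownSeqnoSketch) {  // caller knows the largest seqno
    if (seqno == 0) {
      seqno = largest_seqno;           // 0 means "take the file's largest"
    }
    if (seqno != largest_seqno) {
      return false;                    // stored value contradicts the file
    }
  }
  *out = seqno;
  return true;
}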
- if (largest_seqno < kMaxSequenceNumber) { - if (global_seqno == 0) { - global_seqno = largest_seqno; - } - if (global_seqno != largest_seqno) { - std::array msg_buf; - snprintf( - msg_buf.data(), msg_buf.max_size(), - "An external sst file with version %u have global seqno property " - "with value %s, while largest seqno in the file is %llu", - version, seqno_pos->second.c_str(), - static_cast(largest_seqno)); - return Status::Corruption(msg_buf.data()); - } - } - *seqno = global_seqno; - - if (global_seqno > kMaxSequenceNumber) { - std::array msg_buf; - snprintf(msg_buf.data(), msg_buf.max_size(), - "An external sst file with version %u have global seqno property " - "with value %llu, which is greater than kMaxSequenceNumber", - version, static_cast(global_seqno)); - return Status::Corruption(msg_buf.data()); - } - - return Status::OK(); -} -} // namespace - -Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix, - size_t cache_key_prefix_size, - const BlockHandle& handle, char* cache_key) { - assert(cache_key != nullptr); - assert(cache_key_prefix_size != 0); - assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize); - memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); - char* end = - EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset()); - return Slice(cache_key, static_cast(end - cache_key)); -} - -Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_comparator, - std::unique_ptr&& file, - uint64_t file_size, - std::unique_ptr* table_reader, - const SliceTransform* prefix_extractor, - const bool prefetch_index_and_filter_in_cache, - const bool skip_filters, const int level, - const bool immortal_table, - const SequenceNumber largest_seqno, - TailPrefetchStats* tail_prefetch_stats) { - table_reader->reset(); - - Status s; - Footer footer; - std::unique_ptr prefetch_buffer; - - // prefetch both index and filters, down to all partitions - const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; - const bool preload_all = !table_options.cache_index_and_filter_blocks; - - s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all, - preload_all, &prefetch_buffer); - - // Read in the following order: - // 1. Footer - // 2. [metaindex block] - // 3. [meta block: properties] - // 4. [meta block: range deletion tombstone] - // 5. [meta block: compression dictionary] - // 6. [meta block: index] - // 7. [meta block: filter] - s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer, - kBlockBasedTableMagicNumber); - if (!s.ok()) { - return s; - } - if (!BlockBasedTableSupportedVersion(footer.version())) { - return Status::Corruption( - "Unknown Footer version. Maybe this file was created with newer " - "version of RocksDB?"); - } - - // We've successfully read the footer. We are ready to serve requests. - // Better not mutate rep_ after the creation. eg. internal_prefix_transform - // raw pointer will be used to create HashIndexReader, whose reset may - // access a dangling pointer. 
- Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, - internal_comparator, skip_filters, level, - immortal_table); - rep->file = std::move(file); - rep->footer = footer; - rep->index_type = table_options.index_type; - rep->hash_index_allow_collision = table_options.hash_index_allow_collision; - // We need to wrap data with internal_prefix_transform to make sure it can - // handle prefix correctly. - rep->internal_prefix_transform.reset( - new InternalKeySliceTransform(prefix_extractor)); - SetupCacheKeyPrefix(rep, file_size); - std::unique_ptr new_table(new BlockBasedTable(rep)); - - // page cache options - rep->persistent_cache_options = - PersistentCacheOptions(rep->table_options.persistent_cache, - std::string(rep->persistent_cache_key_prefix, - rep->persistent_cache_key_prefix_size), - rep->ioptions.statistics); - - // Read metaindex - std::unique_ptr meta; - std::unique_ptr meta_iter; - s = ReadMetaBlock(rep, prefetch_buffer.get(), &meta, &meta_iter); - if (!s.ok()) { - return s; - } - - s = ReadPropertiesBlock(rep, prefetch_buffer.get(), meta_iter.get(), - largest_seqno); - if (!s.ok()) { - return s; - } - s = ReadRangeDelBlock(rep, prefetch_buffer.get(), meta_iter.get(), - internal_comparator); - if (!s.ok()) { - return s; - } - s = PrefetchIndexAndFilterBlocks(rep, prefetch_buffer.get(), meta_iter.get(), - new_table.get(), prefix_extractor, - prefetch_all, table_options, level, - prefetch_index_and_filter_in_cache); - - if (s.ok()) { - // Update tail prefetch stats - assert(prefetch_buffer.get() != nullptr); - if (tail_prefetch_stats != nullptr) { - assert(prefetch_buffer->min_offset_read() < file_size); - tail_prefetch_stats->RecordEffectiveSize( - static_cast(file_size) - prefetch_buffer->min_offset_read()); - } - - *table_reader = std::move(new_table); - } - - return s; -} - -Status BlockBasedTable::PrefetchTail( - RandomAccessFileReader* file, uint64_t file_size, - TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, - const bool preload_all, - std::unique_ptr* prefetch_buffer) { - size_t tail_prefetch_size = 0; - if (tail_prefetch_stats != nullptr) { - // Multiple threads may get a 0 (no history) when running in parallel, - // but it will get cleared after the first of them finishes. - tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); - } - if (tail_prefetch_size == 0) { - // Before read footer, readahead backwards to prefetch data. Do more - // readahead if we're going to read index/filter. - // TODO: This may incorrectly select small readahead in case partitioned - // index/filter is enabled and top-level partition pinning is enabled. - // That's because we need to issue readahead before we read the properties, - // at which point we don't yet know the index type. - tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; - } - size_t prefetch_off; - size_t prefetch_len; - if (file_size < tail_prefetch_size) { - prefetch_off = 0; - prefetch_len = static_cast(file_size); - } else { - prefetch_off = static_cast(file_size - tail_prefetch_size); - prefetch_len = tail_prefetch_size; - } - TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen", - &tail_prefetch_size); - Status s; - // TODO should not have this special logic in the future. 
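// Sketch: the tail-read sizing just computed above. Use the historical
// suggestion when one exists, otherwise 512 KB if index/filter will be read
// up front and 4 KB if not, then clamp so the read never starts before 0.
// Illustrative helper, not RocksDB API.
#include <algorithm>
#include <cstdint>
#include <utility>

// Returns {offset, length} of the tail read. `suggested` is 0 when there is
// no prefetch history yet.
inline std::pair<uint64_t, uint64_t> TailPrefetchRangeSketch(
    uint64_t file_size, uint64_t suggested, bool read_meta_up_front) {
  uint64_t len = suggested != 0
                     ? suggested
                     : (read_meta_up_front ? 512 * 1024 : 4 * 1024);
  len = std::min(len, file_size);
  return {file_size - len, len};
}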
- if (!file->use_direct_io()) { - prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, false, true)); - s = file->Prefetch(prefetch_off, prefetch_len); - } else { - prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, true, true)); - s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len); - } - return s; -} - -Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len, - uint32_t expected) { - Status s; - uint32_t actual = 0; - switch (type) { - case kNoChecksum: - break; - case kCRC32c: - expected = crc32c::Unmask(expected); - actual = crc32c::Value(buf, len); - break; - case kxxHash: - actual = XXH32(buf, static_cast(len), 0); - break; - case kxxHash64: - actual = static_cast(XXH64(buf, static_cast(len), 0) & - uint64_t{0xffffffff}); - break; - default: - s = Status::Corruption("unknown checksum type"); - } - if (s.ok() && actual != expected) { - s = Status::Corruption("properties block checksum mismatched"); - } - return s; -} - -Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, - TableProperties** table_properties) { - assert(table_properties != nullptr); - // If this is an external SST file ingested with write_global_seqno set to - // true, then we expect the checksum mismatch because checksum was written - // by SstFileWriter, but its global seqno in the properties block may have - // been changed during ingestion. In this case, we read the properties - // block, copy it to a memory buffer, change the global seqno to its - // original value, i.e. 0, and verify the checksum again. - BlockHandle props_block_handle; - CacheAllocationPtr tmp_buf; - Status s = ReadProperties(handle_value, rep->file.get(), prefetch_buffer, - rep->footer, rep->ioptions, table_properties, - false /* verify_checksum */, &props_block_handle, - &tmp_buf, false /* compression_type_missing */, - nullptr /* memory_allocator */); - if (s.ok() && tmp_buf) { - const auto seqno_pos_iter = - (*table_properties) - ->properties_offsets.find( - ExternalSstFilePropertyNames::kGlobalSeqno); - size_t block_size = static_cast(props_block_handle.size()); - if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) { - uint64_t global_seqno_offset = seqno_pos_iter->second; - EncodeFixed64( - tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0); - } - uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1); - s = rocksdb::VerifyChecksum(rep->footer.checksum(), tmp_buf.get(), - block_size + 1, value); - } - return s; -} - -Status BlockBasedTable::ReadPropertiesBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - const SequenceNumber largest_seqno) { - bool found_properties_block = true; - Status s; - s = SeekToPropertiesBlock(meta_iter, &found_properties_block); - - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Error when seeking to properties block from file: %s", - s.ToString().c_str()); - } else if (found_properties_block) { - s = meta_iter->status(); - TableProperties* table_properties = nullptr; - if (s.ok()) { - s = ReadProperties( - meta_iter->value(), rep->file.get(), prefetch_buffer, rep->footer, - rep->ioptions, &table_properties, true /* verify_checksum */, - nullptr /* ret_block_handle */, nullptr /* ret_block_contents */, - false /* compression_type_missing */, nullptr /* memory_allocator */); - } - - if (s.IsCorruption()) { - s = TryReadPropertiesWithGlobalSeqno( - rep, prefetch_buffer, meta_iter->value(), 
&table_properties); - } - std::unique_ptr props_guard; - if (table_properties != nullptr) { - props_guard.reset(table_properties); - } - - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Encountered error while reading data from properties " - "block %s", - s.ToString().c_str()); - } else { - assert(table_properties != nullptr); - rep->table_properties.reset(props_guard.release()); - rep->blocks_maybe_compressed = rep->table_properties->compression_name != - CompressionTypeToString(kNoCompression); - rep->blocks_definitely_zstd_compressed = - (rep->table_properties->compression_name == - CompressionTypeToString(kZSTD) || - rep->table_properties->compression_name == - CompressionTypeToString(kZSTDNotFinalCompression)); - } - } else { - ROCKS_LOG_ERROR(rep->ioptions.info_log, - "Cannot find Properties block from file."); - } -#ifndef ROCKSDB_LITE - if (rep->table_properties) { - ParseSliceTransform(rep->table_properties->prefix_extractor_name, - &(rep->table_prefix_extractor)); - } -#endif // ROCKSDB_LITE - - // Read the table properties, if provided. - if (rep->table_properties) { - rep->whole_key_filtering &= - IsFeatureSupported(*(rep->table_properties), - BlockBasedTablePropertyNames::kWholeKeyFiltering, - rep->ioptions.info_log); - rep->prefix_filtering &= IsFeatureSupported( - *(rep->table_properties), - BlockBasedTablePropertyNames::kPrefixFiltering, rep->ioptions.info_log); - - s = GetGlobalSequenceNumber(*(rep->table_properties), largest_seqno, - &(rep->global_seqno)); - if (!s.ok()) { - ROCKS_LOG_ERROR(rep->ioptions.info_log, "%s", s.ToString().c_str()); - } - } - return s; -} - -Status BlockBasedTable::ReadRangeDelBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - const InternalKeyComparator& internal_comparator) { - Status s; - bool found_range_del_block; - BlockHandle range_del_handle; - s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle); - if (!s.ok()) { - ROCKS_LOG_WARN( - rep->ioptions.info_log, - "Error when seeking to range delete tombstones block from file: %s", - s.ToString().c_str()); - } else if (found_range_del_block && !range_del_handle.IsNull()) { - ReadOptions read_options; - std::unique_ptr iter(NewDataBlockIterator( - rep, read_options, range_del_handle, nullptr /* input_iter */, - false /* is_index */, true /* key_includes_seq */, - true /* index_key_is_full */, nullptr /* get_context */, Status(), - prefetch_buffer)); - assert(iter != nullptr); - s = iter->status(); - if (!s.ok()) { - ROCKS_LOG_WARN( - rep->ioptions.info_log, - "Encountered error while reading data from range del block %s", - s.ToString().c_str()); - } else { - rep->fragmented_range_dels = - std::make_shared(std::move(iter), - internal_comparator); - } - } - return s; -} - -Status BlockBasedTable::ReadCompressionDictBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* compression_dict_block) { - assert(compression_dict_block != nullptr); - Status s; - if (!rep->compression_dict_handle.IsNull()) { - std::unique_ptr compression_dict_cont{new BlockContents()}; - PersistentCacheOptions cache_options; - ReadOptions read_options; - read_options.verify_checksums = true; - BlockFetcher compression_block_fetcher( - rep->file.get(), prefetch_buffer, rep->footer, read_options, - rep->compression_dict_handle, compression_dict_cont.get(), - rep->ioptions, false /* decompress */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), cache_options); - s = compression_block_fetcher.ReadBlockContents(); - - 
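// Sketch: block reads such as the one above are verified with the
// VerifyChecksum() helper defined earlier. For kCRC32c the stored value is
// "masked" (the LevelDB/RocksDB convention for CRCs stored alongside data
// that may itself contain CRCs), so verification first unmasks it. The CRC
// computation itself is assumed to come from a crc32c library.
#include <cstdint>

constexpr uint32_t kCrcMaskDeltaSketch = 0xa282ead8u;

// Mask: rotate right by 15 bits, then add a constant. Unmask inverts it.
inline uint32_t MaskCrcSketch(uint32_t crc) {
  return ((crc >> 15) | (crc << 17)) + kCrcMaskDeltaSketch;
}

inline uint32_t UnmaskCrcSketch(uint32_t masked) {
  const uint32_t rot = masked - kCrcMaskDeltaSketch;
  return (rot >> 17) | (rot << 15);
}

// `stored_masked` comes from the block trailer; `computed` is a fresh CRC32C
// over the block payload plus its compression-type byte.
inline bool CrcMatchesSketch(uint32_t stored_masked, uint32_t computed) {
  return UnmaskCrcSketch(stored_masked) == computed;
}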
if (!s.ok()) { - ROCKS_LOG_WARN( - rep->ioptions.info_log, - "Encountered error while reading data from compression dictionary " - "block %s", - s.ToString().c_str()); - } else { - *compression_dict_block = std::move(compression_dict_cont); - } - } - return s; -} - -Status BlockBasedTable::PrefetchIndexAndFilterBlocks( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - BlockBasedTable* new_table, const SliceTransform* prefix_extractor, - bool prefetch_all, const BlockBasedTableOptions& table_options, - const int level, const bool prefetch_index_and_filter_in_cache) { - Status s; - - // Find filter handle and filter type - if (rep->filter_policy) { - for (auto filter_type : - {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter, - Rep::FilterType::kBlockFilter}) { - std::string prefix; - switch (filter_type) { - case Rep::FilterType::kFullFilter: - prefix = kFullFilterBlockPrefix; - break; - case Rep::FilterType::kPartitionedFilter: - prefix = kPartitionedFilterBlockPrefix; - break; - case Rep::FilterType::kBlockFilter: - prefix = kFilterBlockPrefix; - break; - default: - assert(0); - } - std::string filter_block_key = prefix; - filter_block_key.append(rep->filter_policy->Name()); - if (FindMetaBlock(meta_iter, filter_block_key, &rep->filter_handle) - .ok()) { - rep->filter_type = filter_type; - break; - } - } - } - - { - // Find compression dictionary handle - bool found_compression_dict; - s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict, - &rep->compression_dict_handle); - } - - bool need_upper_bound_check = - PrefixExtractorChanged(rep->table_properties.get(), prefix_extractor); - - BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType(); - // prefetch the first level of index - const bool prefetch_index = - prefetch_all || - (table_options.pin_top_level_index_and_filter && - index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); - // prefetch the first level of filter - const bool prefetch_filter = - prefetch_all || (table_options.pin_top_level_index_and_filter && - rep->filter_type == Rep::FilterType::kPartitionedFilter); - // Partition fitlers cannot be enabled without partition indexes - assert(!prefetch_filter || prefetch_index); - // pin both index and filters, down to all partitions - const bool pin_all = - rep->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; - // pin the first level of index - const bool pin_index = - pin_all || (table_options.pin_top_level_index_and_filter && - index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); - // pin the first level of filter - const bool pin_filter = - pin_all || (table_options.pin_top_level_index_and_filter && - rep->filter_type == Rep::FilterType::kPartitionedFilter); - // pre-fetching of blocks is turned on - // Will use block cache for meta-blocks access - // Always prefetch index and filter for level 0 - // TODO(ajkr): also prefetch compression dictionary block - if (table_options.cache_index_and_filter_blocks) { - assert(table_options.block_cache != nullptr); - if (prefetch_index) { - // Hack: Call NewIndexIterator() to implicitly add index to the - // block_cache - CachableEntry index_entry; - // check prefix_extractor match only if hash based index is used - bool disable_prefix_seek = - rep->index_type == BlockBasedTableOptions::kHashSearch && - need_upper_bound_check; - if (s.ok()) { - std::unique_ptr> iter( - new_table->NewIndexIterator(ReadOptions(), disable_prefix_seek, - nullptr, &index_entry)); - s = 
iter->status(); - } - if (s.ok()) { - // This is the first call to NewIndexIterator() since we're in Open(). - // On success it should give us ownership of the `CachableEntry` by - // populating `index_entry`. - assert(index_entry.value != nullptr); - if (prefetch_all) { - index_entry.value->CacheDependencies(pin_all); - } - if (pin_index) { - rep->index_entry = std::move(index_entry); - } else { - index_entry.Release(table_options.block_cache.get()); - } - } - } - if (s.ok() && prefetch_filter) { - // Hack: Call GetFilter() to implicitly add filter to the block_cache - auto filter_entry = - new_table->GetFilter(rep->table_prefix_extractor.get()); - if (filter_entry.value != nullptr && prefetch_all) { - filter_entry.value->CacheDependencies( - pin_all, rep->table_prefix_extractor.get()); - } - // if pin_filter is true then save it in rep_->filter_entry; it will be - // released in the destructor only, hence it will be pinned in the - // cache while this reader is alive - if (pin_filter) { - rep->filter_entry = filter_entry; - } else { - filter_entry.Release(table_options.block_cache.get()); - } - } - } else { - // If we don't use block cache for meta-block access, we'll pre-load these - // blocks, which will kept in member variables in Rep and with a same life- - // time as this table object. - IndexReader* index_reader = nullptr; - if (s.ok()) { - s = new_table->CreateIndexReader(prefetch_buffer, &index_reader, - meta_iter, level); - } - std::unique_ptr compression_dict_block; - if (s.ok()) { - rep->index_reader.reset(index_reader); - // The partitions of partitioned index are always stored in cache. They - // are hence follow the configuration for pin and prefetch regardless of - // the value of cache_index_and_filter_blocks - if (prefetch_index_and_filter_in_cache || level == 0) { - rep->index_reader->CacheDependencies(pin_all); - } - - // Set filter block - if (rep->filter_policy) { - const bool is_a_filter_partition = true; - auto filter = new_table->ReadFilter(prefetch_buffer, rep->filter_handle, - !is_a_filter_partition, - rep->table_prefix_extractor.get()); - rep->filter.reset(filter); - // Refer to the comment above about paritioned indexes always being - // cached - if (filter && (prefetch_index_and_filter_in_cache || level == 0)) { - filter->CacheDependencies(pin_all, rep->table_prefix_extractor.get()); - } - } - s = ReadCompressionDictBlock(rep, prefetch_buffer, - &compression_dict_block); - } else { - delete index_reader; - } - if (s.ok() && !rep->compression_dict_handle.IsNull()) { - assert(compression_dict_block != nullptr); - // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy - rep->uncompression_dict.reset(new UncompressionDict( - compression_dict_block->data.ToString(), - rep->blocks_definitely_zstd_compressed, rep->ioptions.statistics)); - } - } - return s; -} - -void BlockBasedTable::SetupForCompaction() { - switch (rep_->ioptions.access_hint_on_compaction_start) { - case Options::NONE: - break; - case Options::NORMAL: - rep_->file->file()->Hint(RandomAccessFile::NORMAL); - break; - case Options::SEQUENTIAL: - rep_->file->file()->Hint(RandomAccessFile::SEQUENTIAL); - break; - case Options::WILLNEED: - rep_->file->file()->Hint(RandomAccessFile::WILLNEED); - break; - default: - assert(false); - } -} - -std::shared_ptr BlockBasedTable::GetTableProperties() - const { - return rep_->table_properties; -} - -size_t BlockBasedTable::ApproximateMemoryUsage() const { - size_t usage = 0; - if (rep_->filter) { - usage += 
rep_->filter->ApproximateMemoryUsage(); - } - if (rep_->index_reader) { - usage += rep_->index_reader->ApproximateMemoryUsage(); - } - if (rep_->uncompression_dict) { - usage += rep_->uncompression_dict->ApproximateMemoryUsage(); - } - return usage; -} - -// Load the meta-block from the file. On success, return the loaded meta block -// and its iterator. -Status BlockBasedTable::ReadMetaBlock(Rep* rep, - FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* meta_block, - std::unique_ptr* iter) { - // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates - // it is an empty block. - std::unique_ptr meta; - Status s = ReadBlockFromFile( - rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(), - rep->footer.metaindex_handle(), &meta, rep->ioptions, - true /* decompress */, true /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), rep->persistent_cache_options, - kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, - GetMemoryAllocator(rep->table_options)); - - if (!s.ok()) { - ROCKS_LOG_ERROR(rep->ioptions.info_log, - "Encountered error while reading data from properties" - " block %s", - s.ToString().c_str()); - return s; - } - - *meta_block = std::move(meta); - // meta block uses bytewise comparator. - iter->reset(meta_block->get()->NewIterator( - BytewiseComparator(), BytewiseComparator())); - return Status::OK(); -} - -Status BlockBasedTable::GetDataBlockFromCache( - const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, Rep* rep, - const ReadOptions& read_options, - BlockBasedTable::CachableEntry* block, - const UncompressionDict& uncompression_dict, size_t read_amp_bytes_per_bit, - bool is_index, GetContext* get_context) { - Status s; - BlockContents* compressed_block = nullptr; - Cache::Handle* block_cache_compressed_handle = nullptr; - Statistics* statistics = rep->ioptions.statistics; - - // Lookup uncompressed cache first - if (block_cache != nullptr) { - block->cache_handle = GetEntryFromCache( - block_cache, block_cache_key, rep->level, - is_index ? BLOCK_CACHE_INDEX_MISS : BLOCK_CACHE_DATA_MISS, - is_index ? BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT, - get_context - ? (is_index ? &get_context->get_context_stats_.num_cache_index_miss - : &get_context->get_context_stats_.num_cache_data_miss) - : nullptr, - get_context - ? (is_index ? &get_context->get_context_stats_.num_cache_index_hit - : &get_context->get_context_stats_.num_cache_data_hit) - : nullptr, - statistics, get_context); - if (block->cache_handle != nullptr) { - block->value = - reinterpret_cast(block_cache->Value(block->cache_handle)); - return s; - } - } - - // If not found, search from the compressed block cache. 
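// Sketch: the two-tier lookup performed by GetDataBlockFromCache(). The
// uncompressed cache is consulted first; on a miss the compressed cache is
// tried, and a compressed hit is decompressed and promoted into the
// uncompressed cache. std::unordered_map stands in for both caches and a
// single key is used for brevity (the real code derives a separate key per
// cache); all names are illustrative.
#include <functional>
#include <string>
#include <unordered_map>

using BlockMapSketch = std::unordered_map<std::string, std::string>;

// Returns true and fills *block on any hit; a full miss means the caller
// must read the block from the file.
inline bool TwoTierLookupSketch(
    BlockMapSketch& uncompressed_cache, const BlockMapSketch& compressed_cache,
    const std::string& key,
    const std::function<std::string(const std::string&)>& decompress,
    std::string* block) {
  auto it = uncompressed_cache.find(key);
  if (it != uncompressed_cache.end()) {
    *block = it->second;                    // fast path: already decoded
    return true;
  }
  auto cit = compressed_cache.find(key);
  if (cit == compressed_cache.end()) {
    return false;                           // full miss
  }
  *block = decompress(cit->second);         // decode once
  uncompressed_cache.emplace(key, *block);  // promote for future readers
  return true;
}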
- assert(block->cache_handle == nullptr && block->value == nullptr); - - if (block_cache_compressed == nullptr) { - return s; - } - - assert(!compressed_block_cache_key.empty()); - block_cache_compressed_handle = - block_cache_compressed->Lookup(compressed_block_cache_key); - // if we found in the compressed cache, then uncompress and insert into - // uncompressed cache - if (block_cache_compressed_handle == nullptr) { - RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS); - return s; - } - - // found compressed block - RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT); - compressed_block = reinterpret_cast( - block_cache_compressed->Value(block_cache_compressed_handle)); - CompressionType compression_type = compressed_block->get_compression_type(); - assert(compression_type != kNoCompression); - - // Retrieve the uncompressed contents into a new buffer - BlockContents contents; - UncompressionContext context(compression_type); - UncompressionInfo info(context, uncompression_dict, compression_type); - s = UncompressBlockContents(info, compressed_block->data.data(), - compressed_block->data.size(), &contents, - rep->table_options.format_version, rep->ioptions, - GetMemoryAllocator(rep->table_options)); - - // Insert uncompressed block into block cache - if (s.ok()) { - block->value = - new Block(std::move(contents), rep->get_global_seqno(is_index), - read_amp_bytes_per_bit, - statistics); // uncompressed block - if (block_cache != nullptr && block->value->own_bytes() && - read_options.fill_cache) { - size_t charge = block->value->ApproximateMemoryUsage(); - s = block_cache->Insert(block_cache_key, block->value, charge, - &DeleteCachedEntry, - &(block->cache_handle)); -#ifndef NDEBUG - block_cache->TEST_mark_as_data_block(block_cache_key, charge); -#endif // NDEBUG - if (s.ok()) { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += charge; - } else { - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); - } - if (is_index) { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_index_add++; - get_context->get_context_stats_.num_cache_index_bytes_insert += - charge; - } else { - RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); - RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); - } - } else { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_data_add++; - get_context->get_context_stats_.num_cache_data_bytes_insert += - charge; - } else { - RecordTick(statistics, BLOCK_CACHE_DATA_ADD); - RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge); - } - } - } else { - RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); - delete block->value; - block->value = nullptr; - } - } - } - - // Release hold on compressed cache entry - block_cache_compressed->Release(block_cache_compressed_handle); - return s; -} - -Status BlockBasedTable::PutDataBlockToCache( - const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& /*read_options*/, const ImmutableCFOptions& ioptions, - CachableEntry* cached_block, BlockContents* raw_block_contents, - CompressionType raw_block_comp_type, uint32_t format_version, - const UncompressionDict& uncompression_dict, SequenceNumber seq_no, - size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, - bool is_index, Cache::Priority priority, GetContext* get_context) { - 
assert(raw_block_comp_type == kNoCompression || - block_cache_compressed != nullptr); - - Status s; - // Retrieve the uncompressed contents into a new buffer - BlockContents uncompressed_block_contents; - Statistics* statistics = ioptions.statistics; - if (raw_block_comp_type != kNoCompression) { - UncompressionContext context(raw_block_comp_type); - UncompressionInfo info(context, uncompression_dict, raw_block_comp_type); - s = UncompressBlockContents(info, raw_block_contents->data.data(), - raw_block_contents->data.size(), - &uncompressed_block_contents, format_version, - ioptions, memory_allocator); - } - if (!s.ok()) { - return s; - } - - if (raw_block_comp_type != kNoCompression) { - cached_block->value = new Block(std::move(uncompressed_block_contents), - seq_no, read_amp_bytes_per_bit, - statistics); // uncompressed block - } else { - cached_block->value = - new Block(std::move(*raw_block_contents), seq_no, - read_amp_bytes_per_bit, ioptions.statistics); - } - - // Insert compressed block into compressed block cache. - // Release the hold on the compressed cache entry immediately. - if (block_cache_compressed != nullptr && - raw_block_comp_type != kNoCompression && raw_block_contents != nullptr && - raw_block_contents->own_bytes()) { -#ifndef NDEBUG - assert(raw_block_contents->is_raw_block); -#endif // NDEBUG - - // We cannot directly put raw_block_contents because this could point to - // an object in the stack. - BlockContents* block_cont_for_comp_cache = - new BlockContents(std::move(*raw_block_contents)); - s = block_cache_compressed->Insert( - compressed_block_cache_key, block_cont_for_comp_cache, - block_cont_for_comp_cache->ApproximateMemoryUsage(), - &DeleteCachedEntry); - if (s.ok()) { - // Avoid the following code to delete this cached block. 
- RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD); - } else { - RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); - delete block_cont_for_comp_cache; - } - } - - // insert into uncompressed block cache - if (block_cache != nullptr && cached_block->value->own_bytes()) { - size_t charge = cached_block->value->ApproximateMemoryUsage(); - s = block_cache->Insert(block_cache_key, cached_block->value, charge, - &DeleteCachedEntry, - &(cached_block->cache_handle), priority); -#ifndef NDEBUG - block_cache->TEST_mark_as_data_block(block_cache_key, charge); -#endif // NDEBUG - if (s.ok()) { - assert(cached_block->cache_handle != nullptr); - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += charge; - } else { - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); - } - if (is_index) { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_index_add++; - get_context->get_context_stats_.num_cache_index_bytes_insert += - charge; - } else { - RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); - RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); - } - } else { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_data_add++; - get_context->get_context_stats_.num_cache_data_bytes_insert += charge; - } else { - RecordTick(statistics, BLOCK_CACHE_DATA_ADD); - RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge); - } - } - assert(reinterpret_cast(block_cache->Value( - cached_block->cache_handle)) == cached_block->value); - } else { - RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); - delete cached_block->value; - cached_block->value = nullptr; - } - } - - return s; -} - -FilterBlockReader* BlockBasedTable::ReadFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle, - const bool is_a_filter_partition, - const SliceTransform* prefix_extractor) const { - auto& rep = rep_; - // TODO: We might want to unify with ReadBlockFromFile() if we start - // requiring checksum verification in Table::Open. - if (rep->filter_type == Rep::FilterType::kNoFilter) { - return nullptr; - } - BlockContents block; - - BlockFetcher block_fetcher( - rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(), - filter_handle, &block, rep->ioptions, false /* decompress */, - false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - rep->persistent_cache_options, GetMemoryAllocator(rep->table_options)); - Status s = block_fetcher.ReadBlockContents(); - - if (!s.ok()) { - // Error reading the block - return nullptr; - } - - assert(rep->filter_policy); - - auto filter_type = rep->filter_type; - if (rep->filter_type == Rep::FilterType::kPartitionedFilter && - is_a_filter_partition) { - filter_type = Rep::FilterType::kFullFilter; - } - - switch (filter_type) { - case Rep::FilterType::kPartitionedFilter: { - return new PartitionedFilterBlockReader( - rep->prefix_filtering ? prefix_extractor : nullptr, - rep->whole_key_filtering, std::move(block), nullptr, - rep->ioptions.statistics, rep->internal_comparator, this, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0); - } - - case Rep::FilterType::kBlockFilter: - return new BlockBasedFilterBlockReader( - rep->prefix_filtering ? 
prefix_extractor : nullptr, - rep->table_options, rep->whole_key_filtering, std::move(block), - rep->ioptions.statistics); - - case Rep::FilterType::kFullFilter: { - auto filter_bits_reader = - rep->filter_policy->GetFilterBitsReader(block.data); - assert(filter_bits_reader != nullptr); - return new FullFilterBlockReader( - rep->prefix_filtering ? prefix_extractor : nullptr, - rep->whole_key_filtering, std::move(block), filter_bits_reader, - rep->ioptions.statistics); - } - - default: - // filter_type is either kNoFilter (exited the function at the first if), - // or it must be covered in this switch block - assert(false); - return nullptr; - } -} - -BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( - const SliceTransform* prefix_extractor, FilePrefetchBuffer* prefetch_buffer, - bool no_io, GetContext* get_context) const { - const BlockHandle& filter_blk_handle = rep_->filter_handle; - const bool is_a_filter_partition = true; - return GetFilter(prefetch_buffer, filter_blk_handle, !is_a_filter_partition, - no_io, get_context, prefix_extractor); -} - -BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, - const bool is_a_filter_partition, bool no_io, GetContext* get_context, - const SliceTransform* prefix_extractor) const { - // If cache_index_and_filter_blocks is false, filter should be pre-populated. - // We will return rep_->filter anyway. rep_->filter can be nullptr if filter - // read fails at Open() time. We don't want to reload again since it will - // most probably fail again. - if (!is_a_filter_partition && - !rep_->table_options.cache_index_and_filter_blocks) { - return {rep_->filter.get(), nullptr /* cache handle */}; - } - - Cache* block_cache = rep_->table_options.block_cache.get(); - if (rep_->filter_policy == nullptr /* do not use filter */ || - block_cache == nullptr /* no block cache at all */) { - return {nullptr /* filter */, nullptr /* cache handle */}; - } - - if (!is_a_filter_partition && rep_->filter_entry.IsSet()) { - return rep_->filter_entry; - } - - PERF_TIMER_GUARD(read_filter_block_nanos); - - // Fetching from the cache - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - filter_blk_handle, cache_key); - - Statistics* statistics = rep_->ioptions.statistics; - auto cache_handle = GetEntryFromCache( - block_cache, key, rep_->level, BLOCK_CACHE_FILTER_MISS, - BLOCK_CACHE_FILTER_HIT, - get_context ? &get_context->get_context_stats_.num_cache_filter_miss - : nullptr, - get_context ? &get_context->get_context_stats_.num_cache_filter_hit - : nullptr, - statistics, get_context); - - FilterBlockReader* filter = nullptr; - if (cache_handle != nullptr) { - PERF_COUNTER_ADD(block_cache_filter_hit_count, 1); - filter = - reinterpret_cast(block_cache->Value(cache_handle)); - } else if (no_io) { - // Do not invoke any io. - return CachableEntry(); - } else { - filter = ReadFilter(prefetch_buffer, filter_blk_handle, - is_a_filter_partition, prefix_extractor); - if (filter != nullptr) { - size_t usage = filter->ApproximateMemoryUsage(); - Status s = block_cache->Insert( - key, filter, usage, &DeleteCachedFilterEntry, &cache_handle, - rep_->table_options.cache_index_and_filter_blocks_with_high_priority - ? 
Cache::Priority::HIGH - : Cache::Priority::LOW); - if (s.ok()) { - PERF_COUNTER_ADD(filter_block_read_count, 1); - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += usage; - get_context->get_context_stats_.num_cache_filter_add++; - get_context->get_context_stats_.num_cache_filter_bytes_insert += - usage; - } else { - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); - RecordTick(statistics, BLOCK_CACHE_FILTER_ADD); - RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage); - } - } else { - RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); - delete filter; - return CachableEntry(); - } - } - } - - return {filter, cache_handle}; -} - -BlockBasedTable::CachableEntry -BlockBasedTable::GetUncompressionDict(Rep* rep, - FilePrefetchBuffer* prefetch_buffer, - bool no_io, GetContext* get_context) { - if (!rep->table_options.cache_index_and_filter_blocks) { - // block cache is either disabled or not used for meta-blocks. In either - // case, BlockBasedTableReader is the owner of the uncompression dictionary. - return {rep->uncompression_dict.get(), nullptr /* cache handle */}; - } - if (rep->compression_dict_handle.IsNull()) { - return {nullptr, nullptr}; - } - char cache_key_buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto cache_key = - GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, - rep->compression_dict_handle, cache_key_buf); - auto cache_handle = GetEntryFromCache( - rep->table_options.block_cache.get(), cache_key, rep->level, - BLOCK_CACHE_COMPRESSION_DICT_MISS, BLOCK_CACHE_COMPRESSION_DICT_HIT, - get_context - ? &get_context->get_context_stats_.num_cache_compression_dict_miss - : nullptr, - get_context - ? &get_context->get_context_stats_.num_cache_compression_dict_hit - : nullptr, - rep->ioptions.statistics, get_context); - UncompressionDict* dict = nullptr; - if (cache_handle != nullptr) { - dict = reinterpret_cast( - rep->table_options.block_cache->Value(cache_handle)); - } else if (no_io) { - // Do not invoke any io. - } else { - std::unique_ptr compression_dict_block; - Status s = - ReadCompressionDictBlock(rep, prefetch_buffer, &compression_dict_block); - size_t usage = 0; - if (s.ok()) { - assert(compression_dict_block != nullptr); - // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy - dict = new UncompressionDict(compression_dict_block->data.ToString(), - rep->blocks_definitely_zstd_compressed, - rep->ioptions.statistics); - usage = dict->ApproximateMemoryUsage(); - s = rep->table_options.block_cache->Insert( - cache_key, dict, usage, &DeleteCachedUncompressionDictEntry, - &cache_handle, - rep->table_options.cache_index_and_filter_blocks_with_high_priority - ? 
Cache::Priority::HIGH - : Cache::Priority::LOW); - } - if (s.ok()) { - PERF_COUNTER_ADD(compression_dict_block_read_count, 1); - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += usage; - get_context->get_context_stats_.num_cache_compression_dict_add++; - get_context->get_context_stats_ - .num_cache_compression_dict_bytes_insert += usage; - } else { - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_ADD); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_WRITE, usage); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); - RecordTick(rep->ioptions.statistics, - BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, usage); - } - } else { - // There should be no way to get here if block cache insertion succeeded. - // Though it is still possible something failed earlier. - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); - delete dict; - dict = nullptr; - assert(cache_handle == nullptr); - } - } - return {dict, cache_handle}; -} - -// disable_prefix_seek should be set to true when prefix_extractor found in SST -// differs from the one in mutable_cf_options and index type is HashBasedIndex -InternalIteratorBase* BlockBasedTable::NewIndexIterator( - const ReadOptions& read_options, bool disable_prefix_seek, - IndexBlockIter* input_iter, CachableEntry* index_entry, - GetContext* get_context) { - // index reader has already been pre-populated. - if (rep_->index_reader) { - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - return rep_->index_reader->NewIterator( - input_iter, read_options.total_order_seek || disable_prefix_seek, - read_options.fill_cache); - } - // we have a pinned index block - if (rep_->index_entry.IsSet()) { - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - return rep_->index_entry.value->NewIterator( - input_iter, read_options.total_order_seek || disable_prefix_seek, - read_options.fill_cache); - } - - PERF_TIMER_GUARD(read_index_block_nanos); - - const bool no_io = read_options.read_tier == kBlockCacheTier; - Cache* block_cache = rep_->table_options.block_cache.get(); - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = - GetCacheKeyFromOffset(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->dummy_index_reader_offset, cache_key); - Statistics* statistics = rep_->ioptions.statistics; - auto cache_handle = GetEntryFromCache( - block_cache, key, rep_->level, BLOCK_CACHE_INDEX_MISS, - BLOCK_CACHE_INDEX_HIT, - get_context ? &get_context->get_context_stats_.num_cache_index_miss - : nullptr, - get_context ? &get_context->get_context_stats_.num_cache_index_hit - : nullptr, - statistics, get_context); - - if (cache_handle == nullptr && no_io) { - if (input_iter != nullptr) { - input_iter->Invalidate(Status::Incomplete("no blocking io")); - return input_iter; - } else { - return NewErrorInternalIterator( - Status::Incomplete("no blocking io")); - } - } - - IndexReader* index_reader = nullptr; - if (cache_handle != nullptr) { - PERF_COUNTER_ADD(block_cache_index_hit_count, 1); - index_reader = - reinterpret_cast(block_cache->Value(cache_handle)); - } else { - // Create index reader and put it in the cache. 
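// Sketch: the branch below (like GetUncompressionDict() and GetFilter()
// above) follows a lookup-or-create pattern: on a cache miss the object is
// built, charged to the cache by its approximate memory usage, and returned.
// MiniCacheSketch is an illustrative stand-in for the block cache.
#include <cstddef>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>

struct MiniCacheSketch {
  struct Entry {
    std::shared_ptr<void> obj;
    size_t charge;
  };
  std::unordered_map<std::string, Entry> map;
  size_t total_charge = 0;
};

template <typename T>
std::shared_ptr<T> GetOrCreateSketch(
    MiniCacheSketch& cache, const std::string& key,
    const std::function<std::shared_ptr<T>()>& create,
    const std::function<size_t(const T&)>& charge_of) {
  auto it = cache.map.find(key);
  if (it != cache.map.end()) {
    return std::static_pointer_cast<T>(it->second.obj);  // cache hit
  }
  std::shared_ptr<T> obj = create();                      // cache miss
  if (obj == nullptr) {
    return nullptr;                                       // creation failed
  }
  const size_t charge = charge_of(*obj);
  cache.map[key] = {obj, charge};
  cache.total_charge += charge;
  return obj;
}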
- Status s; - TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:2"); - s = CreateIndexReader(nullptr /* prefetch_buffer */, &index_reader); - TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:1"); - TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:3"); - TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:4"); - size_t charge = 0; - if (s.ok()) { - assert(index_reader != nullptr); - charge = index_reader->ApproximateMemoryUsage(); - s = block_cache->Insert( - key, index_reader, charge, &DeleteCachedIndexEntry, &cache_handle, - rep_->table_options.cache_index_and_filter_blocks_with_high_priority - ? Cache::Priority::HIGH - : Cache::Priority::LOW); - } - - if (s.ok()) { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += charge; - } else { - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); - } - PERF_COUNTER_ADD(index_block_read_count, 1); - RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); - RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); - } else { - if (index_reader != nullptr) { - delete index_reader; - } - RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); - // make sure if something goes wrong, index_reader shall remain intact. - if (input_iter != nullptr) { - input_iter->Invalidate(s); - return input_iter; - } else { - return NewErrorInternalIterator(s); - } - } - } - - assert(cache_handle); - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - auto* iter = index_reader->NewIterator( - input_iter, read_options.total_order_seek || disable_prefix_seek); - - // the caller would like to take ownership of the index block - // don't call RegisterCleanup() in this case, the caller will take care of it - if (index_entry != nullptr) { - *index_entry = {index_reader, cache_handle}; - } else { - iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle); - } - - return iter; -} - -// Convert an index iterator value (i.e., an encoded BlockHandle) -// into an iterator over the contents of the corresponding block. -// If input_iter is null, new a iterator -// If input_iter is not null, update this iter and return it -template -TBlockIter* BlockBasedTable::NewDataBlockIterator( - Rep* rep, const ReadOptions& ro, const BlockHandle& handle, - TBlockIter* input_iter, bool is_index, bool key_includes_seq, - bool index_key_is_full, GetContext* get_context, Status s, - FilePrefetchBuffer* prefetch_buffer) { - PERF_TIMER_GUARD(new_table_block_iter_nanos); - - Cache* block_cache = rep->table_options.block_cache.get(); - CachableEntry block; - TBlockIter* iter; - { - const bool no_io = (ro.read_tier == kBlockCacheTier); - auto uncompression_dict_storage = - GetUncompressionDict(rep, prefetch_buffer, no_io, get_context); - const UncompressionDict& uncompression_dict = - uncompression_dict_storage.value == nullptr - ? UncompressionDict::GetEmptyDict() - : *uncompression_dict_storage.value; - if (s.ok()) { - s = MaybeReadBlockAndLoadToCache(prefetch_buffer, rep, ro, handle, - uncompression_dict, &block, is_index, - get_context); - } - - if (input_iter != nullptr) { - iter = input_iter; - } else { - iter = new TBlockIter; - } - // Didn't get any data from block caches. 
- if (s.ok() && block.value == nullptr) { - if (no_io) { - // Could not read from block_cache and can't do IO - iter->Invalidate(Status::Incomplete("no blocking io")); - return iter; - } - std::unique_ptr block_value; - { - StopWatch sw(rep->ioptions.env, rep->ioptions.statistics, - READ_BLOCK_GET_MICROS); - s = ReadBlockFromFile( - rep->file.get(), prefetch_buffer, rep->footer, ro, handle, - &block_value, rep->ioptions, - rep->blocks_maybe_compressed /*do_decompress*/, - rep->blocks_maybe_compressed, uncompression_dict, - rep->persistent_cache_options, - is_index ? kDisableGlobalSequenceNumber : rep->global_seqno, - rep->table_options.read_amp_bytes_per_bit, - GetMemoryAllocator(rep->table_options)); - } - if (s.ok()) { - block.value = block_value.release(); - } - } - // TODO(ajkr): also pin compression dictionary block when - // `pin_l0_filter_and_index_blocks_in_cache == true`. - uncompression_dict_storage.Release(block_cache); - } - - if (s.ok()) { - assert(block.value != nullptr); - const bool kTotalOrderSeek = true; - // Block contents are pinned and it is still pinned after the iterator - // is destroyed as long as cleanup functions are moved to another object, - // when: - // 1. block cache handle is set to be released in cleanup function, or - // 2. it's pointing to immortal source. If own_bytes is true then we are - // not reading data from the original source, whether immortal or not. - // Otherwise, the block is pinned iff the source is immortal. - bool block_contents_pinned = - (block.cache_handle != nullptr || - (!block.value->own_bytes() && rep->immortal_table)); - iter = block.value->NewIterator( - &rep->internal_comparator, rep->internal_comparator.user_comparator(), - iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq, - index_key_is_full, block_contents_pinned); - if (block.cache_handle != nullptr) { - iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, - block.cache_handle); - } else { - if (!ro.fill_cache && rep->cache_key_prefix_size != 0) { - // insert a dummy record to block cache to track the memory usage - Cache::Handle* cache_handle; - // There are two other types of cache keys: 1) SST cache key added in - // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in - // `write_buffer_manager`. 
Use longer prefix (41 bytes) to differentiate - // from SST cache key(31 bytes), and use non-zero prefix to - // differentiate from `write_buffer_manager` - const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; - char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; - // Prefix: use rep->cache_key_prefix padded by 0s - memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); - assert(rep->cache_key_prefix_size != 0); - assert(rep->cache_key_prefix_size <= kExtraCacheKeyPrefix); - memcpy(cache_key, rep->cache_key_prefix, rep->cache_key_prefix_size); - char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, - next_cache_key_id_++); - assert(end - cache_key <= - static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length)); - Slice unique_key = - Slice(cache_key, static_cast(end - cache_key)); - s = block_cache->Insert(unique_key, nullptr, - block.value->ApproximateMemoryUsage(), nullptr, - &cache_handle); - if (s.ok()) { - if (cache_handle != nullptr) { - iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, - cache_handle); - } - } - } - iter->RegisterCleanup(&DeleteHeldResource, block.value, nullptr); - } - } else { - assert(block.value == nullptr); - iter->Invalidate(s); - } - return iter; -} - -Status BlockBasedTable::MaybeReadBlockAndLoadToCache( - FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro, - const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index, GetContext* get_context) { - assert(block_entry != nullptr); - const bool no_io = (ro.read_tier == kBlockCacheTier); - Cache* block_cache = rep->table_options.block_cache.get(); - - // No point to cache compressed blocks if it never goes away - Cache* block_cache_compressed = - rep->immortal_table ? nullptr - : rep->table_options.block_cache_compressed.get(); - - // First, try to get the block from the cache - // - // If either block cache is enabled, we'll try to read from it. - Status s; - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - Slice key /* key to the block cache */; - Slice ckey /* key to the compressed block cache */; - if (block_cache != nullptr || block_cache_compressed != nullptr) { - // create key for block cache - if (block_cache != nullptr) { - key = GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, - handle, cache_key); - } - - if (block_cache_compressed != nullptr) { - ckey = GetCacheKey(rep->compressed_cache_key_prefix, - rep->compressed_cache_key_prefix_size, handle, - compressed_cache_key); - } - - s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, - rep, ro, block_entry, uncompression_dict, - rep->table_options.read_amp_bytes_per_bit, - is_index, get_context); - - // Can't find the block from the cache. If I/O is allowed, read from the - // file. 
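// For reference, the `key` / `ckey` built just above follow a simple scheme:
// the file's cache-key prefix followed by the varint-encoded block offset. A
// rough sketch of that composition (mirroring the GetCacheKey helper; the
// buffer must hold at least prefix_size + kMaxVarint64Length bytes):
inline Slice MakeBlockCacheKeySketch(const char* prefix, size_t prefix_size,
                                     const BlockHandle& handle, char* buf) {
  memcpy(buf, prefix, prefix_size);
  char* end = EncodeVarint64(buf + prefix_size, handle.offset());
  return Slice(buf, static_cast<size_t>(end - buf));
}
// The 41-byte "dummy key" described earlier uses the same idea, but pads the
// prefix with zeros and appends a per-table counter instead of a block offset.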
- if (block_entry->value == nullptr && !no_io && ro.fill_cache) { - Statistics* statistics = rep->ioptions.statistics; - bool do_decompress = - block_cache_compressed == nullptr && rep->blocks_maybe_compressed; - CompressionType raw_block_comp_type; - BlockContents raw_block_contents; - { - StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS); - BlockFetcher block_fetcher( - rep->file.get(), prefetch_buffer, rep->footer, ro, handle, - &raw_block_contents, rep->ioptions, - do_decompress /* do uncompress */, rep->blocks_maybe_compressed, - uncompression_dict, rep->persistent_cache_options, - GetMemoryAllocator(rep->table_options), - GetMemoryAllocatorForCompressedBlock(rep->table_options)); - s = block_fetcher.ReadBlockContents(); - raw_block_comp_type = block_fetcher.get_compression_type(); - } - - if (s.ok()) { - SequenceNumber seq_no = rep->get_global_seqno(is_index); - // If filling cache is allowed and a cache is configured, try to put the - // block to the cache. - s = PutDataBlockToCache( - key, ckey, block_cache, block_cache_compressed, ro, rep->ioptions, - block_entry, &raw_block_contents, raw_block_comp_type, - rep->table_options.format_version, uncompression_dict, seq_no, - rep->table_options.read_amp_bytes_per_bit, - GetMemoryAllocator(rep->table_options), is_index, - is_index && rep->table_options - .cache_index_and_filter_blocks_with_high_priority - ? Cache::Priority::HIGH - : Cache::Priority::LOW, - get_context); - } - } - } - assert(s.ok() || block_entry->value == nullptr); - return s; -} - -BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( - BlockBasedTable* table, - std::unordered_map>* block_map, - bool index_key_includes_seq, bool index_key_is_full) - : table_(table), - block_map_(block_map), - index_key_includes_seq_(index_key_includes_seq), - index_key_is_full_(index_key_is_full) {} - -InternalIteratorBase* -BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( - const BlockHandle& handle) { - // Return a block iterator on the index partition - auto rep = table_->get_rep(); - auto block = block_map_->find(handle.offset()); - // This is a possible scenario since block cache might not have had space - // for the partition - if (block != block_map_->end()) { - PERF_COUNTER_ADD(block_cache_hit_count, 1); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_INDEX_HIT); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_HIT); - Cache* block_cache = rep->table_options.block_cache.get(); - assert(block_cache); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(block->second.cache_handle)); - Statistics* kNullStats = nullptr; - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - return block->second.value->NewIterator( - &rep->internal_comparator, rep->internal_comparator.user_comparator(), - nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_); - } - // Create an empty iterator - return new IndexBlockIter(); -} - -// This will be broken if the user specifies an unusual implementation -// of Options.comparator, or if the user specifies an unusual -// definition of prefixes in BlockBasedTableOptions.filter_policy. -// In particular, we require the following three properties: -// -// 1) key.starts_with(prefix(key)) -// 2) Compare(prefix(key), key) <= 0. -// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0 -// -// Otherwise, this method guarantees no I/O will be incurred. 
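// As a concrete example, all three properties hold for a fixed- or capped-
// length prefix extractor combined with the default bytewise comparator: a
// prefix is, byte for byte, the beginning of its key, a prefix never compares
// greater than the full key, and truncating two keys to their first k bytes
// preserves their bytewise ordering. A minimal setup that satisfies the
// contract (illustrative only):
//
//   Options options;
//   options.comparator = BytewiseComparator();  // the default
//   options.prefix_extractor.reset(NewCappedPrefixTransform(4));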
-// -// REQUIRES: this method shouldn't be called while the DB lock is held. -bool BlockBasedTable::PrefixMayMatch( - const Slice& internal_key, const ReadOptions& read_options, - const SliceTransform* options_prefix_extractor, - const bool need_upper_bound_check) { - if (!rep_->filter_policy) { - return true; - } - - const SliceTransform* prefix_extractor; - - if (rep_->table_prefix_extractor == nullptr) { - if (need_upper_bound_check) { - return true; - } - prefix_extractor = options_prefix_extractor; - } else { - prefix_extractor = rep_->table_prefix_extractor.get(); - } - auto user_key = ExtractUserKey(internal_key); - if (!prefix_extractor->InDomain(user_key)) { - return true; - } - - bool may_match = true; - Status s; - - // First, try check with full filter - auto filter_entry = GetFilter(prefix_extractor); - FilterBlockReader* filter = filter_entry.value; - bool filter_checked = true; - if (filter != nullptr) { - if (!filter->IsBlockBased()) { - const Slice* const const_ikey_ptr = &internal_key; - may_match = filter->RangeMayExist( - read_options.iterate_upper_bound, user_key, prefix_extractor, - rep_->internal_comparator.user_comparator(), const_ikey_ptr, - &filter_checked, need_upper_bound_check); - } else { - // if prefix_extractor changed for block based filter, skip filter - if (need_upper_bound_check) { - if (!rep_->filter_entry.IsSet()) { - filter_entry.Release(rep_->table_options.block_cache.get()); - } - return true; - } - auto prefix = prefix_extractor->Transform(user_key); - InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue); - auto internal_prefix = internal_key_prefix.Encode(); - - // To prevent any io operation in this method, we set `read_tier` to make - // sure we always read index or filter only when they have already been - // loaded to memory. - ReadOptions no_io_read_options; - no_io_read_options.read_tier = kBlockCacheTier; - - // Then, try find it within each block - // we already know prefix_extractor and prefix_extractor_name must match - // because `CheckPrefixMayMatch` first checks `check_filter_ == true` - std::unique_ptr> iiter( - NewIndexIterator(no_io_read_options, - /* need_upper_bound_check */ false)); - iiter->Seek(internal_prefix); - - if (!iiter->Valid()) { - // we're past end of file - // if it's incomplete, it means that we avoided I/O - // and we're not really sure that we're past the end - // of the file - may_match = iiter->status().IsIncomplete(); - } else if ((rep_->table_properties && - rep_->table_properties->index_key_is_user_key - ? iiter->key() - : ExtractUserKey(iiter->key())) - .starts_with(ExtractUserKey(internal_prefix))) { - // we need to check for this subtle case because our only - // guarantee is that "the key is a string >= last key in that data - // block" according to the doc/table_format.txt spec. - // - // Suppose iiter->key() starts with the desired prefix; it is not - // necessarily the case that the corresponding data block will - // contain the prefix, since iiter->key() need not be in the - // block. However, the next data block may contain the prefix, so - // we return true to play it safe. - may_match = true; - } else if (filter->IsBlockBased()) { - // iiter->key() does NOT start with the desired prefix. Because - // Seek() finds the first key that is >= the seek target, this - // means that iiter->key() > prefix. Thus, any data blocks coming - // after the data block corresponding to iiter->key() cannot - // possibly contain the key. 
Thus, the corresponding data block - // is the only on could potentially contain the prefix. - BlockHandle handle = iiter->value(); - may_match = - filter->PrefixMayMatch(prefix, prefix_extractor, handle.offset()); - } - } - } - - if (filter_checked) { - Statistics* statistics = rep_->ioptions.statistics; - RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); - if (!may_match) { - RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); - } - } - - // if rep_->filter_entry is not set, we should call Release(); otherwise - // don't call, in this case we have a local copy in rep_->filter_entry, - // it's pinned to the cache and will be released in the destructor - if (!rep_->filter_entry.IsSet()) { - filter_entry.Release(rep_->table_options.block_cache.get()); - } - return may_match; -} - -template -void BlockBasedTableIterator::Seek(const Slice& target) { - is_out_of_bound_ = false; - if (!CheckPrefixMayMatch(target)) { - ResetDataIter(); - return; - } - - SavePrevIndexValue(); - - index_iter_->Seek(target); - - if (!index_iter_->Valid()) { - ResetDataIter(); - return; - } - - InitDataBlock(); - - block_iter_.Seek(target); - - FindKeyForward(); - CheckOutOfBound(); - assert( - !block_iter_.Valid() || - (key_includes_seq_ && icomp_.Compare(target, block_iter_.key()) <= 0) || - (!key_includes_seq_ && user_comparator_.Compare(ExtractUserKey(target), - block_iter_.key()) <= 0)); -} - -template -void BlockBasedTableIterator::SeekForPrev( - const Slice& target) { - is_out_of_bound_ = false; - if (!CheckPrefixMayMatch(target)) { - ResetDataIter(); - return; - } - - SavePrevIndexValue(); - - // Call Seek() rather than SeekForPrev() in the index block, because the - // target data block will likely to contain the position for `target`, the - // same as Seek(), rather than than before. - // For example, if we have three data blocks, each containing two keys: - // [2, 4] [6, 8] [10, 12] - // (the keys in the index block would be [4, 8, 12]) - // and the user calls SeekForPrev(7), we need to go to the second block, - // just like if they call Seek(7). - // The only case where the block is difference is when they seek to a position - // in the boundary. For example, if they SeekForPrev(5), we should go to the - // first block, rather than the second. However, we don't have the information - // to distinguish the two unless we read the second block. In this case, we'll - // end up with reading two blocks. 
- index_iter_->Seek(target); - - if (!index_iter_->Valid()) { - index_iter_->SeekToLast(); - if (!index_iter_->Valid()) { - ResetDataIter(); - block_iter_points_to_real_block_ = false; - return; - } - } - - InitDataBlock(); - - block_iter_.SeekForPrev(target); - - FindKeyBackward(); - assert(!block_iter_.Valid() || - icomp_.Compare(target, block_iter_.key()) >= 0); -} - -template -void BlockBasedTableIterator::SeekToFirst() { - is_out_of_bound_ = false; - SavePrevIndexValue(); - index_iter_->SeekToFirst(); - if (!index_iter_->Valid()) { - ResetDataIter(); - return; - } - InitDataBlock(); - block_iter_.SeekToFirst(); - FindKeyForward(); - CheckOutOfBound(); -} - -template -void BlockBasedTableIterator::SeekToLast() { - is_out_of_bound_ = false; - SavePrevIndexValue(); - index_iter_->SeekToLast(); - if (!index_iter_->Valid()) { - ResetDataIter(); - return; - } - InitDataBlock(); - block_iter_.SeekToLast(); - FindKeyBackward(); -} - -template -void BlockBasedTableIterator::Next() { - assert(block_iter_points_to_real_block_); - block_iter_.Next(); - FindKeyForward(); -} - -template -bool BlockBasedTableIterator::NextAndGetResult( - Slice* ret_key) { - Next(); - bool is_valid = Valid(); - if (is_valid) { - *ret_key = key(); - } - return is_valid; -} - -template -void BlockBasedTableIterator::Prev() { - assert(block_iter_points_to_real_block_); - block_iter_.Prev(); - FindKeyBackward(); -} - -// Found that 256 KB readahead size provides the best performance, based on -// experiments, for auto readahead. Experiment data is in PR #3282. -template -const size_t - BlockBasedTableIterator::kMaxAutoReadaheadSize = - 256 * 1024; - -template -void BlockBasedTableIterator::InitDataBlock() { - BlockHandle data_block_handle = index_iter_->value(); - if (!block_iter_points_to_real_block_ || - data_block_handle.offset() != prev_index_value_.offset() || - // if previous attempt of reading the block missed cache, try again - block_iter_.status().IsIncomplete()) { - if (block_iter_points_to_real_block_) { - ResetDataIter(); - } - auto* rep = table_->get_rep(); - - // Prefetch additional data for range scans (iterators). Enabled only for - // user reads. - // Implicit auto readahead: - // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. - // Explicit user requested readahead: - // Enabled from the very first IO when ReadOptions.readahead_size is set. - if (!for_compaction_) { - if (read_options_.readahead_size == 0) { - // Implicit auto readahead - num_file_reads_++; - if (num_file_reads_ > kMinNumFileReadsToStartAutoReadahead) { - if (!rep->file->use_direct_io() && - (data_block_handle.offset() + - static_cast(data_block_handle.size()) + - kBlockTrailerSize > - readahead_limit_)) { - // Buffered I/O - // Discarding the return status of Prefetch calls intentionally, as - // we can fallback to reading from disk if Prefetch fails. - rep->file->Prefetch(data_block_handle.offset(), readahead_size_); - readahead_limit_ = static_cast(data_block_handle.offset() + - readahead_size_); - // Keep exponentially increasing readahead size until - // kMaxAutoReadaheadSize. - readahead_size_ = - std::min(kMaxAutoReadaheadSize, readahead_size_ * 2); - } else if (rep->file->use_direct_io() && !prefetch_buffer_) { - // Direct I/O - // Let FilePrefetchBuffer take care of the readahead. 
- prefetch_buffer_.reset( - new FilePrefetchBuffer(rep->file.get(), kInitAutoReadaheadSize, - kMaxAutoReadaheadSize)); - } - } - } else if (!prefetch_buffer_) { - // Explicit user requested readahead - // The actual condition is: - // if (read_options_.readahead_size != 0 && !prefetch_buffer_) - prefetch_buffer_.reset(new FilePrefetchBuffer( - rep->file.get(), read_options_.readahead_size, - read_options_.readahead_size)); - } - } - - Status s; - BlockBasedTable::NewDataBlockIterator( - rep, read_options_, data_block_handle, &block_iter_, is_index_, - key_includes_seq_, index_key_is_full_, - /* get_context */ nullptr, s, prefetch_buffer_.get()); - block_iter_points_to_real_block_ = true; - } -} - -template -void BlockBasedTableIterator::FindBlockForward() { - // TODO the while loop inherits from two-level-iterator. We don't know - // whether a block can be empty so it can be replaced by an "if". - do { - if (!block_iter_.status().ok()) { - return; - } - // Whether next data block is out of upper bound, if there is one. - bool next_block_is_out_of_bound = false; - if (read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_) { - next_block_is_out_of_bound = - (user_comparator_.Compare(*read_options_.iterate_upper_bound, - index_iter_->user_key()) <= 0); - } - ResetDataIter(); - index_iter_->Next(); - if (next_block_is_out_of_bound) { - // The next block is out of bound. No need to read it. - TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr); - // We need to make sure this is not the last data block before setting - // is_out_of_bound_, since the index key for the last data block can be - // larger than smallest key of the next file on the same level. - if (index_iter_->Valid()) { - is_out_of_bound_ = true; - } - return; - } - - if (index_iter_->Valid()) { - InitDataBlock(); - block_iter_.SeekToFirst(); - } else { - return; - } - } while (!block_iter_.Valid()); -} - -template -void BlockBasedTableIterator::FindKeyForward() { - assert(!is_out_of_bound_); - - if (!block_iter_.Valid()) { - FindBlockForward(); - } -} - -template -void BlockBasedTableIterator::FindKeyBackward() { - while (!block_iter_.Valid()) { - if (!block_iter_.status().ok()) { - return; - } - - ResetDataIter(); - index_iter_->Prev(); - - if (index_iter_->Valid()) { - InitDataBlock(); - block_iter_.SeekToLast(); - } else { - return; - } - } - - // We could have check lower bound here too, but we opt not to do it for - // code simplicity. 
-} - -template -void BlockBasedTableIterator::CheckOutOfBound() { - if (read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_ && block_iter_.Valid()) { - is_out_of_bound_ = user_comparator_.Compare( - *read_options_.iterate_upper_bound, user_key()) <= 0; - } -} - -InternalIterator* BlockBasedTable::NewIterator( - const ReadOptions& read_options, const SliceTransform* prefix_extractor, - Arena* arena, bool skip_filters, bool for_compaction) { - bool need_upper_bound_check = - PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); - const bool kIsNotIndex = false; - if (arena == nullptr) { - return new BlockBasedTableIterator( - this, read_options, rep_->internal_comparator, - NewIndexIterator( - read_options, - need_upper_bound_check && - rep_->index_type == BlockBasedTableOptions::kHashSearch), - !skip_filters && !read_options.total_order_seek && - prefix_extractor != nullptr, - need_upper_bound_check, prefix_extractor, kIsNotIndex, - true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction); - } else { - auto* mem = - arena->AllocateAligned(sizeof(BlockBasedTableIterator)); - return new (mem) BlockBasedTableIterator( - this, read_options, rep_->internal_comparator, - NewIndexIterator(read_options, need_upper_bound_check), - !skip_filters && !read_options.total_order_seek && - prefix_extractor != nullptr, - need_upper_bound_check, prefix_extractor, kIsNotIndex, - true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction); - } -} - -FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( - const ReadOptions& read_options) { - if (rep_->fragmented_range_dels == nullptr) { - return nullptr; - } - SequenceNumber snapshot = kMaxSequenceNumber; - if (read_options.snapshot != nullptr) { - snapshot = read_options.snapshot->GetSequenceNumber(); - } - return new FragmentedRangeTombstoneIterator( - rep_->fragmented_range_dels, rep_->internal_comparator, snapshot); -} - -bool BlockBasedTable::FullFilterKeyMayMatch( - const ReadOptions& read_options, FilterBlockReader* filter, - const Slice& internal_key, const bool no_io, - const SliceTransform* prefix_extractor) const { - if (filter == nullptr || filter->IsBlockBased()) { - return true; - } - Slice user_key = ExtractUserKey(internal_key); - const Slice* const const_ikey_ptr = &internal_key; - bool may_match = true; - if (filter->whole_key_filtering()) { - may_match = filter->KeyMayMatch(user_key, prefix_extractor, kNotValid, - no_io, const_ikey_ptr); - } else if (!read_options.total_order_seek && prefix_extractor && - rep_->table_properties->prefix_extractor_name.compare( - prefix_extractor->Name()) == 0 && - prefix_extractor->InDomain(user_key) && - !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), - prefix_extractor, kNotValid, false, - const_ikey_ptr)) { - may_match = false; - } - if (may_match) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE); - PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); - } - return may_match; -} - -void BlockBasedTable::FullFilterKeysMayMatch( - const ReadOptions& read_options, FilterBlockReader* filter, - MultiGetRange* range, const bool no_io, - const SliceTransform* prefix_extractor) const { - if (filter == nullptr || filter->IsBlockBased()) { - return; - } - if (filter->whole_key_filtering()) { - filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io); - } else if (!read_options.total_order_seek && prefix_extractor && - 
rep_->table_properties->prefix_extractor_name.compare( - prefix_extractor->Name()) == 0) { - for (auto iter = range->begin(); iter != range->end(); ++iter) { - Slice user_key = iter->lkey->user_key(); - - if (!prefix_extractor->InDomain(user_key)) { - range->SkipKey(iter); - } - } - filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false); - } -} - -Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, - GetContext* get_context, - const SliceTransform* prefix_extractor, - bool skip_filters) { - assert(key.size() >= 8); // key must be internal key - Status s; - const bool no_io = read_options.read_tier == kBlockCacheTier; - CachableEntry filter_entry; - bool may_match; - FilterBlockReader* filter = nullptr; - { - if (!skip_filters) { - filter_entry = - GetFilter(prefix_extractor, /*prefetch_buffer*/ nullptr, - read_options.read_tier == kBlockCacheTier, get_context); - } - filter = filter_entry.value; - - // First check the full filter - // If full filter not useful, Then go into each block - may_match = FullFilterKeyMayMatch(read_options, filter, key, no_io, - prefix_extractor); - } - if (!may_match) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); - PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); - } else { - IndexBlockIter iiter_on_stack; - // if prefix_extractor found in block differs from options, disable - // BlockPrefixIndex. Only do this check when index_type is kHashSearch. - bool need_upper_bound_check = false; - if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { - need_upper_bound_check = PrefixExtractorChanged( - rep_->table_properties.get(), prefix_extractor); - } - auto iiter = - NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, - /* index_entry */ nullptr, get_context); - std::unique_ptr> iiter_unique_ptr; - if (iiter != &iiter_on_stack) { - iiter_unique_ptr.reset(iiter); - } - - bool matched = false; // if such user key mathced a key in SST - bool done = false; - for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { - BlockHandle handle = iiter->value(); - - bool not_exist_in_filter = - filter != nullptr && filter->IsBlockBased() == true && - !filter->KeyMayMatch(ExtractUserKey(key), prefix_extractor, - handle.offset(), no_io); - - if (not_exist_in_filter) { - // Not found - // TODO: think about interaction with Merge. If a user key cannot - // cross one data block, we should be fine. - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); - PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); - break; - } else { - DataBlockIter biter; - NewDataBlockIterator( - rep_, read_options, iiter->value(), &biter, false, - true /* key_includes_seq */, true /* index_key_is_full */, - get_context); - - if (read_options.read_tier == kBlockCacheTier && - biter.status().IsIncomplete()) { - // couldn't get block from block_cache - // Update Saver.state to Found because we are only looking for - // whether we can guarantee the key is not there when "no_io" is set - get_context->MarkKeyMayExist(); - break; - } - if (!biter.status().ok()) { - s = biter.status(); - break; - } - - bool may_exist = biter.SeekForGet(key); - if (!may_exist) { - // HashSeek cannot find the key this block and the the iter is not - // the end of the block, i.e. cannot be in the following blocks - // either. In this case, the seek_key cannot be found, so we break - // from the top level for-loop. 
- break; - } - - // Call the *saver function on each entry/block until it returns false - for (; biter.Valid(); biter.Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); - } - - if (!get_context->SaveValue( - parsed_key, biter.value(), &matched, - biter.IsValuePinned() ? &biter : nullptr)) { - done = true; - break; - } - } - s = biter.status(); - } - if (done) { - // Avoid the extra Next which is expensive in two-level indexes - break; - } - } - if (matched && filter != nullptr && !filter->IsBlockBased()) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); - PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, - rep_->level); - } - if (s.ok()) { - s = iiter->status(); - } - } - - // if rep_->filter_entry is not set, we should call Release(); otherwise - // don't call, in this case we have a local copy in rep_->filter_entry, - // it's pinned to the cache and will be released in the destructor - if (!rep_->filter_entry.IsSet()) { - filter_entry.Release(rep_->table_options.block_cache.get()); - } - return s; -} - -using MultiGetRange = MultiGetContext::Range; -void BlockBasedTable::MultiGet(const ReadOptions& read_options, - const MultiGetRange* mget_range, - const SliceTransform* prefix_extractor, - bool skip_filters) { - const bool no_io = read_options.read_tier == kBlockCacheTier; - CachableEntry filter_entry; - FilterBlockReader* filter = nullptr; - MultiGetRange sst_file_range(*mget_range, mget_range->begin(), - mget_range->end()); - { - if (!skip_filters) { - // TODO: Figure out where the stats should go - filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer*/ nullptr, - read_options.read_tier == kBlockCacheTier, - nullptr /*get_context*/); - } - filter = filter_entry.value; - - // First check the full filter - // If full filter not useful, Then go into each block - FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, - prefix_extractor); - } - if (skip_filters || !sst_file_range.empty()) { - IndexBlockIter iiter_on_stack; - // if prefix_extractor found in block differs from options, disable - // BlockPrefixIndex. Only do this check when index_type is kHashSearch. 
- bool need_upper_bound_check = false; - if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { - need_upper_bound_check = PrefixExtractorChanged( - rep_->table_properties.get(), prefix_extractor); - } - auto iiter = NewIndexIterator( - read_options, need_upper_bound_check, &iiter_on_stack, - /* index_entry */ nullptr, sst_file_range.begin()->get_context); - std::unique_ptr> iiter_unique_ptr; - if (iiter != &iiter_on_stack) { - iiter_unique_ptr.reset(iiter); - } - - for (auto miter = sst_file_range.begin(); miter != sst_file_range.end(); - ++miter) { - Status s; - GetContext* get_context = miter->get_context; - const Slice& key = miter->ikey; - bool matched = false; // if such user key matched a key in SST - bool done = false; - for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { - DataBlockIter biter; - NewDataBlockIterator( - rep_, read_options, iiter->value(), &biter, false, - true /* key_includes_seq */, get_context); - - if (read_options.read_tier == kBlockCacheTier && - biter.status().IsIncomplete()) { - // couldn't get block from block_cache - // Update Saver.state to Found because we are only looking for - // whether we can guarantee the key is not there when "no_io" is set - get_context->MarkKeyMayExist(); - break; - } - if (!biter.status().ok()) { - s = biter.status(); - break; - } - - bool may_exist = biter.SeekForGet(key); - if (!may_exist) { - // HashSeek cannot find the key this block and the the iter is not - // the end of the block, i.e. cannot be in the following blocks - // either. In this case, the seek_key cannot be found, so we break - // from the top level for-loop. - break; - } - - // Call the *saver function on each entry/block until it returns false - for (; biter.Valid(); biter.Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); - } - - if (!get_context->SaveValue( - parsed_key, biter.value(), &matched, - biter.IsValuePinned() ? 
&biter : nullptr)) { - done = true; - break; - } - } - s = biter.status(); - if (done) { - // Avoid the extra Next which is expensive in two-level indexes - break; - } - } - if (matched && filter != nullptr && !filter->IsBlockBased()) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); - PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, - rep_->level); - } - if (s.ok()) { - s = iiter->status(); - } - *(miter->s) = s; - } - } - - // if rep_->filter_entry is not set, we should call Release(); otherwise - // don't call, in this case we have a local copy in rep_->filter_entry, - // it's pinned to the cache and will be released in the destructor - if (!rep_->filter_entry.IsSet()) { - filter_entry.Release(rep_->table_options.block_cache.get()); - } -} - -Status BlockBasedTable::Prefetch(const Slice* const begin, - const Slice* const end) { - auto& comparator = rep_->internal_comparator; - auto user_comparator = comparator.user_comparator(); - // pre-condition - if (begin && end && comparator.Compare(*begin, *end) > 0) { - return Status::InvalidArgument(*begin, *end); - } - - IndexBlockIter iiter_on_stack; - auto iiter = NewIndexIterator(ReadOptions(), false, &iiter_on_stack); - std::unique_ptr> iiter_unique_ptr; - if (iiter != &iiter_on_stack) { - iiter_unique_ptr = - std::unique_ptr>(iiter); - } - - if (!iiter->status().ok()) { - // error opening index iterator - return iiter->status(); - } - - // indicates if we are on the last page that need to be pre-fetched - bool prefetching_boundary_page = false; - - for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); - iiter->Next()) { - BlockHandle block_handle = iiter->value(); - const bool is_user_key = rep_->table_properties && - rep_->table_properties->index_key_is_user_key > 0; - if (end && - ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) || - (is_user_key && - user_comparator->Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) { - if (prefetching_boundary_page) { - break; - } - - // The index entry represents the last key in the data block. 
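// (For example, with data blocks [k1..k3] and [k4..k6] and index entries "k3"
//  and "k6", a Prefetch(begin, end) call whose *end is "k5" stops at the index
//  entry "k6" >= "k5"; that block may still hold keys such as "k4" that
//  precede "k5", which is exactly the boundary page handled below.)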
- // We should load this page into memory as well, but no more - prefetching_boundary_page = true; - } - - // Load the block specified by the block_handle into the block cache - DataBlockIter biter; - NewDataBlockIterator(rep_, ReadOptions(), block_handle, - &biter); - - if (!biter.status().ok()) { - // there was an unexpected error while pre-fetching - return biter.status(); - } - } - - return Status::OK(); -} - -Status BlockBasedTable::VerifyChecksum() { - Status s; - // Check Meta blocks - std::unique_ptr meta; - std::unique_ptr meta_iter; - s = ReadMetaBlock(rep_, nullptr /* prefetch buffer */, &meta, &meta_iter); - if (s.ok()) { - s = VerifyChecksumInMetaBlocks(meta_iter.get()); - if (!s.ok()) { - return s; - } - } else { - return s; - } - // Check Data blocks - IndexBlockIter iiter_on_stack; - InternalIteratorBase* iiter = - NewIndexIterator(ReadOptions(), false, &iiter_on_stack); - std::unique_ptr> iiter_unique_ptr; - if (iiter != &iiter_on_stack) { - iiter_unique_ptr = - std::unique_ptr>(iiter); - } - if (!iiter->status().ok()) { - // error opening index iterator - return iiter->status(); - } - s = VerifyChecksumInBlocks(iiter); - return s; -} - -Status BlockBasedTable::VerifyChecksumInBlocks( - InternalIteratorBase* index_iter) { - Status s; - for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { - s = index_iter->status(); - if (!s.ok()) { - break; - } - BlockHandle handle = index_iter->value(); - BlockContents contents; - BlockFetcher block_fetcher( - rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, - ReadOptions(), handle, &contents, rep_->ioptions, - false /* decompress */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); - s = block_fetcher.ReadBlockContents(); - if (!s.ok()) { - break; - } - } - return s; -} - -Status BlockBasedTable::VerifyChecksumInMetaBlocks( - InternalIteratorBase* index_iter) { - Status s; - for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { - s = index_iter->status(); - if (!s.ok()) { - break; - } - BlockHandle handle; - Slice input = index_iter->value(); - s = handle.DecodeFrom(&input); - BlockContents contents; - BlockFetcher block_fetcher( - rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, - ReadOptions(), handle, &contents, rep_->ioptions, - false /* decompress */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); - s = block_fetcher.ReadBlockContents(); - if (s.IsCorruption() && index_iter->key() == kPropertiesBlock) { - TableProperties* table_properties; - s = TryReadPropertiesWithGlobalSeqno(rep_, nullptr /* prefetch_buffer */, - index_iter->value(), - &table_properties); - delete table_properties; - } - if (!s.ok()) { - break; - } - } - return s; -} - -bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, - const Slice& key) { - std::unique_ptr> iiter( - NewIndexIterator(options)); - iiter->Seek(key); - assert(iiter->Valid()); - CachableEntry block; - - BlockHandle handle = iiter->value(); - Cache* block_cache = rep_->table_options.block_cache.get(); - assert(block_cache != nullptr); - - char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - Slice cache_key = - GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle, - cache_key_storage); - Slice ckey; - - Status s; - if (!rep_->compression_dict_handle.IsNull()) { - std::unique_ptr compression_dict_block; - s = ReadCompressionDictBlock(rep_, nullptr /* prefetch_buffer */, - 
&compression_dict_block); - if (s.ok()) { - assert(compression_dict_block != nullptr); - UncompressionDict uncompression_dict( - compression_dict_block->data.ToString(), - rep_->blocks_definitely_zstd_compressed); - s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, rep_, - options, &block, uncompression_dict, - 0 /* read_amp_bytes_per_bit */); - } - } else { - s = GetDataBlockFromCache( - cache_key, ckey, block_cache, nullptr, rep_, options, &block, - UncompressionDict::GetEmptyDict(), 0 /* read_amp_bytes_per_bit */); - } - assert(s.ok()); - bool in_cache = block.value != nullptr; - if (in_cache) { - ReleaseCachedEntry(block_cache, block.cache_handle); - } - return in_cache; -} - -BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() { - // Some old version of block-based tables don't have index type present in - // table properties. If that's the case we can safely use the kBinarySearch. - BlockBasedTableOptions::IndexType index_type_on_file = - BlockBasedTableOptions::kBinarySearch; - if (rep_->table_properties) { - auto& props = rep_->table_properties->user_collected_properties; - auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); - if (pos != props.end()) { - index_type_on_file = static_cast( - DecodeFixed32(pos->second.c_str())); - // update index_type with the true type - rep_->index_type = index_type_on_file; - } - } - return index_type_on_file; -} - -// REQUIRES: The following fields of rep_ should have already been populated: -// 1. file -// 2. index_handle, -// 3. options -// 4. internal_comparator -// 5. index_type -Status BlockBasedTable::CreateIndexReader( - FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader, - InternalIterator* preloaded_meta_index_iter, int level) { - auto index_type_on_file = UpdateIndexType(); - - auto file = rep_->file.get(); - const InternalKeyComparator* icomparator = &rep_->internal_comparator; - const Footer& footer = rep_->footer; - - // kHashSearch requires non-empty prefix_extractor but bypass checking - // prefix_extractor here since we have no access to MutableCFOptions. - // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator. 
- // If prefix_extractor does not match prefix_extractor_name from table - // properties, turn off Hash Index by setting total_order_seek to true - - switch (index_type_on_file) { - case BlockBasedTableOptions::kTwoLevelIndexSearch: { - return PartitionIndexReader::Create( - this, file, prefetch_buffer, footer, footer.index_handle(), - rep_->ioptions, icomparator, index_reader, - rep_->persistent_cache_options, level, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0, - GetMemoryAllocator(rep_->table_options)); - } - case BlockBasedTableOptions::kBinarySearch: { - return BinarySearchIndexReader::Create( - file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions, - icomparator, index_reader, rep_->persistent_cache_options, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0, - GetMemoryAllocator(rep_->table_options)); - } - case BlockBasedTableOptions::kHashSearch: { - std::unique_ptr meta_guard; - std::unique_ptr meta_iter_guard; - auto meta_index_iter = preloaded_meta_index_iter; - if (meta_index_iter == nullptr) { - auto s = - ReadMetaBlock(rep_, prefetch_buffer, &meta_guard, &meta_iter_guard); - if (!s.ok()) { - // we simply fall back to binary search in case there is any - // problem with prefix hash index loading. - ROCKS_LOG_WARN(rep_->ioptions.info_log, - "Unable to read the metaindex block." - " Fall back to binary search index."); - return BinarySearchIndexReader::Create( - file, prefetch_buffer, footer, footer.index_handle(), - rep_->ioptions, icomparator, index_reader, - rep_->persistent_cache_options, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0, - GetMemoryAllocator(rep_->table_options)); - } - meta_index_iter = meta_iter_guard.get(); - } - - return HashIndexReader::Create( - rep_->internal_prefix_transform.get(), footer, file, prefetch_buffer, - rep_->ioptions, icomparator, footer.index_handle(), meta_index_iter, - index_reader, rep_->hash_index_allow_collision, - rep_->persistent_cache_options, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0, - GetMemoryAllocator(rep_->table_options)); - } - default: { - std::string error_message = - "Unrecognized index type: " + ToString(index_type_on_file); - return Status::InvalidArgument(error_message.c_str()); - } - } -} - -uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) { - std::unique_ptr> index_iter( - NewIndexIterator(ReadOptions())); - - index_iter->Seek(key); - uint64_t result; - if (index_iter->Valid()) { - BlockHandle handle = index_iter->value(); - result = handle.offset(); - } else { - // key is past the last key in the file. If table_properties is not - // available, approximate the offset by returning the offset of the - // metaindex block (which is right near the end of the file). - result = 0; - if (rep_->table_properties) { - result = rep_->table_properties->data_size; - } - // table_properties is not present in the table. 
- if (result == 0) { - result = rep_->footer.metaindex_handle().offset(); - } - } - return result; -} - -bool BlockBasedTable::TEST_filter_block_preloaded() const { - return rep_->filter != nullptr; -} - -bool BlockBasedTable::TEST_index_reader_preloaded() const { - return rep_->index_reader != nullptr; -} - -Status BlockBasedTable::GetKVPairsFromDataBlocks( - std::vector* kv_pair_blocks) { - std::unique_ptr> blockhandles_iter( - NewIndexIterator(ReadOptions())); - - Status s = blockhandles_iter->status(); - if (!s.ok()) { - // Cannot read Index Block - return s; - } - - for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); - blockhandles_iter->Next()) { - s = blockhandles_iter->status(); - - if (!s.ok()) { - break; - } - - std::unique_ptr datablock_iter; - datablock_iter.reset(NewDataBlockIterator( - rep_, ReadOptions(), blockhandles_iter->value())); - s = datablock_iter->status(); - - if (!s.ok()) { - // Error reading the block - Skipped - continue; - } - - KVPairBlock kv_pair_block; - for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); - datablock_iter->Next()) { - s = datablock_iter->status(); - if (!s.ok()) { - // Error reading the block - Skipped - break; - } - const Slice& key = datablock_iter->key(); - const Slice& value = datablock_iter->value(); - std::string key_copy = std::string(key.data(), key.size()); - std::string value_copy = std::string(value.data(), value.size()); - - kv_pair_block.push_back( - std::make_pair(std::move(key_copy), std::move(value_copy))); - } - kv_pair_blocks->push_back(std::move(kv_pair_block)); - } - return Status::OK(); -} - -Status BlockBasedTable::DumpTable(WritableFile* out_file, - const SliceTransform* prefix_extractor) { - // Output Footer - out_file->Append( - "Footer Details:\n" - "--------------------------------------\n" - " "); - out_file->Append(rep_->footer.ToString().c_str()); - out_file->Append("\n"); - - // Output MetaIndex - out_file->Append( - "Metaindex Details:\n" - "--------------------------------------\n"); - std::unique_ptr meta; - std::unique_ptr meta_iter; - Status s = - ReadMetaBlock(rep_, nullptr /* prefetch_buffer */, &meta, &meta_iter); - if (s.ok()) { - for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) { - s = meta_iter->status(); - if (!s.ok()) { - return s; - } - if (meta_iter->key() == rocksdb::kPropertiesBlock) { - out_file->Append(" Properties block handle: "); - out_file->Append(meta_iter->value().ToString(true).c_str()); - out_file->Append("\n"); - } else if (meta_iter->key() == rocksdb::kCompressionDictBlock) { - out_file->Append(" Compression dictionary block handle: "); - out_file->Append(meta_iter->value().ToString(true).c_str()); - out_file->Append("\n"); - } else if (strstr(meta_iter->key().ToString().c_str(), - "filter.rocksdb.") != nullptr) { - out_file->Append(" Filter block handle: "); - out_file->Append(meta_iter->value().ToString(true).c_str()); - out_file->Append("\n"); - } else if (meta_iter->key() == rocksdb::kRangeDelBlock) { - out_file->Append(" Range deletion block handle: "); - out_file->Append(meta_iter->value().ToString(true).c_str()); - out_file->Append("\n"); - } - } - out_file->Append("\n"); - } else { - return s; - } - - // Output TableProperties - const rocksdb::TableProperties* table_properties; - table_properties = rep_->table_properties.get(); - - if (table_properties != nullptr) { - out_file->Append( - "Table Properties:\n" - "--------------------------------------\n" - " "); - out_file->Append(table_properties->ToString("\n ", ": ").c_str()); - 
out_file->Append("\n"); - - // Output Filter blocks - if (!rep_->filter && !table_properties->filter_policy_name.empty()) { - // Support only BloomFilter as off now - rocksdb::BlockBasedTableOptions table_options; - table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1)); - if (table_properties->filter_policy_name.compare( - table_options.filter_policy->Name()) == 0) { - std::string filter_block_key = kFilterBlockPrefix; - filter_block_key.append(table_properties->filter_policy_name); - BlockHandle handle; - if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) { - BlockContents block; - BlockFetcher block_fetcher( - rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer, - ReadOptions(), handle, &block, rep_->ioptions, - false /*decompress*/, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), - rep_->persistent_cache_options); - s = block_fetcher.ReadBlockContents(); - if (!s.ok()) { - rep_->filter.reset(new BlockBasedFilterBlockReader( - prefix_extractor, table_options, - table_options.whole_key_filtering, std::move(block), - rep_->ioptions.statistics)); - } - } - } - } - } - if (rep_->filter) { - out_file->Append( - "Filter Details:\n" - "--------------------------------------\n" - " "); - out_file->Append(rep_->filter->ToString().c_str()); - out_file->Append("\n"); - } - - // Output Index block - s = DumpIndexBlock(out_file); - if (!s.ok()) { - return s; - } - - // Output compression dictionary - if (!rep_->compression_dict_handle.IsNull()) { - std::unique_ptr compression_dict_block; - s = ReadCompressionDictBlock(rep_, nullptr /* prefetch_buffer */, - &compression_dict_block); - if (!s.ok()) { - return s; - } - assert(compression_dict_block != nullptr); - auto compression_dict = compression_dict_block->data; - out_file->Append( - "Compression Dictionary:\n" - "--------------------------------------\n"); - out_file->Append(" size (bytes): "); - out_file->Append(rocksdb::ToString(compression_dict.size())); - out_file->Append("\n\n"); - out_file->Append(" HEX "); - out_file->Append(compression_dict.ToString(true).c_str()); - out_file->Append("\n\n"); - } - - // Output range deletions block - auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions()); - if (range_del_iter != nullptr) { - range_del_iter->SeekToFirst(); - if (range_del_iter->Valid()) { - out_file->Append( - "Range deletions:\n" - "--------------------------------------\n" - " "); - for (; range_del_iter->Valid(); range_del_iter->Next()) { - DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file); - } - out_file->Append("\n"); - } - delete range_del_iter; - } - // Output Data blocks - s = DumpDataBlocks(out_file); - - return s; -} - -void BlockBasedTable::Close() { - if (rep_->closed) { - return; - } - - Cache* const cache = rep_->table_options.block_cache.get(); - - rep_->filter_entry.Release(cache); - rep_->index_entry.Release(cache); - - // cleanup index, filter, and compression dictionary blocks - // to avoid accessing dangling pointers - if (!rep_->table_options.no_block_cache) { - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - - // Get the filter block key - auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->filter_handle, cache_key); - cache->Erase(key); - - // Get the index block key - key = GetCacheKeyFromOffset(rep_->cache_key_prefix, - rep_->cache_key_prefix_size, - rep_->dummy_index_reader_offset, cache_key); - cache->Erase(key); - - if (!rep_->compression_dict_handle.IsNull()) { - // Get the 
compression dictionary block key - key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->compression_dict_handle, cache_key); - cache->Erase(key); - } - } - - rep_->closed = true; -} - -Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { - out_file->Append( - "Index Details:\n" - "--------------------------------------\n"); - std::unique_ptr> blockhandles_iter( - NewIndexIterator(ReadOptions())); - Status s = blockhandles_iter->status(); - if (!s.ok()) { - out_file->Append("Can not read Index Block \n\n"); - return s; - } - - out_file->Append(" Block key hex dump: Data block handle\n"); - out_file->Append(" Block key ascii\n\n"); - for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); - blockhandles_iter->Next()) { - s = blockhandles_iter->status(); - if (!s.ok()) { - break; - } - Slice key = blockhandles_iter->key(); - Slice user_key; - InternalKey ikey; - if (rep_->table_properties && - rep_->table_properties->index_key_is_user_key != 0) { - user_key = key; - } else { - ikey.DecodeFrom(key); - user_key = ikey.user_key(); - } - - out_file->Append(" HEX "); - out_file->Append(user_key.ToString(true).c_str()); - out_file->Append(": "); - out_file->Append(blockhandles_iter->value().ToString(true).c_str()); - out_file->Append("\n"); - - std::string str_key = user_key.ToString(); - std::string res_key(""); - char cspace = ' '; - for (size_t i = 0; i < str_key.size(); i++) { - res_key.append(&str_key[i], 1); - res_key.append(1, cspace); - } - out_file->Append(" ASCII "); - out_file->Append(res_key.c_str()); - out_file->Append("\n ------\n"); - } - out_file->Append("\n"); - return Status::OK(); -} - -Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { - std::unique_ptr> blockhandles_iter( - NewIndexIterator(ReadOptions())); - Status s = blockhandles_iter->status(); - if (!s.ok()) { - out_file->Append("Can not read Index Block \n\n"); - return s; - } - - uint64_t datablock_size_min = std::numeric_limits::max(); - uint64_t datablock_size_max = 0; - uint64_t datablock_size_sum = 0; - - size_t block_id = 1; - for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); - block_id++, blockhandles_iter->Next()) { - s = blockhandles_iter->status(); - if (!s.ok()) { - break; - } - - BlockHandle bh = blockhandles_iter->value(); - uint64_t datablock_size = bh.size(); - datablock_size_min = std::min(datablock_size_min, datablock_size); - datablock_size_max = std::max(datablock_size_max, datablock_size); - datablock_size_sum += datablock_size; - - out_file->Append("Data Block # "); - out_file->Append(rocksdb::ToString(block_id)); - out_file->Append(" @ "); - out_file->Append(blockhandles_iter->value().ToString(true).c_str()); - out_file->Append("\n"); - out_file->Append("--------------------------------------\n"); - - std::unique_ptr datablock_iter; - datablock_iter.reset(NewDataBlockIterator( - rep_, ReadOptions(), blockhandles_iter->value())); - s = datablock_iter->status(); - - if (!s.ok()) { - out_file->Append("Error reading the block - Skipped \n\n"); - continue; - } - - for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); - datablock_iter->Next()) { - s = datablock_iter->status(); - if (!s.ok()) { - out_file->Append("Error reading the block - Skipped \n"); - break; - } - DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file); - } - out_file->Append("\n"); - } - - uint64_t num_datablocks = block_id - 1; - if (num_datablocks) { - double datablock_size_avg = - static_cast(datablock_size_sum) / num_datablocks; - 
out_file->Append("Data Block Summary:\n"); - out_file->Append("--------------------------------------"); - out_file->Append("\n # data blocks: "); - out_file->Append(rocksdb::ToString(num_datablocks)); - out_file->Append("\n min data block size: "); - out_file->Append(rocksdb::ToString(datablock_size_min)); - out_file->Append("\n max data block size: "); - out_file->Append(rocksdb::ToString(datablock_size_max)); - out_file->Append("\n avg data block size: "); - out_file->Append(rocksdb::ToString(datablock_size_avg)); - out_file->Append("\n"); - } - - return Status::OK(); -} - -void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, - WritableFile* out_file) { - InternalKey ikey; - ikey.DecodeFrom(key); - - out_file->Append(" HEX "); - out_file->Append(ikey.user_key().ToString(true).c_str()); - out_file->Append(": "); - out_file->Append(value.ToString(true).c_str()); - out_file->Append("\n"); - - std::string str_key = ikey.user_key().ToString(); - std::string str_value = value.ToString(); - std::string res_key(""), res_value(""); - char cspace = ' '; - for (size_t i = 0; i < str_key.size(); i++) { - if (str_key[i] == '\0') { - res_key.append("\\0", 2); - } else { - res_key.append(&str_key[i], 1); - } - res_key.append(1, cspace); - } - for (size_t i = 0; i < str_value.size(); i++) { - if (str_value[i] == '\0') { - res_value.append("\\0", 2); - } else { - res_value.append(&str_value[i], 1); - } - res_value.append(1, cspace); - } - - out_file->Append(" ASCII "); - out_file->Append(res_key.c_str()); - out_file->Append(": "); - out_file->Append(res_value.c_str()); - out_file->Append("\n ------\n"); -} - -namespace { - -void DeleteCachedFilterEntry(const Slice& /*key*/, void* value) { - FilterBlockReader* filter = reinterpret_cast(value); - if (filter->statistics() != nullptr) { - RecordTick(filter->statistics(), BLOCK_CACHE_FILTER_BYTES_EVICT, - filter->ApproximateMemoryUsage()); - } - delete filter; -} - -void DeleteCachedIndexEntry(const Slice& /*key*/, void* value) { - IndexReader* index_reader = reinterpret_cast(value); - if (index_reader->statistics() != nullptr) { - RecordTick(index_reader->statistics(), BLOCK_CACHE_INDEX_BYTES_EVICT, - index_reader->ApproximateMemoryUsage()); - } - delete index_reader; -} - -void DeleteCachedUncompressionDictEntry(const Slice& /*key*/, void* value) { - UncompressionDict* dict = reinterpret_cast(value); - RecordTick(dict->statistics(), BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, - dict->ApproximateMemoryUsage()); - delete dict; -} - -} // anonymous namespace - -} // namespace rocksdb diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 1f209210c13..3e9f6ff3f04 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -9,22 +9,20 @@ #include "table/block_fetcher.h" -#include +#include #include +#include "logging/logging.h" +#include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" -#include "monitoring/statistics.h" #include "rocksdb/env.h" -#include "table/block.h" -#include "table/block_based_table_reader.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" #include "table/format.h" #include "table/persistent_cache_helper.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" -#include "util/file_reader_writer.h" -#include "util/logging.h" -#include "util/memory_allocator.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/xxhash.h" @@ -93,7 +91,8 @@ inline bool 
BlockFetcher::TryGetFromPrefetchBuffer() { if (prefetch_buffer_ != nullptr && prefetch_buffer_->TryReadFromCache( handle_.offset(), - static_cast(handle_.size()) + kBlockTrailerSize, &slice_)) { + static_cast(handle_.size()) + kBlockTrailerSize, &slice_, + for_compaction_)) { block_size_ = static_cast(handle_.size()); CheckBlockChecksum(); if (!status_.ok()) { @@ -217,9 +216,29 @@ Status BlockFetcher::ReadBlockContents() { PERF_TIMER_GUARD(block_read_time); // Actual file read status_ = file_->Read(handle_.offset(), block_size_ + kBlockTrailerSize, - &slice_, used_buf_); + &slice_, used_buf_, for_compaction_); } PERF_COUNTER_ADD(block_read_count, 1); + + // TODO: introduce dedicated perf counter for range tombstones + switch (block_type_) { + case BlockType::kFilter: + PERF_COUNTER_ADD(filter_block_read_count, 1); + break; + + case BlockType::kCompressionDictionary: + PERF_COUNTER_ADD(compression_dict_block_read_count, 1); + break; + + case BlockType::kIndex: + PERF_COUNTER_ADD(index_block_read_count, 1); + break; + + // Nothing to do here as we don't have counters for the other types. + default: + break; + } + PERF_COUNTER_ADD(block_read_byte, block_size_ + kBlockTrailerSize); if (!status_.ok()) { return status_; diff --git a/table/block_fetcher.h b/table/block_fetcher.h index b5fee941597..f67c974becb 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -8,28 +8,44 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include "table/block.h" +#include "memory/memory_allocator.h" +#include "table/block_based/block.h" +#include "table/block_based/block_type.h" #include "table/format.h" -#include "util/memory_allocator.h" namespace rocksdb { + +// Retrieves a single block of a given file. Utilizes the prefetch buffer and/or +// persistent cache provided (if any) to try to avoid reading from the file +// directly. Note that both the prefetch buffer and the persistent cache are +// optional; also, note that the persistent cache may be configured to store either +// compressed or uncompressed blocks. +// +// If the retrieved block is compressed and the do_uncompress flag is set, +// BlockFetcher uncompresses the block (using the uncompression dictionary, +// if provided, to prime the compression algorithm), and returns the resulting +// uncompressed block data. Otherwise, it returns the original block. +// +// Two read options affect the behavior of BlockFetcher: if verify_checksums is +// true, the checksum of the (original) block is checked; if fill_cache is true, +// the block is added to the persistent cache if needed. +// +// Memory for uncompressed and compressed blocks is allocated as needed +// using memory_allocator and memory_allocator_compressed, respectively +// (if provided; otherwise, the default allocator is used). + class BlockFetcher { public: - // Read the block identified by "handle" from "file". - // The only relevant option is options.verify_checksums for now. - // On failure return non-OK. - // On success fill *result and return OK - caller owns *result - // @param uncompression_dict Data for presetting the compression library's - // dictionary. 
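The new per-block-type read counters surface through the standard perf context. A short sketch of reading them around a point lookup (public API; the three *_block_read_count fields are the ones this change adds to PerfContext):

#include <iostream>
#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/perf_level.h"

void ReportBlockReads(rocksdb::DB* db, const rocksdb::Slice& key) {
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableCount);
  rocksdb::get_perf_context()->Reset();

  std::string value;
  rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key, &value);
  (void)s;  // status handling elided in this sketch

  const rocksdb::PerfContext* ctx = rocksdb::get_perf_context();
  std::cout << "index blocks read:      " << ctx->index_block_read_count << "\n"
            << "filter blocks read:     " << ctx->filter_block_read_count << "\n"
            << "compression dict reads: "
            << ctx->compression_dict_block_read_count << "\n"
            << "block bytes read:       " << ctx->block_read_byte << "\n";
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kDisable);
}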
BlockFetcher(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& read_options, const BlockHandle& handle, BlockContents* contents, const ImmutableCFOptions& ioptions, - bool do_uncompress, bool maybe_compressed, + bool do_uncompress, bool maybe_compressed, BlockType block_type, const UncompressionDict& uncompression_dict, const PersistentCacheOptions& cache_options, MemoryAllocator* memory_allocator = nullptr, - MemoryAllocator* memory_allocator_compressed = nullptr) + MemoryAllocator* memory_allocator_compressed = nullptr, + bool for_compaction = false) : file_(file), prefetch_buffer_(prefetch_buffer), footer_(footer), @@ -39,10 +55,13 @@ class BlockFetcher { ioptions_(ioptions), do_uncompress_(do_uncompress), maybe_compressed_(maybe_compressed), + block_type_(block_type), uncompression_dict_(uncompression_dict), cache_options_(cache_options), memory_allocator_(memory_allocator), - memory_allocator_compressed_(memory_allocator_compressed) {} + memory_allocator_compressed_(memory_allocator_compressed), + for_compaction_(for_compaction) {} + Status ReadBlockContents(); CompressionType get_compression_type() const { return compression_type_; } @@ -58,6 +77,7 @@ class BlockFetcher { const ImmutableCFOptions& ioptions_; bool do_uncompress_; bool maybe_compressed_; + BlockType block_type_; const UncompressionDict& uncompression_dict_; const PersistentCacheOptions& cache_options_; MemoryAllocator* memory_allocator_; @@ -71,6 +91,7 @@ class BlockFetcher { char stack_buf_[kDefaultStackBufferSize]; bool got_from_prefetch_buffer_ = false; rocksdb::CompressionType compression_type_; + bool for_compaction_ = false; // return true if found bool TryGetUncompressBlockFromPersistentCache(); diff --git a/table/bloom_block.cc b/table/bloom_block.cc deleted file mode 100644 index 61959030a22..00000000000 --- a/table/bloom_block.cc +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#include "table/bloom_block.h" - -#include -#include "rocksdb/slice.h" -#include "util/dynamic_bloom.h" - -namespace rocksdb { - -void BloomBlockBuilder::AddKeysHashes(const std::vector& keys_hashes) { - for (auto hash : keys_hashes) { - bloom_.AddHash(hash); - } -} - -Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); } - -const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock"; -} // namespace rocksdb diff --git a/table/bloom_block.h b/table/bloom_block.h deleted file mode 100644 index 483fa25d93d..00000000000 --- a/table/bloom_block.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). 
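Taken together with the class comment above, a caller inside the RocksDB tree drives BlockFetcher roughly as follows. The empty-dictionary accessor and the surrounding objects are assumptions in this sketch; everything else mirrors the constructor parameters listed above:

#include "table/block_fetcher.h"
#include "util/compression.h"

// Sketch: read and (if needed) uncompress a single data block identified by
// `handle`, going through the prefetch buffer / persistent cache when
// available. Error handling is reduced to returning the status.
rocksdb::Status ReadOneBlock(rocksdb::RandomAccessFileReader* file,
                             rocksdb::FilePrefetchBuffer* prefetch_buffer,
                             const rocksdb::Footer& footer,
                             const rocksdb::BlockHandle& handle,
                             const rocksdb::ImmutableCFOptions& ioptions,
                             const rocksdb::PersistentCacheOptions& cache_options,
                             rocksdb::BlockContents* contents) {
  rocksdb::ReadOptions read_options;
  read_options.verify_checksums = true;  // check the block trailer checksum

  rocksdb::BlockFetcher fetcher(
      file, prefetch_buffer, footer, read_options, handle, contents, ioptions,
      /*do_uncompress=*/true, /*maybe_compressed=*/true,
      rocksdb::BlockType::kData,
      rocksdb::UncompressionDict::GetEmptyDict() /* assumed accessor */,
      cache_options);
  return fetcher.ReadBlockContents();
}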
-#pragma once - -#include -#include -#include "util/dynamic_bloom.h" - -namespace rocksdb { -class Logger; - -class BloomBlockBuilder { - public: - static const std::string kBloomBlock; - - explicit BloomBlockBuilder(uint32_t num_probes = 6) : bloom_(num_probes) {} - - void SetTotalBits(Allocator* allocator, uint32_t total_bits, - uint32_t locality, size_t huge_page_tlb_size, - Logger* logger) { - bloom_.SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, - logger); - } - - uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); } - - void AddKeysHashes(const std::vector& keys_hashes); - - Slice Finish(); - - private: - DynamicBloom bloom_; -}; - -}; // namespace rocksdb diff --git a/table/cleanable_test.cc b/table/cleanable_test.cc index f18c33b8399..8478adf523d 100644 --- a/table/cleanable_test.cc +++ b/table/cleanable_test.cc @@ -9,8 +9,8 @@ #include "port/stack_trace.h" #include "rocksdb/iostats_context.h" #include "rocksdb/perf_context.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo/cuckoo_table_builder.cc similarity index 99% rename from table/cuckoo_table_builder.cc rename to table/cuckoo/cuckoo_table_builder.cc index f590e6ad405..8857cf7ea9f 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo/cuckoo_table_builder.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#include "table/cuckoo_table_builder.h" +#include "table/cuckoo/cuckoo_table_builder.h" #include #include @@ -13,14 +13,14 @@ #include #include "db/dbformat.h" +#include "file/writable_file_writer.h" #include "rocksdb/env.h" #include "rocksdb/table.h" -#include "table/block_builder.h" -#include "table/cuckoo_table_factory.h" +#include "table/block_based/block_builder.h" +#include "table/cuckoo/cuckoo_table_factory.h" #include "table/format.h" #include "table/meta_blocks.h" #include "util/autovector.h" -#include "util/file_reader_writer.h" #include "util/random.h" #include "util/string_util.h" diff --git a/table/cuckoo_table_builder.h b/table/cuckoo/cuckoo_table_builder.h similarity index 99% rename from table/cuckoo_table_builder.h rename to table/cuckoo/cuckoo_table_builder.h index 3829541b39a..c42744de019 100644 --- a/table/cuckoo_table_builder.h +++ b/table/cuckoo/cuckoo_table_builder.h @@ -30,6 +30,9 @@ class CuckooTableBuilder: public TableBuilder { uint64_t), uint32_t column_family_id, const std::string& column_family_name); + // No copying allowed + CuckooTableBuilder(const CuckooTableBuilder&) = delete; + void operator=(const CuckooTableBuilder&) = delete; // REQUIRES: Either Finish() or Abandon() has been called. ~CuckooTableBuilder() {} @@ -116,10 +119,6 @@ class CuckooTableBuilder: public TableBuilder { std::string smallest_user_key_ = ""; bool closed_; // Either Finish() or Abandon() has been called. 
- - // No copying allowed - CuckooTableBuilder(const CuckooTableBuilder&) = delete; - void operator=(const CuckooTableBuilder&) = delete; }; } // namespace rocksdb diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo/cuckoo_table_builder_test.cc similarity index 99% rename from table/cuckoo_table_builder_test.cc rename to table/cuckoo/cuckoo_table_builder_test.cc index c1e350327f3..b84cc9f5bcf 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo/cuckoo_table_builder_test.cc @@ -10,11 +10,12 @@ #include #include +#include "file/random_access_file_reader.h" +#include "file/writable_file_writer.h" +#include "table/cuckoo/cuckoo_table_builder.h" #include "table/meta_blocks.h" -#include "table/cuckoo_table_builder.h" -#include "util/file_reader_writer.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { extern const uint64_t kCuckooTableMagicNumber; diff --git a/table/cuckoo_table_factory.cc b/table/cuckoo/cuckoo_table_factory.cc similarity index 94% rename from table/cuckoo_table_factory.cc rename to table/cuckoo/cuckoo_table_factory.cc index 74d18d51213..4ca29f364cf 100644 --- a/table/cuckoo_table_factory.cc +++ b/table/cuckoo/cuckoo_table_factory.cc @@ -4,11 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#include "table/cuckoo_table_factory.h" +#include "table/cuckoo/cuckoo_table_factory.h" #include "db/dbformat.h" -#include "table/cuckoo_table_builder.h" -#include "table/cuckoo_table_reader.h" +#include "table/cuckoo/cuckoo_table_builder.h" +#include "table/cuckoo/cuckoo_table_reader.h" namespace rocksdb { diff --git a/table/cuckoo_table_factory.h b/table/cuckoo/cuckoo_table_factory.h similarity index 100% rename from table/cuckoo_table_factory.h rename to table/cuckoo/cuckoo_table_factory.h diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc similarity index 98% rename from table/cuckoo_table_reader.cc rename to table/cuckoo/cuckoo_table_reader.cc index f4df2467fdb..982b14763f4 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -8,20 +8,20 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
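A recurring mechanical change in this patch (CuckooTableBuilder above, CuckooTableIterator below) moves the "no copying allowed" declarations out of the private section and into publicly deleted members. As a standalone illustration of the two idioms:

// Old style: copy operations declared private and never defined, so misuse
// fails at link time (or at compile time only for non-friends).
class OldStyle {
 private:
  OldStyle(const OldStyle&);
  void operator=(const OldStyle&);
};

// Style used after this change: publicly deleted copy operations, rejected at
// compile time with a clear diagnostic for every caller.
class NewStyle {
 public:
  NewStyle() = default;
  NewStyle(const NewStyle&) = delete;
  void operator=(const NewStyle&) = delete;
};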
#ifndef ROCKSDB_LITE -#include "table/cuckoo_table_reader.h" +#include "table/cuckoo/cuckoo_table_reader.h" #include #include #include #include #include +#include "memory/arena.h" #include "rocksdb/iterator.h" #include "rocksdb/table.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/get_context.h" #include "table/internal_iterator.h" #include "table/meta_blocks.h" -#include "table/cuckoo_table_factory.h" -#include "table/get_context.h" -#include "util/arena.h" #include "util/coding.h" namespace rocksdb { @@ -197,6 +197,9 @@ void CuckooTableReader::Prepare(const Slice& key) { class CuckooTableIterator : public InternalIterator { public: explicit CuckooTableIterator(CuckooTableReader* reader); + // No copying allowed + CuckooTableIterator(const CuckooTableIterator&) = delete; + void operator=(const Iterator&) = delete; ~CuckooTableIterator() override {} bool Valid() const override; void SeekToFirst() override; @@ -248,9 +251,6 @@ class CuckooTableIterator : public InternalIterator { uint32_t curr_key_idx_; Slice curr_value_; IterKey curr_key_; - // No copying allowed - CuckooTableIterator(const CuckooTableIterator&) = delete; - void operator=(const Iterator&) = delete; }; CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader) @@ -377,7 +377,8 @@ Slice CuckooTableIterator::value() const { InternalIterator* CuckooTableReader::NewIterator( const ReadOptions& /*read_options*/, const SliceTransform* /* prefix_extractor */, Arena* arena, - bool /*skip_filters*/, bool /*for_compaction*/) { + bool /*skip_filters*/, TableReaderCaller /*caller*/, + size_t /*compaction_readahead_size*/) { if (!status().ok()) { return NewErrorInternalIterator( Status::Corruption("CuckooTableReader status is not okay."), arena); diff --git a/table/cuckoo_table_reader.h b/table/cuckoo/cuckoo_table_reader.h similarity index 79% rename from table/cuckoo_table_reader.h rename to table/cuckoo/cuckoo_table_reader.h index b37d46373e1..fb0445bcfb3 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo/cuckoo_table_reader.h @@ -15,11 +15,11 @@ #include #include "db/dbformat.h" +#include "file/random_access_file_reader.h" #include "options/cf_options.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "table/table_reader.h" -#include "util/file_reader_writer.h" namespace rocksdb { @@ -45,18 +45,30 @@ class CuckooTableReader: public TableReader { GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; + // Returns a new iterator over table contents + // compaction_readahead_size: its value will only be used if for_compaction = + // true InternalIterator* NewIterator(const ReadOptions&, const SliceTransform* prefix_extractor, - Arena* arena = nullptr, - bool skip_filters = false, - bool for_compaction = false) override; + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0) override; void Prepare(const Slice& target) override; // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const override; // Following methods are not implemented for Cuckoo Table Reader - uint64_t ApproximateOffsetOf(const Slice& /*key*/) override { return 0; } + uint64_t ApproximateOffsetOf(const Slice& /*key*/, + TableReaderCaller /*caller*/) override { + return 0; + } + + uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + TableReaderCaller /*caller*/) override { + return 0; + } + void SetupForCompaction() override {} // End of methods not implemented. 
diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc similarity index 95% rename from table/cuckoo_table_reader_test.cc rename to table/cuckoo/cuckoo_table_reader_test.cc index 74fb52e6c78..2dfe887ba81 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo/cuckoo_table_reader_test.cc @@ -13,26 +13,22 @@ int main() { } #else -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include -#include -#include +#include #include +#include +#include -#include "table/cuckoo_table_builder.h" -#include "table/cuckoo_table_factory.h" -#include "table/cuckoo_table_reader.h" +#include "memory/arena.h" +#include "table/cuckoo/cuckoo_table_builder.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/cuckoo/cuckoo_table_reader.h" #include "table/get_context.h" #include "table/meta_blocks.h" -#include "util/arena.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/gflags_compat.h" #include "util/random.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::SetUsageMessage; @@ -126,7 +122,7 @@ class CuckooReaderTest : public testing::Test { PinnableSlice value; GetContext get_context(ucomp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(user_keys[i]), &value, - nullptr, nullptr, nullptr, nullptr); + nullptr, nullptr, true, nullptr, nullptr); ASSERT_OK( reader.Get(ReadOptions(), Slice(keys[i]), &get_context, nullptr)); ASSERT_STREQ(values[i].c_str(), value.data()); @@ -150,8 +146,9 @@ class CuckooReaderTest : public testing::Test { CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, GetSliceHash); ASSERT_OK(reader.status()); - InternalIterator* it = - reader.NewIterator(ReadOptions(), nullptr, nullptr, false); + InternalIterator* it = reader.NewIterator( + ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized); ASSERT_OK(it->status()); ASSERT_TRUE(!it->Valid()); it->SeekToFirst(); @@ -190,7 +187,9 @@ class CuckooReaderTest : public testing::Test { delete it; Arena arena; - it = reader.NewIterator(ReadOptions(), nullptr, &arena); + it = reader.NewIterator(ReadOptions(), /*prefix_extractor=*/nullptr, &arena, + /*skip_filters=*/false, + TableReaderCaller::kUncategorized); ASSERT_OK(it->status()); ASSERT_TRUE(!it->Valid()); it->Seek(keys[num_items/2]); @@ -337,7 +336,7 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { AppendInternalKey(¬_found_key, ikey); PinnableSlice value; GetContext get_context(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, - Slice(not_found_key), &value, nullptr, nullptr, + Slice(not_found_key), &value, nullptr, nullptr, true, nullptr, nullptr); ASSERT_OK( reader.Get(ReadOptions(), Slice(not_found_key), &get_context, nullptr)); @@ -352,7 +351,7 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { value.Reset(); GetContext get_context2(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(not_found_key2), &value, - nullptr, nullptr, nullptr, nullptr); + nullptr, nullptr, true, nullptr, nullptr); ASSERT_OK( reader.Get(ReadOptions(), Slice(not_found_key2), &get_context2, nullptr)); ASSERT_TRUE(value.empty()); @@ -368,7 +367,7 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { value.Reset(); GetContext get_context3(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(unused_key), &value, - nullptr, nullptr, nullptr, nullptr); + nullptr, nullptr, true, nullptr, 
nullptr); ASSERT_OK( reader.Get(ReadOptions(), Slice(unused_key), &get_context3, nullptr)); ASSERT_TRUE(value.empty()); @@ -444,7 +443,7 @@ void WriteFile(const std::vector& keys, // Assume only the fast path is triggered GetContext get_context(nullptr, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(), &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); for (uint64_t i = 0; i < num; ++i) { value.Reset(); value.clear(); @@ -492,7 +491,7 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { // Assume only the fast path is triggered GetContext get_context(nullptr, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(), &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); uint64_t start_time = env->NowMicros(); if (batch_size > 0) { for (uint64_t i = 0; i < num; i += batch_size) { diff --git a/table/format.cc b/table/format.cc index 476db85f731..5e9805f4027 100644 --- a/table/format.cc +++ b/table/format.cc @@ -9,25 +9,24 @@ #include "table/format.h" -#include +#include #include +#include "block_fetcher.h" +#include "file/random_access_file_reader.h" +#include "logging/logging.h" +#include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/env.h" -#include "table/block.h" -#include "table/block_based_table_reader.h" -#include "table/block_fetcher.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" #include "table/persistent_cache_helper.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" -#include "util/file_reader_writer.h" -#include "util/logging.h" -#include "util/memory_allocator.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/xxhash.h" namespace rocksdb { @@ -91,6 +90,58 @@ std::string BlockHandle::ToString(bool hex) const { const BlockHandle BlockHandle::kNullBlockHandle(0, 0); +void IndexValue::EncodeTo(std::string* dst, bool have_first_key, + const BlockHandle* previous_handle) const { + if (previous_handle) { + assert(handle.offset() == previous_handle->offset() + + previous_handle->size() + kBlockTrailerSize); + PutVarsignedint64(dst, handle.size() - previous_handle->size()); + } else { + handle.EncodeTo(dst); + } + assert(dst->size() != 0); + + if (have_first_key) { + PutLengthPrefixedSlice(dst, first_internal_key); + } +} + +Status IndexValue::DecodeFrom(Slice* input, bool have_first_key, + const BlockHandle* previous_handle) { + if (previous_handle) { + int64_t delta; + if (!GetVarsignedint64(input, &delta)) { + return Status::Corruption("bad delta-encoded index value"); + } + handle = BlockHandle( + previous_handle->offset() + previous_handle->size() + kBlockTrailerSize, + previous_handle->size() + delta); + } else { + Status s = handle.DecodeFrom(input); + if (!s.ok()) { + return s; + } + } + + if (!have_first_key) { + first_internal_key = Slice(); + } else if (!GetLengthPrefixedSlice(input, &first_internal_key)) { + return Status::Corruption("bad first key in block info"); + } + + return Status::OK(); +} + +std::string IndexValue::ToString(bool hex, bool have_first_key) const { + std::string s; + EncodeTo(&s, have_first_key, nullptr); + if (hex) { + return Slice(s).ToString(true); + } else { + return s; + } +} + namespace { inline bool IsLegacyFooterFormat(uint64_t magic_number) { return magic_number == kLegacyBlockBasedTableMagicNumber || diff --git a/table/format.h b/table/format.h index f5858850559..2ed80e2fc35 100644 --- 
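The delta encoding above exploits the fact that index entries are written for consecutive blocks, so the next handle's offset is fully determined by the previous handle. A worked example: if block n-1 has offset 0 and size 4096, then with kBlockTrailerSize = 5 block n must start at 0 + 4096 + 5 = 4101. If block n's size is 4000, EncodeTo() emits only the signed varint delta 4000 - 4096 = -96, and DecodeFrom() reconstructs the handle as BlockHandle(0 + 4096 + 5, 4096 + (-96)) = BlockHandle(4101, 4000).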
a/table/format.h +++ b/table/format.h @@ -10,23 +10,19 @@ #pragma once #include #include -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef OS_FREEBSD -#include -#else -#include -#endif -#endif +#include "file/file_prefetch_buffer.h" +#include "file/random_access_file_reader.h" + #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "memory/memory_allocator.h" #include "options/cf_options.h" +#include "port/malloc.h" #include "port/port.h" // noexcept #include "table/persistent_cache_options.h" -#include "util/file_reader_writer.h" -#include "util/memory_allocator.h" namespace rocksdb { @@ -76,6 +72,35 @@ class BlockHandle { static const BlockHandle kNullBlockHandle; }; +// Value in block-based table file index. +// +// The index entry for block n is: y -> h, [x], +// where: y is some key between the last key of block n (inclusive) and the +// first key of block n+1 (exclusive); h is BlockHandle pointing to block n; +// x, if present, is the first key of block n (unshortened). +// This struct represents the "h, [x]" part. +struct IndexValue { + BlockHandle handle; + // Empty means unknown. + Slice first_internal_key; + + IndexValue() = default; + IndexValue(BlockHandle _handle, Slice _first_internal_key) + : handle(_handle), first_internal_key(_first_internal_key) {} + + // have_first_key indicates whether the `first_internal_key` is used. + // If previous_handle is not null, delta encoding is used; + // in this case, the two handles must point to consecutive blocks: + // handle.offset() == + // previous_handle->offset() + previous_handle->size() + kBlockTrailerSize + void EncodeTo(std::string* dst, bool have_first_key, + const BlockHandle* previous_handle) const; + Status DecodeFrom(Slice* input, bool have_first_key, + const BlockHandle* previous_handle); + + std::string ToString(bool hex, bool have_first_key) const; +}; + inline uint32_t GetCompressFormatForVersion(CompressionType compression_type, uint32_t version) { #ifdef NDEBUG @@ -92,7 +117,7 @@ inline uint32_t GetCompressFormatForVersion(CompressionType compression_type, } inline bool BlockBasedTableSupportedVersion(uint32_t version) { - return version <= 4; + return version <= 5; } // Footer encapsulates the fixed information stored at the tail @@ -189,11 +214,20 @@ Status ReadFooterFromFile(RandomAccessFileReader* file, // 1-byte type + 32-bit crc static const size_t kBlockTrailerSize = 5; +// Make block size calculation for IO less error prone +inline uint64_t block_size(const BlockHandle& handle) { + return handle.size() + kBlockTrailerSize; +} + inline CompressionType get_block_compression_type(const char* block_data, size_t block_size) { return static_cast(block_data[block_size]); } +// Represents the contents of a block read from an SST file. Depending on how +// it's created, it may or may not own the actual block bytes. As an example, +// BlockContents objects representing data read from mmapped files only point +// into the mmapped region. struct BlockContents { Slice data; // Actual contents of data CacheAllocationPtr allocation; @@ -206,16 +240,20 @@ struct BlockContents { BlockContents() {} + // Does not take ownership of the underlying data bytes. BlockContents(const Slice& _data) : data(_data) {} + // Takes ownership of the underlying data bytes. BlockContents(CacheAllocationPtr&& _data, size_t _size) : data(_data.get(), _size), allocation(std::move(_data)) {} + // Takes ownership of the underlying data bytes. 
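A round-trip sketch of the IndexValue API declared above, using delta encoding against the previous block's handle and the block_size() helper (values are illustrative):

#include <cassert>
#include <string>
#include "table/format.h"

void IndexValueRoundTrip() {
  using namespace rocksdb;

  BlockHandle prev(/*offset=*/0, /*size=*/4096);
  // The next block starts right after prev plus its 5-byte trailer.
  IndexValue v(BlockHandle(block_size(prev), /*size=*/4000),
               /*first_internal_key=*/Slice());

  std::string encoded;
  v.EncodeTo(&encoded, /*have_first_key=*/false, &prev);  // stores a size delta

  Slice input(encoded);
  IndexValue decoded;
  Status s = decoded.DecodeFrom(&input, /*have_first_key=*/false, &prev);
  assert(s.ok());
  assert(decoded.handle.offset() == 4101 && decoded.handle.size() == 4000);
}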
BlockContents(std::unique_ptr&& _data, size_t _size) : data(_data.get(), _size) { allocation.reset(_data.release()); } + // Returns whether the object has ownership of the underlying data bytes. bool own_bytes() const { return allocation.get() != nullptr; } // It's the caller's responsibility to make sure that this is diff --git a/table/full_filter_bits_builder.h b/table/full_filter_bits_builder.h deleted file mode 100644 index 851ed1e2ab4..00000000000 --- a/table/full_filter_bits_builder.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// Copyright (c) 2012 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#pragma once - -#include -#include -#include - -#include "rocksdb/filter_policy.h" - -namespace rocksdb { - -class Slice; - -class FullFilterBitsBuilder : public FilterBitsBuilder { - public: - explicit FullFilterBitsBuilder(const size_t bits_per_key, - const size_t num_probes); - - ~FullFilterBitsBuilder(); - - virtual void AddKey(const Slice& key) override; - - // Create a filter that for hashes [0, n-1], the filter is allocated here - // When creating filter, it is ensured that - // total_bits = num_lines * CACHE_LINE_SIZE * 8 - // dst len is >= 5, 1 for num_probes, 4 for num_lines - // Then total_bits = (len - 5) * 8, and cache_line_size could be calculated - // +----------------------------------------------------------------+ - // | filter data with length total_bits/8 | - // +----------------------------------------------------------------+ - // | | - // | ... | - // | | - // +----------------------------------------------------------------+ - // | ... | num_probes : 1 byte | num_lines : 4 bytes | - // +----------------------------------------------------------------+ - virtual Slice Finish(std::unique_ptr* buf) override; - - // Calculate num of entries fit into a space. - virtual int CalculateNumEntry(const uint32_t space) override; - - // Calculate space for new filter. This is reverse of CalculateNumEntry. - uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits, - uint32_t* num_lines); - - private: - friend class FullFilterBlockTest_DuplicateEntries_Test; - size_t bits_per_key_; - size_t num_probes_; - std::vector hash_entries_; - - // Get totalbits that optimized for cpu cache line - uint32_t GetTotalBitsForLocality(uint32_t total_bits); - - // Reserve space for new filter - char* ReserveSpace(const int num_entry, uint32_t* total_bits, - uint32_t* num_lines); - - // Assuming single threaded access to this function. - void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits); - - // No Copy allowed - FullFilterBitsBuilder(const FullFilterBitsBuilder&); - void operator=(const FullFilterBitsBuilder&); -}; - -} // namespace rocksdb diff --git a/table/full_filter_block.h b/table/full_filter_block.h deleted file mode 100644 index f97952a7ced..00000000000 --- a/table/full_filter_block.h +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
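The deleted FullFilterBitsBuilder header documents the on-disk layout of a full Bloom filter: the raw bit array followed by a 1-byte num_probes and a 4-byte num_lines trailer, with total_bits = num_lines * CACHE_LINE_SIZE * 8 = (len - 5) * 8. A hedged sketch of decoding that trailer (the helper name is illustrative; DecodeFixed32 is the existing coding utility):

#include <cstdint>
#include "rocksdb/slice.h"
#include "util/coding.h"

// Parse the metadata trailer of a full filter laid out as documented above:
//   [ filter data | num_probes : 1 byte | num_lines : 4 bytes ]
// Returns false if the slice is too small to contain the trailer.
bool ParseFullFilterTrailer(const rocksdb::Slice& filter, uint32_t* num_probes,
                            uint32_t* num_lines, uint32_t* cache_line_size) {
  if (filter.size() < 5) {
    return false;
  }
  const char* end = filter.data() + filter.size();
  *num_probes = static_cast<uint8_t>(end[-5]);
  *num_lines = rocksdb::DecodeFixed32(end - 4);
  // total_bits = (len - 5) * 8 and total_bits = num_lines * CACHE_LINE_SIZE * 8,
  // so the cache line size the filter was built for can be recovered:
  *cache_line_size = (*num_lines == 0)
                         ? 0
                         : static_cast<uint32_t>((filter.size() - 5) / *num_lines);
  return true;
}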
-// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#pragma once - -#include -#include -#include -#include -#include -#include "rocksdb/options.h" -#include "rocksdb/slice.h" -#include "rocksdb/slice_transform.h" -#include "db/dbformat.h" -#include "util/hash.h" -#include "table/filter_block.h" - -namespace rocksdb { - -class FilterPolicy; -class FilterBitsBuilder; -class FilterBitsReader; - -// A FullFilterBlockBuilder is used to construct a full filter for a -// particular Table. It generates a single string which is stored as -// a special block in the Table. -// The format of full filter block is: -// +----------------------------------------------------------------+ -// | full filter for all keys in sst file | -// +----------------------------------------------------------------+ -// The full filter can be very large. At the end of it, we put -// num_probes: how many hash functions are used in bloom filter -// -class FullFilterBlockBuilder : public FilterBlockBuilder { - public: - explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor, - bool whole_key_filtering, - FilterBitsBuilder* filter_bits_builder); - // bits_builder is created in filter_policy, it should be passed in here - // directly. and be deleted here - ~FullFilterBlockBuilder() {} - - virtual bool IsBlockBased() override { return false; } - virtual void StartBlock(uint64_t /*block_offset*/) override {} - virtual void Add(const Slice& key) override; - virtual size_t NumAdded() const override { return num_added_; } - virtual Slice Finish(const BlockHandle& tmp, Status* status) override; - using FilterBlockBuilder::Finish; - - protected: - virtual void AddKey(const Slice& key); - std::unique_ptr filter_bits_builder_; - virtual void Reset(); - - private: - // important: all of these might point to invalid addresses - // at the time of destruction of this filter block. destructor - // should NOT dereference them. - const SliceTransform* prefix_extractor_; - bool whole_key_filtering_; - bool last_whole_key_recorded_; - std::string last_whole_key_str_; - bool last_prefix_recorded_; - std::string last_prefix_str_; - - uint32_t num_added_; - std::unique_ptr filter_data_; - - void AddPrefix(const Slice& key); - - // No copying allowed - FullFilterBlockBuilder(const FullFilterBlockBuilder&); - void operator=(const FullFilterBlockBuilder&); -}; - -// A FilterBlockReader is used to parse filter from SST table. -// KeyMayMatch and PrefixMayMatch would trigger filter checking -class FullFilterBlockReader : public FilterBlockReader { - public: - // REQUIRES: "contents" and filter_bits_reader must stay live - // while *this is live. - explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, - bool whole_key_filtering, - const Slice& contents, - FilterBitsReader* filter_bits_reader, - Statistics* statistics); - explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, - bool whole_key_filtering, - BlockContents&& contents, - FilterBitsReader* filter_bits_reader, - Statistics* statistics); - - // bits_reader is created in filter_policy, it should be passed in here - // directly. 
and be deleted here - ~FullFilterBlockReader() {} - - virtual bool IsBlockBased() override { return false; } - - virtual bool KeyMayMatch( - const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - - virtual bool PrefixMayMatch( - const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - - virtual void KeysMayMatch(MultiGetRange* range, - const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false) override; - - virtual void PrefixesMayMatch(MultiGetRange* range, - const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false) override; - virtual size_t ApproximateMemoryUsage() const override; - virtual bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, - const SliceTransform* prefix_extractor, - const Comparator* comparator, - const Slice* const const_ikey_ptr, bool* filter_checked, - bool need_upper_bound_check) override; - private: - const SliceTransform* prefix_extractor_; - Slice contents_; - std::unique_ptr filter_bits_reader_; - BlockContents block_contents_; - bool full_length_enabled_; - size_t prefix_extractor_full_length_; - - // No copying allowed - FullFilterBlockReader(const FullFilterBlockReader&); - bool MayMatch(const Slice& entry); - void MayMatch(MultiGetRange* range); - void operator=(const FullFilterBlockReader&); - bool IsFilterCompatible(const Slice* iterate_upper_bound, - const Slice& prefix, const Comparator* comparator); - -}; - -} // namespace rocksdb diff --git a/table/full_filter_block_test.cc b/table/full_filter_block_test.cc deleted file mode 100644 index 3abae979a4c..00000000000 --- a/table/full_filter_block_test.cc +++ /dev/null @@ -1,224 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). 
- -#include "table/full_filter_block.h" - -#include "rocksdb/filter_policy.h" -#include "table/full_filter_bits_builder.h" -#include "util/coding.h" -#include "util/hash.h" -#include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace rocksdb { - -class TestFilterBitsBuilder : public FilterBitsBuilder { - public: - explicit TestFilterBitsBuilder() {} - - // Add Key to filter - void AddKey(const Slice& key) override { - hash_entries_.push_back(Hash(key.data(), key.size(), 1)); - } - - // Generate the filter using the keys that are added - Slice Finish(std::unique_ptr* buf) override { - uint32_t len = static_cast(hash_entries_.size()) * 4; - char* data = new char[len]; - for (size_t i = 0; i < hash_entries_.size(); i++) { - EncodeFixed32(data + i * 4, hash_entries_[i]); - } - const char* const_data = data; - buf->reset(const_data); - return Slice(data, len); - } - - private: - std::vector hash_entries_; -}; - -class TestFilterBitsReader : public FilterBitsReader { - public: - explicit TestFilterBitsReader(const Slice& contents) - : data_(contents.data()), len_(static_cast(contents.size())) {} - - // Silence compiler warning about overloaded virtual - using FilterBitsReader::MayMatch; - bool MayMatch(const Slice& entry) override { - uint32_t h = Hash(entry.data(), entry.size(), 1); - for (size_t i = 0; i + 4 <= len_; i += 4) { - if (h == DecodeFixed32(data_ + i)) { - return true; - } - } - return false; - } - - private: - const char* data_; - uint32_t len_; -}; - - -class TestHashFilter : public FilterPolicy { - public: - const char* Name() const override { return "TestHashFilter"; } - - void CreateFilter(const Slice* keys, int n, std::string* dst) const override { - for (int i = 0; i < n; i++) { - uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); - PutFixed32(dst, h); - } - } - - bool KeyMayMatch(const Slice& key, const Slice& filter) const override { - uint32_t h = Hash(key.data(), key.size(), 1); - for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { - if (h == DecodeFixed32(filter.data() + i)) { - return true; - } - } - return false; - } - - FilterBitsBuilder* GetFilterBitsBuilder() const override { - return new TestFilterBitsBuilder(); - } - - FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override { - return new TestFilterBitsReader(contents); - } -}; - -class PluginFullFilterBlockTest : public testing::Test { - public: - BlockBasedTableOptions table_options_; - - PluginFullFilterBlockTest() { - table_options_.filter_policy.reset(new TestHashFilter()); - } -}; - -TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) { - FullFilterBlockBuilder builder( - nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); - Slice block = builder.Finish(); - ASSERT_EQ("", EscapeString(block)); - - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - // Remain same symantic with blockbased filter - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); -} - -TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { - FullFilterBlockBuilder builder( - nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); - builder.Add("foo"); - builder.Add("bar"); - builder.Add("box"); - builder.Add("box"); - builder.Add("hello"); - Slice block = builder.Finish(); - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); - 
ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); - ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr)); - ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr)); -} - -class FullFilterBlockTest : public testing::Test { - public: - BlockBasedTableOptions table_options_; - - FullFilterBlockTest() { - table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); - } - - ~FullFilterBlockTest() override {} -}; - -TEST_F(FullFilterBlockTest, EmptyBuilder) { - FullFilterBlockBuilder builder( - nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); - Slice block = builder.Finish(); - ASSERT_EQ("", EscapeString(block)); - - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - // Remain same symantic with blockbased filter - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); -} - -TEST_F(FullFilterBlockTest, DuplicateEntries) { - { // empty prefixes - std::unique_ptr prefix_extractor( - NewFixedPrefixTransform(0)); - auto bits_builder = dynamic_cast( - table_options_.filter_policy->GetFilterBitsBuilder()); - const bool WHOLE_KEY = true; - FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, - bits_builder); - ASSERT_EQ(0, builder.NumAdded()); - builder.Add("key"); // test with empty prefix - ASSERT_EQ(2, bits_builder->hash_entries_.size()); - } - - // mix of empty and non-empty - std::unique_ptr prefix_extractor( - NewFixedPrefixTransform(7)); - auto bits_builder = dynamic_cast( - table_options_.filter_policy->GetFilterBitsBuilder()); - const bool WHOLE_KEY = true; - FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, - bits_builder); - ASSERT_EQ(0, builder.NumAdded()); - builder.Add(""); // test with empty key too - builder.Add("prefix1key1"); - builder.Add("prefix1key1"); - builder.Add("prefix1key2"); - builder.Add("prefix1key3"); - builder.Add("prefix2key4"); - // two prefix adn 4 keys - ASSERT_EQ(1 + 2 + 4, bits_builder->hash_entries_.size()); -} - -TEST_F(FullFilterBlockTest, SingleChunk) { - FullFilterBlockBuilder builder( - nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); - ASSERT_EQ(0, builder.NumAdded()); - builder.Add("foo"); - builder.Add("bar"); - builder.Add("box"); - builder.Add("box"); - builder.Add("hello"); - ASSERT_EQ(5, builder.NumAdded()); - Slice block = builder.Finish(); - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); - ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr)); - ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr)); -} - -} // namespace rocksdb - -int main(int argc, char** argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/table/get_context.cc b/table/get_context.cc index 24c9ba7d5b7..cdb5798f782 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -38,15 +38,13 @@ void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) { } // namespace -GetContext::GetContext(const Comparator* ucmp, - const MergeOperator* merge_operator, Logger* logger, - Statistics* statistics, GetState 
init_state, - const Slice& user_key, PinnableSlice* pinnable_val, - bool* value_found, MergeContext* merge_context, - SequenceNumber* _max_covering_tombstone_seq, Env* env, - SequenceNumber* seq, - PinnedIteratorsManager* _pinned_iters_mgr, - ReadCallback* callback, bool* is_blob_index) +GetContext::GetContext( + const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, + Statistics* statistics, GetState init_state, const Slice& user_key, + PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context, + bool do_merge, SequenceNumber* _max_covering_tombstone_seq, Env* env, + SequenceNumber* seq, PinnedIteratorsManager* _pinned_iters_mgr, + ReadCallback* callback, bool* is_blob_index, uint64_t tracing_get_id) : ucmp_(ucmp), merge_operator_(merge_operator), logger_(logger), @@ -62,7 +60,9 @@ GetContext::GetContext(const Comparator* ucmp, replay_log_(nullptr), pinned_iters_mgr_(_pinned_iters_mgr), callback_(callback), - is_blob_index_(is_blob_index) { + do_merge_(do_merge), + is_blob_index_(is_blob_index), + tracing_get_id_(tracing_get_id) { if (seq_) { *seq_ = kMaxSequenceNumber; } @@ -182,7 +182,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, assert(matched); assert((state_ != kMerge && parsed_key.type != kTypeMerge) || merge_context_ != nullptr); - if (ucmp_->Equal(parsed_key.user_key, user_key_)) { + if (ucmp_->CompareWithoutTimestamp(parsed_key.user_key, user_key_) == 0) { *matched = true; // If the value is not in the snapshot, skip it if (!CheckCallback(parsed_key.sequence)) { @@ -216,29 +216,44 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } if (kNotFound == state_) { state_ = kFound; - if (LIKELY(pinnable_val_ != nullptr)) { - if (LIKELY(value_pinner != nullptr)) { - // If the backing resources for the value are provided, pin them - pinnable_val_->PinSlice(value, value_pinner); - } else { - TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf", this); + if (do_merge_) { + if (LIKELY(pinnable_val_ != nullptr)) { + if (LIKELY(value_pinner != nullptr)) { + // If the backing resources for the value are provided, pin them + pinnable_val_->PinSlice(value, value_pinner); + } else { + TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf", + this); - // Otherwise copy the value - pinnable_val_->PinSelf(value); + // Otherwise copy the value + pinnable_val_->PinSelf(value); + } } + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(value, value_pinner); } } else if (kMerge == state_) { assert(merge_operator_ != nullptr); state_ = kFound; - if (LIKELY(pinnable_val_ != nullptr)) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, &value, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; + if (do_merge_) { + if (LIKELY(pinnable_val_ != nullptr)) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, &value, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), + logger_, statistics_, env_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } } + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(value, value_pinner); } } if (is_blob_index_ != nullptr) { @@ 
-257,14 +272,18 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } else if (kMerge == state_) { state_ = kFound; if (LIKELY(pinnable_val_ != nullptr)) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, nullptr, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; + if (do_merge_) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, nullptr, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), + logger_, statistics_, env_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } } + // If do_merge_ = false then the current value shouldn't be part of + // merge_context_->operand_list } } return false; @@ -273,24 +292,23 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, assert(state_ == kNotFound || state_ == kMerge); state_ = kMerge; // value_pinner is not set from plain_table_reader.cc for example. - if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() && - value_pinner != nullptr) { - value_pinner->DelegateCleanupsTo(pinned_iters_mgr()); - merge_context_->PushOperand(value, true /*value_pinned*/); - } else { - merge_context_->PushOperand(value, false); - } - if (merge_operator_ != nullptr && - merge_operator_->ShouldMerge(merge_context_->GetOperandsDirectionBackward())) { + push_operand(value, value_pinner); + if (do_merge_ && merge_operator_ != nullptr && + merge_operator_->ShouldMerge( + merge_context_->GetOperandsDirectionBackward())) { state_ = kFound; if (LIKELY(pinnable_val_ != nullptr)) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, nullptr, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; + // do_merge_ = true this is the case where this function is called + // as part of DB Get API hence merge operators should be merged. + if (do_merge_) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, nullptr, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), + logger_, statistics_, env_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } } } return false; @@ -307,6 +325,16 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, return false; } +void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) { + if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() && + value_pinner != nullptr) { + value_pinner->DelegateCleanupsTo(pinned_iters_mgr()); + merge_context_->PushOperand(value, true /*value_pinned*/); + } else { + merge_context_->PushOperand(value, false); + } +} + void replayGetContextLog(const Slice& replay_log, const Slice& user_key, GetContext* get_context, Cleanable* value_pinner) { #ifndef ROCKSDB_LITE diff --git a/table/get_context.h b/table/get_context.h index 7ed316f0e1a..8a2f24464bc 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -5,18 +5,21 @@ #pragma once #include -#include +#include "db/dbformat.h" #include "db/merge_context.h" #include "db/read_callback.h" #include "rocksdb/env.h" #include "rocksdb/statistics.h" #include "rocksdb/types.h" -#include "table/block.h" +#include "table/block_based/block.h" namespace rocksdb { class MergeContext; class PinnedIteratorsManager; +// Data structure for accumulating statistics during a point lookup. 
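The do_merge == false paths above exist for the GetMergeOperands use case called out in the comments: instead of collapsing operands with the merge operator, SaveValue() stashes each one in the MergeContext so the caller receives them verbatim. A hedged usage sketch from the application side (the exact GetMergeOperands/GetMergeOperandsOptions signatures are assumed from the public API this plumbing supports):

#include <vector>
#include "rocksdb/db.h"

// Fetch the raw, unmerged operands stored for `key`.
rocksdb::Status DumpMergeOperands(rocksdb::DB* db, const rocksdb::Slice& key) {
  rocksdb::GetMergeOperandsOptions opts;
  opts.expected_max_number_of_operands = 16;

  std::vector<rocksdb::PinnableSlice> operands(
      opts.expected_max_number_of_operands);
  int num_operands = 0;
  rocksdb::Status s = db->GetMergeOperands(
      rocksdb::ReadOptions(), db->DefaultColumnFamily(), key, operands.data(),
      &opts, &num_operands);
  // On success, operands[0..num_operands) hold the operands in the order they
  // would otherwise be fed to the merge operator.
  return s;
}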
At the +// end of the point lookup, the corresponding ticker stats are updated. This +// avoids the overhead of frequent ticker stats updates struct GetContextStats { uint64_t num_cache_hit = 0; uint64_t num_cache_index_hit = 0; @@ -41,8 +44,17 @@ struct GetContextStats { uint64_t num_cache_compression_dict_bytes_insert = 0; }; +// A class to hold context about a point lookup, such as pointer to value +// slice, key, merge context etc, as well as the current state of the +// lookup. Any user using GetContext to track the lookup result must call +// SaveValue() whenever the internal key is found. This can happen +// repeatedly in case of merge operands. In case the key may exist with +// high probability, but IO is required to confirm and the user doesn't allow +// it, MarkKeyMayExist() must be called instead of SaveValue(). class GetContext { public: + // Current state of the point lookup. All except kNotFound and kMerge are + // terminal states enum GetState { kNotFound, kFound, @@ -53,24 +65,47 @@ class GetContext { }; GetContextStats get_context_stats_; + // Constructor + // @param value Holds the value corresponding to user_key. If its nullptr + // then return all merge operands corresponding to user_key + // via merge_context + // @param value_found If non-nullptr, set to false if key may be present + // but we can't be certain because we cannot do IO + // @param max_covering_tombstone_seq Pointer to highest sequence number of + // range deletion covering the key. When an internal key + // is found with smaller sequence number, the lookup + // terminates + // @param seq If non-nullptr, the sequence number of the found key will be + // saved here + // @param callback Pointer to ReadCallback to perform additional checks + // for visibility of a key + // @param is_blob_index If non-nullptr, will be used to indicate if a found + // key is of type blob index + // @param do_merge True if value associated with user_key has to be returned + // and false if all the merge operands associated with user_key has to be + // returned. Id do_merge=false then all the merge operands are stored in + // merge_context and they are never merged. The value pointer is untouched. GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, const Slice& user_key, PinnableSlice* value, bool* value_found, - MergeContext* merge_context, + MergeContext* merge_context, bool do_merge, SequenceNumber* max_covering_tombstone_seq, Env* env, SequenceNumber* seq = nullptr, PinnedIteratorsManager* _pinned_iters_mgr = nullptr, - ReadCallback* callback = nullptr, bool* is_blob_index = nullptr); + ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, + uint64_t tracing_get_id = 0); - GetContext() = default; + GetContext() = delete; + // This can be called to indicate that a key may be present, but cannot be + // confirmed due to IO not allowed void MarkKeyMayExist(); // Records this key, value, and any meta-data (such as sequence number and // state) into this GetContext. // // If the parsed_key matches the user key that we are looking for, sets - // mathced to true. + // matched to true. // // Returns True if more keys need to be read (due to merges) or // False if the complete value has been found. 
@@ -108,6 +143,12 @@ class GetContext { void ReportCounters(); + bool has_callback() const { return callback_ != nullptr; } + + uint64_t get_tracing_get_id() const { return tracing_get_id_; } + + void push_operand(const Slice& value, Cleanable* value_pinner); + private: const Comparator* ucmp_; const MergeOperator* merge_operator_; @@ -130,9 +171,19 @@ class GetContext { PinnedIteratorsManager* pinned_iters_mgr_; ReadCallback* callback_; bool sample_; + // Value is true if it's called as part of DB Get API and false if it's + // called as part of DB GetMergeOperands API. When it's false merge operators + // are never merged. + bool do_merge_; bool* is_blob_index_; + // Used for block cache tracing only. A tracing get id uniquely identifies a + // Get or a MultiGet. + const uint64_t tracing_get_id_; }; +// Call this to replay a log and bring the get_context up to date. The replay +// log must have been created by another GetContext object, whose replay log +// must have been set by calling GetContext::SetReplayLog(). void replayGetContextLog(const Slice& replay_log, const Slice& user_key, GetContext* get_context, Cleanable* value_pinner = nullptr); diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 6b713e7b951..d7940eeffa9 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -17,10 +17,20 @@ namespace rocksdb { class PinnedIteratorsManager; +struct IterateResult { + Slice key; + bool may_be_out_of_upper_bound; +}; + template class InternalIteratorBase : public Cleanable { public: InternalIteratorBase() {} + + // No copying allowed + InternalIteratorBase(const InternalIteratorBase&) = delete; + InternalIteratorBase& operator=(const InternalIteratorBase&) = delete; + virtual ~InternalIteratorBase() {} // An iterator is either positioned at a key/value pair, or @@ -54,11 +64,20 @@ class InternalIteratorBase : public Cleanable { // REQUIRES: Valid() virtual void Next() = 0; - virtual bool NextAndGetResult(Slice* ret_key) { + // Moves to the next entry in the source, and return result. Iterator + // implementation should override this method to help methods inline better, + // or when MayBeOutOfUpperBound() is non-trivial. + // REQUIRES: Valid() + virtual bool NextAndGetResult(IterateResult* result) { Next(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + // Default may_be_out_of_upper_bound to true to avoid unnecessary virtual + // call. If an implementation has non-trivial MayBeOutOfUpperBound(), + // it should also override NextAndGetResult(). + result->may_be_out_of_upper_bound = true; + assert(MayBeOutOfUpperBound()); } return is_valid; } @@ -89,10 +108,20 @@ class InternalIteratorBase : public Cleanable { // satisfied without doing some IO, then this returns Status::Incomplete(). virtual Status status() const = 0; - // True if the iterator is invalidated because it is out of the iterator - // upper bound + // True if the iterator is invalidated because it reached a key that is above + // the iterator upper bound. Used by LevelIterator to decide whether it should + // stop or move on to the next file. + // Important: if iterator reached the end of the file without encountering any + // keys above the upper bound, IsOutOfBound() must return false. virtual bool IsOutOfBound() { return false; } + // Keys return from this iterator can be smaller than iterate_lower_bound. + virtual bool MayBeOutOfLowerBound() { return true; } + + // Keys return from this iterator can be larger or equal to + // iterate_upper_bound. 
+ virtual bool MayBeOutOfUpperBound() { return true; } + // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont // communicate with PinnedIteratorsManager so default implementation is no-op // but for Iterators that need to communicate with PinnedIteratorsManager @@ -131,10 +160,7 @@ class InternalIteratorBase : public Cleanable { } } - private: - // No copying allowed - InternalIteratorBase(const InternalIteratorBase&) = delete; - InternalIteratorBase& operator=(const InternalIteratorBase&) = delete; + bool is_mutable_; }; using InternalIterator = InternalIteratorBase; diff --git a/table/iterator.cc b/table/iterator.cc index 0475b9d1342..f6c7f9cec3f 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -8,9 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/iterator.h" +#include "memory/arena.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" -#include "util/arena.h" namespace rocksdb { @@ -167,7 +167,7 @@ template InternalIteratorBase* NewErrorInternalIterator(const Status& status) { return new EmptyInternalIterator(status); } -template InternalIteratorBase* NewErrorInternalIterator( +template InternalIteratorBase* NewErrorInternalIterator( const Status& status); template InternalIteratorBase* NewErrorInternalIterator( const Status& status); @@ -182,7 +182,7 @@ InternalIteratorBase* NewErrorInternalIterator(const Status& status, return new (mem) EmptyInternalIterator(status); } } -template InternalIteratorBase* NewErrorInternalIterator( +template InternalIteratorBase* NewErrorInternalIterator( const Status& status, Arena* arena); template InternalIteratorBase* NewErrorInternalIterator( const Status& status, Arena* arena); @@ -191,7 +191,7 @@ template InternalIteratorBase* NewEmptyInternalIterator() { return new EmptyInternalIterator(Status::OK()); } -template InternalIteratorBase* NewEmptyInternalIterator(); +template InternalIteratorBase* NewEmptyInternalIterator(); template InternalIteratorBase* NewEmptyInternalIterator(); template @@ -203,7 +203,7 @@ InternalIteratorBase* NewEmptyInternalIterator(Arena* arena) { return new (mem) EmptyInternalIterator(Status::OK()); } } -template InternalIteratorBase* NewEmptyInternalIterator( +template InternalIteratorBase* NewEmptyInternalIterator( Arena* arena); template InternalIteratorBase* NewEmptyInternalIterator(Arena* arena); diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index fc5eb2613d8..f8fdde565ec 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -12,6 +12,7 @@ #include #include "table/internal_iterator.h" +#include "test_util/sync_point.h" namespace rocksdb { @@ -56,7 +57,10 @@ class IteratorWrapperBase { // Iterator interface methods bool Valid() const { return valid_; } - Slice key() const { assert(Valid()); return key_; } + Slice key() const { + assert(Valid()); + return result_.key; + } TValue value() const { assert(Valid()); return iter_->value(); @@ -65,11 +69,15 @@ class IteratorWrapperBase { Status status() const { assert(iter_); return iter_->status(); } void Next() { assert(iter_); - valid_ = iter_->NextAndGetResult(&key_); + valid_ = iter_->NextAndGetResult(&result_); assert(!valid_ || iter_->status().ok()); } void Prev() { assert(iter_); iter_->Prev(); Update(); } - void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); } + void Seek(const Slice& k) { + assert(iter_); + iter_->Seek(k); + Update(); + } void SeekForPrev(const Slice& k) { assert(iter_); iter_->SeekForPrev(k); @@ 
-78,6 +86,16 @@ class IteratorWrapperBase { void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } + bool MayBeOutOfLowerBound() { + assert(Valid()); + return iter_->MayBeOutOfLowerBound(); + } + + bool MayBeOutOfUpperBound() { + assert(Valid()); + return result_.may_be_out_of_upper_bound; + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) { assert(iter_); iter_->SetPinnedItersMgr(pinned_iters_mgr); @@ -95,14 +113,15 @@ class IteratorWrapperBase { void Update() { valid_ = iter_->Valid(); if (valid_) { - key_ = iter_->key(); assert(iter_->status().ok()); + result_.key = iter_->key(); + result_.may_be_out_of_upper_bound = true; } } InternalIteratorBase* iter_; + IterateResult result_; bool valid_; - Slice key_; }; using IteratorWrapper = IteratorWrapperBase; diff --git a/table/merger_test.cc b/table/merger_test.cc index 1b04d065727..8efa2834db6 100644 --- a/table/merger_test.cc +++ b/table/merger_test.cc @@ -7,8 +7,8 @@ #include #include "table/merging_iterator.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index bd4a186b3c2..2ee379b052e 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -12,6 +12,7 @@ #include #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" +#include "memory/arena.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" @@ -19,11 +20,10 @@ #include "table/internal_iterator.h" #include "table/iter_heap.h" #include "table/iterator_wrapper.h" -#include "util/arena.h" +#include "test_util/sync_point.h" #include "util/autovector.h" #include "util/heap.h" #include "util/stop_watch.h" -#include "util/sync_point.h" namespace rocksdb { // Without anonymous namespace here, we fail the warning -Wmissing-prototypes @@ -51,12 +51,7 @@ class MergingIterator : public InternalIterator { children_[i].Set(children[i]); } for (auto& child : children_) { - if (child.Valid()) { - assert(child.status().ok()); - minHeap_.push(&child); - } else { - considerStatus(child.status()); - } + AddToMinHeapOrCheckStatus(&child); } current_ = CurrentForward(); } @@ -74,12 +69,9 @@ class MergingIterator : public InternalIterator { iter->SetPinnedItersMgr(pinned_iters_mgr_); } auto new_wrapper = children_.back(); + AddToMinHeapOrCheckStatus(&new_wrapper); if (new_wrapper.Valid()) { - assert(new_wrapper.status().ok()); - minHeap_.push(&new_wrapper); current_ = CurrentForward(); - } else { - considerStatus(new_wrapper.status()); } } @@ -98,12 +90,7 @@ class MergingIterator : public InternalIterator { status_ = Status::OK(); for (auto& child : children_) { child.SeekToFirst(); - if (child.Valid()) { - assert(child.status().ok()); - minHeap_.push(&child); - } else { - considerStatus(child.status()); - } + AddToMinHeapOrCheckStatus(&child); } direction_ = kForward; current_ = CurrentForward(); @@ -115,12 +102,7 @@ class MergingIterator : public InternalIterator { status_ = Status::OK(); for (auto& child : children_) { child.SeekToLast(); - if (child.Valid()) { - assert(child.status().ok()); - maxHeap_->push(&child); - } else { - considerStatus(child.status()); - } + AddToMaxHeapOrCheckStatus(&child); } direction_ = kReverse; current_ = CurrentReverse(); @@ -134,14 +116,13 @@ class MergingIterator : public InternalIterator { 
PERF_TIMER_GUARD(seek_child_seek_time); child.Seek(target); } - PERF_COUNTER_ADD(seek_child_seek_count, 1); - if (child.Valid()) { - assert(child.status().ok()); + PERF_COUNTER_ADD(seek_child_seek_count, 1); + { + // Strictly speaking, we timed slightly more than the min heap operation, + // but these operations are very cheap. PERF_TIMER_GUARD(seek_min_heap_time); - minHeap_.push(&child); - } else { - considerStatus(child.status()); + AddToMinHeapOrCheckStatus(&child); } } direction_ = kForward; @@ -163,12 +144,9 @@ class MergingIterator : public InternalIterator { } PERF_COUNTER_ADD(seek_child_seek_count, 1); - if (child.Valid()) { - assert(child.status().ok()); + { PERF_TIMER_GUARD(seek_max_heap_time); - maxHeap_->push(&child); - } else { - considerStatus(child.status()); + AddToMaxHeapOrCheckStatus(&child); } } direction_ = kReverse; @@ -212,6 +190,16 @@ class MergingIterator : public InternalIterator { current_ = CurrentForward(); } + bool NextAndGetResult(IterateResult* result) override { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + } + return is_valid; + } + void Prev() override { assert(Valid()); // Ensure that all children are positioned before key(). @@ -221,35 +209,7 @@ class MergingIterator : public InternalIterator { if (direction_ != kReverse) { // Otherwise, retreat the non-current children. We retreat current_ // just after the if-block. - ClearHeaps(); - InitMaxHeap(); - Slice target = key(); - for (auto& child : children_) { - if (&child != current_) { - child.SeekForPrev(target); - TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child); - considerStatus(child.status()); - if (child.Valid() && comparator_->Equal(target, child.key())) { - child.Prev(); - considerStatus(child.status()); - } - } - if (child.Valid()) { - assert(child.status().ok()); - maxHeap_->push(&child); - } - } - direction_ = kReverse; - if (!prefix_seek_mode_) { - // Note that we don't do assert(current_ == CurrentReverse()) here - // because it is possible to have some keys larger than the seek-key - // inserted between Seek() and SeekToLast(), which makes current_ not - // equal to CurrentReverse(). - current_ = CurrentReverse(); - } - // The loop advanced all non-current children to be < key() so current_ - // should still be strictly the smallest key. - assert(current_ == CurrentReverse()); + SwitchToBackward(); } // For the heap modifications below to be correct, current_ must be the @@ -281,6 +241,20 @@ class MergingIterator : public InternalIterator { return current_->value(); } + // Here we simply relay the MayBeOutOfLowerBound/MayBeOutOfUpperBound result + // from the current child iterator. As long as none of the child iterators can + // possibly be out of bound, we know the current key is within bound. + + bool MayBeOutOfLowerBound() override { + assert(Valid()); + return current_->MayBeOutOfLowerBound(); + } + + bool MayBeOutOfUpperBound() override { + assert(Valid()); + return current_->MayBeOutOfUpperBound(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; for (auto& child : children_) { @@ -331,8 +305,20 @@ class MergingIterator : public InternalIterator { std::unique_ptr maxHeap_; PinnedIteratorsManager* pinned_iters_mgr_; + // In forward direction, process a child that is not in the min heap. + // If valid, add to the min heap. Otherwise, check status.
+ void AddToMinHeapOrCheckStatus(IteratorWrapper*); + + // In backward direction, process a child that is not in the max heap. + // If valid, add to the max heap. Otherwise, check status. + void AddToMaxHeapOrCheckStatus(IteratorWrapper*); + void SwitchToForward(); + // Switch the direction from forward to backward without changing the + // position. Iterator should still be valid. + void SwitchToBackward(); + IteratorWrapper* CurrentForward() const { assert(direction_ == kForward); return !minHeap_.empty() ? minHeap_.top() : nullptr; @@ -345,6 +331,24 @@ class MergingIterator : public InternalIterator { } }; +void MergingIterator::AddToMinHeapOrCheckStatus(IteratorWrapper* child) { + if (child->Valid()) { + assert(child->status().ok()); + minHeap_.push(child); + } else { + considerStatus(child->status()); + } +} + +void MergingIterator::AddToMaxHeapOrCheckStatus(IteratorWrapper* child) { + if (child->Valid()) { + assert(child->status().ok()); + maxHeap_->push(child); + } else { + considerStatus(child->status()); + } +} + void MergingIterator::SwitchToForward() { // Otherwise, advance the non-current children. We advance current_ // just after the if-block. @@ -353,19 +357,42 @@ void MergingIterator::SwitchToForward() { for (auto& child : children_) { if (&child != current_) { child.Seek(target); - considerStatus(child.status()); if (child.Valid() && comparator_->Equal(target, child.key())) { + assert(child.status().ok()); child.Next(); - considerStatus(child.status()); } } - if (child.Valid()) { - minHeap_.push(&child); - } + AddToMinHeapOrCheckStatus(&child); } direction_ = kForward; } +void MergingIterator::SwitchToBackward() { + ClearHeaps(); + InitMaxHeap(); + Slice target = key(); + for (auto& child : children_) { + if (&child != current_) { + child.SeekForPrev(target); + TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child); + if (child.Valid() && comparator_->Equal(target, child.key())) { + assert(child.status().ok()); + child.Prev(); + } + } + AddToMaxHeapOrCheckStatus(&child); + } + direction_ = kReverse; + if (!prefix_seek_mode_) { + // Note that we don't do assert(current_ == CurrentReverse()) here + // because it is possible to have some keys larger than the seek-key + // inserted between Seek() and SeekToLast(), which makes current_ not + // equal to CurrentReverse().
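// Illustrative sketch, not part of this patch: the forward-direction min-heap
// discipline that AddToMinHeapOrCheckStatus() and SwitchToForward() maintain,
// reduced to plain STL containers (Child and GreaterByKey are made-up names).
// Every valid child sits in a min heap ordered by its current key; the heap
// top is the merged iterator's current entry, and advancing pops, steps and
// conditionally re-inserts that child.
#include <cassert>
#include <queue>
#include <string>
#include <vector>

struct Child {
  std::vector<std::string> keys;  // already sorted
  size_t pos = 0;
  bool Valid() const { return pos < keys.size(); }
  const std::string& key() const { return keys[pos]; }
  void Next() { ++pos; }
};

struct GreaterByKey {
  bool operator()(const Child* a, const Child* b) const {
    return a->key() > b->key();  // invert so priority_queue acts as a min heap
  }
};

int main() {
  Child c1, c2;
  c1.keys = {"a", "d"};
  c2.keys = {"b", "c"};
  std::priority_queue<Child*, std::vector<Child*>, GreaterByKey> min_heap;
  for (Child* child : {&c1, &c2}) {
    if (child->Valid()) {
      min_heap.push(child);  // analogous to AddToMinHeapOrCheckStatus()
    }
  }
  std::string merged;
  while (!min_heap.empty()) {
    Child* top = min_heap.top();
    min_heap.pop();
    merged += top->key();
    top->Next();
    if (top->Valid()) {
      min_heap.push(top);
    }
  }
  assert(merged == "abcd");
  return 0;
}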
+ current_ = CurrentReverse(); + } + assert(current_ == CurrentReverse()); +} + void MergingIterator::ClearHeaps() { minHeap_.clear(); if (maxHeap_) { diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 3f48095c55b..1ba52d6e1c7 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -7,18 +7,18 @@ #include #include +#include "block_fetcher.h" #include "db/table_properties_collector.h" +#include "file/random_access_file_reader.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" -#include "table/block.h" -#include "table/block_fetcher.h" +#include "table/block_based/block.h" #include "table/format.h" #include "table/internal_iterator.h" #include "table/persistent_cache_helper.h" #include "table/table_properties_internal.h" +#include "test_util/sync_point.h" #include "util/coding.h" -#include "util/file_reader_writer.h" -#include "util/sync_point.h" namespace rocksdb { @@ -216,7 +216,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, BlockFetcher block_fetcher( file, prefetch_buffer, footer, read_options, handle, &block_contents, ioptions, false /* decompress */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + BlockType::kProperties, UncompressionDict::GetEmptyDict(), cache_options, + memory_allocator); s = block_fetcher.ReadBlockContents(); // property block is never compressed. Need to add uncompress logic if we are // to compress it.. @@ -228,8 +229,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, Block properties_block(std::move(block_contents), kDisableGlobalSequenceNumber); DataBlockIter iter; - properties_block.NewIterator(BytewiseComparator(), - BytewiseComparator(), &iter); + properties_block.NewDataIterator(BytewiseComparator(), BytewiseComparator(), + &iter); auto new_table_properties = new TableProperties(); // All pre-defined properties of type uint64_t @@ -375,8 +376,8 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, BlockFetcher block_fetcher( file, nullptr /* prefetch_buffer */, footer, read_options, metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, - false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, memory_allocator); + false /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; @@ -385,9 +386,8 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, // are to compress it. 
Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); - std::unique_ptr meta_iter( - metaindex_block.NewIterator(BytewiseComparator(), - BytewiseComparator())); + std::unique_ptr meta_iter(metaindex_block.NewDataIterator( + BytewiseComparator(), BytewiseComparator())); // -- Read property block bool found_properties_block = true; @@ -446,7 +446,8 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, file, nullptr /* prefetch_buffer */, footer, read_options, metaindex_handle, &metaindex_contents, ioptions, false /* do decompression */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), cache_options, + memory_allocator); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; @@ -457,8 +458,8 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, kDisableGlobalSequenceNumber); std::unique_ptr meta_iter; - meta_iter.reset(metaindex_block.NewIterator( - BytewiseComparator(), BytewiseComparator())); + meta_iter.reset(metaindex_block.NewDataIterator(BytewiseComparator(), + BytewiseComparator())); return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle); } @@ -467,7 +468,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, const ImmutableCFOptions& ioptions, - const std::string& meta_block_name, + const std::string& meta_block_name, BlockType block_type, BlockContents* contents, bool /*compression_type_missing*/, MemoryAllocator* memory_allocator) { Status status; @@ -485,11 +486,11 @@ Status ReadMetaBlock(RandomAccessFileReader* file, read_options.verify_checksums = false; PersistentCacheOptions cache_options; - BlockFetcher block_fetcher(file, prefetch_buffer, footer, read_options, - metaindex_handle, &metaindex_contents, ioptions, - false /* decompress */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), cache_options, - memory_allocator); + BlockFetcher block_fetcher( + file, prefetch_buffer, footer, read_options, metaindex_handle, + &metaindex_contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); status = block_fetcher.ReadBlockContents(); if (!status.ok()) { return status; @@ -502,8 +503,8 @@ Status ReadMetaBlock(RandomAccessFileReader* file, kDisableGlobalSequenceNumber); std::unique_ptr meta_iter; - meta_iter.reset(metaindex_block.NewIterator( - BytewiseComparator(), BytewiseComparator())); + meta_iter.reset(metaindex_block.NewDataIterator(BytewiseComparator(), + BytewiseComparator())); BlockHandle block_handle; status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); @@ -515,7 +516,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, // Reading metablock BlockFetcher block_fetcher2( file, prefetch_buffer, footer, read_options, block_handle, contents, - ioptions, false /* decompress */, false /*maybe_compressed*/, + ioptions, false /* decompress */, false /*maybe_compressed*/, block_type, UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); return block_fetcher2.ReadBlockContents(); } diff --git a/table/meta_blocks.h b/table/meta_blocks.h index 6efd1225e19..63d66497f63 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -15,7 +15,8 @@ #include "rocksdb/memory_allocator.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" 
-#include "table/block_builder.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/block_type.h" #include "table/format.h" #include "util/kv_map.h" @@ -88,7 +89,7 @@ void NotifyCollectTableCollectorsOnBlockAdd( uint64_t blockRawBytes, uint64_t blockCompressedBytesFast, uint64_t blockCompressedBytesSlow); -// NotifyCollectTableCollectorsOnAdd() triggers the `Finish` event for all +// NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all // property collectors. The collected properties will be added to `builder`. bool NotifyCollectTableCollectorsOnFinish( const std::vector>& collectors, @@ -143,7 +144,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, const ImmutableCFOptions& ioptions, - const std::string& meta_block_name, + const std::string& meta_block_name, BlockType block_type, BlockContents* contents, bool compression_type_missing = false, MemoryAllocator* memory_allocator = nullptr); diff --git a/table/mock_table.cc b/table/mock_table.cc index 9b250604803..50cbb202475 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -6,11 +6,11 @@ #include "table/mock_table.h" #include "db/dbformat.h" +#include "file/random_access_file_reader.h" #include "port/port.h" #include "rocksdb/table_properties.h" #include "table/get_context.h" #include "util/coding.h" -#include "util/file_reader_writer.h" namespace rocksdb { namespace mock { @@ -21,12 +21,6 @@ const InternalKeyComparator icmp_(BytewiseComparator()); } // namespace -stl_wrappers::KVMap MakeMockFile( - std::vector> l) { - return stl_wrappers::KVMap(l.begin(), l.end(), - stl_wrappers::LessOfComparator(&icmp_)); -} - stl_wrappers::KVMap MakeMockFile( std::initializer_list> l) { return stl_wrappers::KVMap(l, stl_wrappers::LessOfComparator(&icmp_)); @@ -34,7 +28,8 @@ stl_wrappers::KVMap MakeMockFile( InternalIterator* MockTableReader::NewIterator( const ReadOptions&, const SliceTransform* /* prefix_extractor */, - Arena* /*arena*/, bool /*skip_filters*/, bool /*for_compaction*/) { + Arena* /*arena*/, bool /*skip_filters*/, TableReaderCaller /*caller*/, + size_t /*compaction_readahead_size*/) { return new MockTableIterator(table_); } @@ -143,14 +138,6 @@ void MockTableFactory::AssertLatestFile( ParseInternalKey(Slice(key), &ikey); std::cout << ikey.DebugString(false) << " -> " << value << std::endl; } - std::cout << "Expected:" << std::endl; - for (const auto& kv : file_contents) { - ParsedInternalKey ikey; - std::string key, value; - std::tie(key, value) = kv; - ParseInternalKey(Slice(key), &ikey); - std::cout << ikey.DebugString(false) << " -> " << value << std::endl; - } FAIL(); } } diff --git a/table/mock_table.h b/table/mock_table.h index 5bca14644d8..81d178810f2 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -12,24 +12,22 @@ #include #include -#include "util/kv_map.h" #include "port/port.h" #include "rocksdb/comparator.h" #include "rocksdb/table.h" #include "table/internal_iterator.h" #include "table/table_builder.h" #include "table/table_reader.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/kv_map.h" #include "util/mutexlock.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { namespace mock { stl_wrappers::KVMap MakeMockFile( std::initializer_list> l = {}); -stl_wrappers::KVMap MakeMockFile( - std::vector> l); struct MockTableFileSystem { port::Mutex mutex; @@ -42,17 +40,25 @@ class MockTableReader : public 
TableReader { InternalIterator* NewIterator(const ReadOptions&, const SliceTransform* prefix_extractor, - Arena* arena = nullptr, - bool skip_filters = false, - bool for_compaction = false) override; + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0) override; Status Get(const ReadOptions& readOptions, const Slice& key, GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; - uint64_t ApproximateOffsetOf(const Slice& /*key*/) override { return 0; } + uint64_t ApproximateOffsetOf(const Slice& /*key*/, + TableReaderCaller /*caller*/) override { + return 0; + } - virtual size_t ApproximateMemoryUsage() const override { return 0; } + uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + TableReaderCaller /*caller*/) override { + return 0; + } + + size_t ApproximateMemoryUsage() const override { return 0; } void SetupForCompaction() override {} @@ -186,12 +192,6 @@ class MockTableFactory : public TableFactory { // contents are equal to file_contents void AssertSingleFile(const stl_wrappers::KVMap& file_contents); void AssertLatestFile(const stl_wrappers::KVMap& file_contents); - stl_wrappers::KVMap output() { - assert(!file_system_.files.empty()); - auto latest = file_system_.files.end(); - --latest; - return latest->second; - } private: uint32_t GetAndWriteNextID(WritableFileWriter* file) const; diff --git a/table/multiget_context.h b/table/multiget_context.h index d3a8d09463b..8b5b607b3bf 100644 --- a/table/multiget_context.h +++ b/table/multiget_context.h @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include "db/lookup_key.h" #include "db/merge_context.h" @@ -21,22 +22,23 @@ struct KeyContext { LookupKey* lkey; Slice ukey; Slice ikey; + ColumnFamilyHandle* column_family; Status* s; MergeContext merge_context; SequenceNumber max_covering_tombstone_seq; bool key_exists; - SequenceNumber seq; void* cb_arg; PinnableSlice* value; GetContext* get_context; - KeyContext(const Slice& user_key, PinnableSlice* val, Status* stat) + KeyContext(ColumnFamilyHandle* col_family, const Slice& user_key, + PinnableSlice* val, Status* stat) : key(&user_key), lkey(nullptr), + column_family(col_family), s(stat), max_covering_tombstone_seq(0), key_exists(false), - seq(0), cb_arg(nullptr), value(val), get_context(nullptr) {} @@ -87,10 +89,9 @@ class MultiGetContext { // htat need to be performed static const int MAX_BATCH_SIZE = 32; - MultiGetContext(KeyContext** sorted_keys, size_t num_keys, - SequenceNumber snapshot) - : sorted_keys_(sorted_keys), - num_keys_(num_keys), + MultiGetContext(autovector* sorted_keys, + size_t begin, size_t num_keys, SequenceNumber snapshot) + : num_keys_(num_keys), value_mask_(0), lookup_key_ptr_(reinterpret_cast(lookup_key_stack_buf)) { int index = 0; @@ -102,6 +103,8 @@ class MultiGetContext { } for (size_t iter = 0; iter != num_keys_; ++iter) { + // autovector may not be contiguous storage, so make a copy + sorted_keys_[iter] = (*sorted_keys)[begin + iter]; sorted_keys_[iter]->lkey = new (&lookup_key_ptr_[index]) LookupKey(*sorted_keys_[iter]->key, snapshot); sorted_keys_[iter]->ukey = sorted_keys_[iter]->lkey->user_key(); @@ -120,10 +123,10 @@ class MultiGetContext { static const int MAX_LOOKUP_KEYS_ON_STACK = 16; alignas(alignof(LookupKey)) char lookup_key_stack_buf[sizeof(LookupKey) * MAX_LOOKUP_KEYS_ON_STACK]; - KeyContext** sorted_keys_; + std::array sorted_keys_; size_t num_keys_; uint64_t value_mask_; - std::unique_ptr lookup_key_heap_buf; + 
std::unique_ptr lookup_key_heap_buf; LookupKey* lookup_key_ptr_; public: @@ -231,6 +234,16 @@ class MultiGetContext { return ctx_->value_mask_ & (1ull << iter.index_); } + uint64_t KeysLeft() { + uint64_t new_val = skip_mask_ | ctx_->value_mask_; + uint64_t count = 0; + while (new_val) { + new_val = new_val & (new_val - 1); + count++; + } + return end_ - count; + } + private: friend MultiGetContext; MultiGetContext* ctx_; diff --git a/table/partitioned_filter_block.cc b/table/partitioned_filter_block.cc deleted file mode 100644 index aab0f5509b9..00000000000 --- a/table/partitioned_filter_block.cc +++ /dev/null @@ -1,355 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#include "table/partitioned_filter_block.h" - -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef OS_FREEBSD -#include -#else -#include -#endif -#endif -#include - -#include "monitoring/perf_context_imp.h" -#include "port/port.h" -#include "rocksdb/filter_policy.h" -#include "table/block.h" -#include "table/block_based_table_reader.h" -#include "util/coding.h" - -namespace rocksdb { - -PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( - const SliceTransform* prefix_extractor, bool whole_key_filtering, - FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, - const bool use_value_delta_encoding, - PartitionedIndexBuilder* const p_index_builder, - const uint32_t partition_size) - : FullFilterBlockBuilder(prefix_extractor, whole_key_filtering, - filter_bits_builder), - index_on_filter_block_builder_(index_block_restart_interval, - true /*use_delta_encoding*/, - use_value_delta_encoding), - index_on_filter_block_builder_without_seq_(index_block_restart_interval, - true /*use_delta_encoding*/, - use_value_delta_encoding), - p_index_builder_(p_index_builder), - filters_in_partition_(0), - num_added_(0) { - filters_per_partition_ = - filter_bits_builder_->CalculateNumEntry(partition_size); -} - -PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {} - -void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() { - // Use == to send the request only once - if (filters_in_partition_ == filters_per_partition_) { - // Currently only index builder is in charge of cutting a partition. We keep - // requesting until it is granted. 
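// Illustrative sketch, not part of this patch: the bit-counting loop used by
// the KeysLeft() method added to multiget_context.h above. Clearing the lowest
// set bit once per iteration counts the bits set in the combined skip/value
// mask, and the keys still pending are the range size minus that count
// (CountSetBits is a made-up helper name).
#include <cassert>
#include <cstdint>

static uint64_t CountSetBits(uint64_t v) {
  uint64_t count = 0;
  while (v) {
    v &= v - 1;  // clears exactly one set bit per iteration
    ++count;
  }
  return count;
}

int main() {
  const uint64_t mask = 0xBull;  // 0b1011: three keys already skipped or found
  assert(CountSetBits(mask) == 3);
  assert(8 - CountSetBits(mask) == 5);  // 8-key range, 5 keys left to look up
  return 0;
}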
- p_index_builder_->RequestPartitionCut(); - } - if (!p_index_builder_->ShouldCutFilterBlock()) { - return; - } - filter_gc.push_back(std::unique_ptr(nullptr)); - Slice filter = filter_bits_builder_->Finish(&filter_gc.back()); - std::string& index_key = p_index_builder_->GetPartitionKey(); - filters.push_back({index_key, filter}); - filters_in_partition_ = 0; - Reset(); -} - -void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { - MaybeCutAFilterBlock(); - filter_bits_builder_->AddKey(key); - filters_in_partition_++; - num_added_++; -} - -Slice PartitionedFilterBlockBuilder::Finish( - const BlockHandle& last_partition_block_handle, Status* status) { - if (finishing_filters == true) { - // Record the handle of the last written filter block in the index - FilterEntry& last_entry = filters.front(); - std::string handle_encoding; - last_partition_block_handle.EncodeTo(&handle_encoding); - std::string handle_delta_encoding; - PutVarsignedint64( - &handle_delta_encoding, - last_partition_block_handle.size() - last_encoded_handle_.size()); - last_encoded_handle_ = last_partition_block_handle; - const Slice handle_delta_encoding_slice(handle_delta_encoding); - index_on_filter_block_builder_.Add(last_entry.key, handle_encoding, - &handle_delta_encoding_slice); - if (!p_index_builder_->seperator_is_key_plus_seq()) { - index_on_filter_block_builder_without_seq_.Add( - ExtractUserKey(last_entry.key), handle_encoding, - &handle_delta_encoding_slice); - } - filters.pop_front(); - } else { - MaybeCutAFilterBlock(); - } - // If there is no filter partition left, then return the index on filter - // partitions - if (UNLIKELY(filters.empty())) { - *status = Status::OK(); - if (finishing_filters) { - if (p_index_builder_->seperator_is_key_plus_seq()) { - return index_on_filter_block_builder_.Finish(); - } else { - return index_on_filter_block_builder_without_seq_.Finish(); - } - } else { - // This is the rare case where no key was added to the filter - return Slice(); - } - } else { - // Return the next filter partition in line and set Incomplete() status to - // indicate we expect more calls to Finish - *status = Status::Incomplete(); - finishing_filters = true; - return filters.front().filter; - } -} - -PartitionedFilterBlockReader::PartitionedFilterBlockReader( - const SliceTransform* prefix_extractor, bool _whole_key_filtering, - BlockContents&& contents, FilterBitsReader* /*filter_bits_reader*/, - Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq, - const bool index_value_is_full) - : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering), - prefix_extractor_(prefix_extractor), - comparator_(comparator), - table_(table), - index_key_includes_seq_(index_key_includes_seq), - index_value_is_full_(index_value_is_full) { - idx_on_fltr_blk_.reset(new Block(std::move(contents), - kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, stats)); -} - -PartitionedFilterBlockReader::~PartitionedFilterBlockReader() { - // TODO(myabandeh): if instead of filter object we store only the blocks in - // block cache, then we don't have to manually earse them from block cache - // here. 
- auto block_cache = table_->rep_->table_options.block_cache.get(); - if (UNLIKELY(block_cache == nullptr)) { - return; - } - char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - IndexBlockIter biter; - BlockHandle handle; - Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIterator( - &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); - biter.SeekToFirst(); - for (; biter.Valid(); biter.Next()) { - handle = biter.value(); - auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix, - table_->rep_->cache_key_prefix_size, - handle, cache_key); - block_cache->Erase(key); - } -} - -bool PartitionedFilterBlockReader::KeyMayMatch( - const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr) { - assert(const_ikey_ptr != nullptr); - assert(block_offset == kNotValid); - if (!whole_key_filtering_) { - return true; - } - if (UNLIKELY(idx_on_fltr_blk_->size() == 0)) { - return true; - } - auto filter_handle = GetFilterPartitionHandle(*const_ikey_ptr); - if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range - return false; - } - bool cached = false; - auto filter_partition = - GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, - &cached, prefix_extractor); - if (UNLIKELY(!filter_partition.value)) { - return true; - } - auto res = filter_partition.value->KeyMayMatch(key, prefix_extractor, - block_offset, no_io); - if (cached) { - return res; - } - if (LIKELY(filter_partition.IsSet())) { - filter_partition.Release(table_->rep_->table_options.block_cache.get()); - } else { - delete filter_partition.value; - } - return res; -} - -bool PartitionedFilterBlockReader::PrefixMayMatch( - const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr) { -#ifdef NDEBUG - (void)block_offset; -#endif - assert(const_ikey_ptr != nullptr); - assert(block_offset == kNotValid); - if (!prefix_extractor_ && !prefix_extractor) { - return true; - } - if (UNLIKELY(idx_on_fltr_blk_->size() == 0)) { - return true; - } - auto filter_handle = GetFilterPartitionHandle(*const_ikey_ptr); - if (UNLIKELY(filter_handle.size() == 0)) { // prefix is out of range - return false; - } - bool cached = false; - auto filter_partition = - GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, - &cached, prefix_extractor); - if (UNLIKELY(!filter_partition.value)) { - return true; - } - auto res = filter_partition.value->PrefixMayMatch(prefix, prefix_extractor, - kNotValid, no_io); - if (cached) { - return res; - } - if (LIKELY(filter_partition.IsSet())) { - filter_partition.Release(table_->rep_->table_options.block_cache.get()); - } else { - delete filter_partition.value; - } - return res; -} - -BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( - const Slice& entry) { - IndexBlockIter iter; - Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIterator( - &comparator_, comparator_.user_comparator(), &iter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); - iter.Seek(entry); - if (UNLIKELY(!iter.Valid())) { - return BlockHandle(0, 0); - } - assert(iter.Valid()); - BlockHandle fltr_blk_handle = iter.value(); - return fltr_blk_handle; -} - -BlockBasedTable::CachableEntry -PartitionedFilterBlockReader::GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle, 
- const bool no_io, bool* cached, const SliceTransform* prefix_extractor) { - const bool is_a_filter_partition = true; - auto block_cache = table_->rep_->table_options.block_cache.get(); - if (LIKELY(block_cache != nullptr)) { - if (filter_map_.size() != 0) { - auto iter = filter_map_.find(fltr_blk_handle.offset()); - // This is a possible scenario since block cache might not have had space - // for the partition - if (iter != filter_map_.end()) { - PERF_COUNTER_ADD(block_cache_hit_count, 1); - RecordTick(statistics(), BLOCK_CACHE_FILTER_HIT); - RecordTick(statistics(), BLOCK_CACHE_HIT); - RecordTick(statistics(), BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(iter->second.cache_handle)); - *cached = true; - return iter->second; - } - } - return table_->GetFilter(/*prefetch_buffer*/ nullptr, fltr_blk_handle, - is_a_filter_partition, no_io, - /* get_context */ nullptr, prefix_extractor); - } else { - auto filter = table_->ReadFilter(prefetch_buffer, fltr_blk_handle, - is_a_filter_partition, prefix_extractor); - return {filter, nullptr}; - } -} - -size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { - size_t usage = idx_on_fltr_blk_->usable_size(); -#ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); -#else - usage += sizeof(*this); -#endif // ROCKSDB_MALLOC_USABLE_SIZE - return usage; - // TODO(myabandeh): better estimation for filter_map_ size -} - -// Release the cached entry and decrement its ref count. -void ReleaseFilterCachedEntry(void* arg, void* h) { - Cache* cache = reinterpret_cast(arg); - Cache::Handle* handle = reinterpret_cast(h); - cache->Release(handle); -} - -// TODO(myabandeh): merge this with the same function in IndexReader -void PartitionedFilterBlockReader::CacheDependencies( - bool pin, const SliceTransform* prefix_extractor) { - // Before read partitions, prefetch them to avoid lots of IOs - auto rep = table_->rep_; - IndexBlockIter biter; - Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIterator( - &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); - // Index partitions are assumed to be consecuitive. Prefetch them all. 
- // Read the first block offset - biter.SeekToFirst(); - BlockHandle handle = biter.value(); - uint64_t prefetch_off = handle.offset(); - - // Read the last block's offset - biter.SeekToLast(); - handle = biter.value(); - uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; - uint64_t prefetch_len = last_off - prefetch_off; - std::unique_ptr prefetch_buffer; - auto& file = table_->rep_->file; - prefetch_buffer.reset(new FilePrefetchBuffer()); - Status s; - s = prefetch_buffer->Prefetch(file.get(), prefetch_off, - static_cast(prefetch_len)); - - // After prefetch, read the partitions one by one - biter.SeekToFirst(); - Cache* block_cache = rep->table_options.block_cache.get(); - for (; biter.Valid(); biter.Next()) { - handle = biter.value(); - const bool no_io = true; - const bool is_a_filter_partition = true; - auto filter = table_->GetFilter( - prefetch_buffer.get(), handle, is_a_filter_partition, !no_io, - /* get_context */ nullptr, prefix_extractor); - if (LIKELY(filter.IsSet())) { - if (pin) { - filter_map_[handle.offset()] = std::move(filter); - RegisterCleanup(&ReleaseFilterCachedEntry, block_cache, - filter.cache_handle); - } else { - block_cache->Release(filter.cache_handle); - } - } else { - delete filter.value; - } - } -} - -} // namespace rocksdb diff --git a/table/partitioned_filter_block.h b/table/partitioned_filter_block.h deleted file mode 100644 index 5d55da54493..00000000000 --- a/table/partitioned_filter_block.h +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#pragma once - -#include -#include -#include -#include "db/dbformat.h" -#include "rocksdb/options.h" -#include "rocksdb/slice.h" -#include "rocksdb/slice_transform.h" - -#include "table/block.h" -#include "table/block_based_table_reader.h" -#include "table/full_filter_block.h" -#include "table/index_builder.h" -#include "util/autovector.h" - -namespace rocksdb { - -class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { - public: - explicit PartitionedFilterBlockBuilder( - const SliceTransform* prefix_extractor, bool whole_key_filtering, - FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, - const bool use_value_delta_encoding, - PartitionedIndexBuilder* const p_index_builder, - const uint32_t partition_size); - - virtual ~PartitionedFilterBlockBuilder(); - - void AddKey(const Slice& key) override; - - size_t NumAdded() const override { return num_added_; } - - virtual Slice Finish(const BlockHandle& last_partition_block_handle, - Status* status) override; - - private: - // Filter data - BlockBuilder index_on_filter_block_builder_; // top-level index builder - BlockBuilder - index_on_filter_block_builder_without_seq_; // same for user keys - struct FilterEntry { - std::string key; - Slice filter; - }; - std::list filters; // list of partitioned indexes and their keys - std::unique_ptr value; - std::vector> filter_gc; - bool finishing_filters = - false; // true if Finish is called once but not complete yet. - // The policy of when cut a filter block and Finish it - void MaybeCutAFilterBlock(); - // Currently we keep the same number of partitions for filters and indexes. - // This would allow for some potentioal optimizations in future. 
If such - // optimizations did not realize we can use different number of partitions and - // eliminate p_index_builder_ - PartitionedIndexBuilder* const p_index_builder_; - // The desired number of filters per partition - uint32_t filters_per_partition_; - // The current number of filters in the last partition - uint32_t filters_in_partition_; - // Number of keys added - size_t num_added_; - BlockHandle last_encoded_handle_; -}; - -class PartitionedFilterBlockReader : public FilterBlockReader, - public Cleanable { - public: - explicit PartitionedFilterBlockReader( - const SliceTransform* prefix_extractor, bool whole_key_filtering, - BlockContents&& contents, FilterBitsReader* filter_bits_reader, - Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq, - const bool index_value_is_full); - virtual ~PartitionedFilterBlockReader(); - - virtual bool IsBlockBased() override { return false; } - virtual bool KeyMayMatch( - const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - virtual bool PrefixMayMatch( - const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - virtual size_t ApproximateMemoryUsage() const override; - - private: - BlockHandle GetFilterPartitionHandle(const Slice& entry); - BlockBasedTable::CachableEntry GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, BlockHandle& handle, - const bool no_io, bool* cached, - const SliceTransform* prefix_extractor = nullptr); - virtual void CacheDependencies( - bool bin, const SliceTransform* prefix_extractor) override; - - const SliceTransform* prefix_extractor_; - std::unique_ptr idx_on_fltr_blk_; - const InternalKeyComparator comparator_; - const BlockBasedTable* table_; - const bool index_key_includes_seq_; - const bool index_value_is_full_; - std::unordered_map> - filter_map_; -}; - -} // namespace rocksdb diff --git a/table/persistent_cache_helper.cc b/table/persistent_cache_helper.cc index 4e90697a6e5..8431f13db37 100644 --- a/table/persistent_cache_helper.cc +++ b/table/persistent_cache_helper.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #include "table/persistent_cache_helper.h" -#include "table/block_based_table_reader.h" +#include "table/block_based/block_based_table_reader.h" #include "table/format.h" namespace rocksdb { diff --git a/table/plain/plain_table_bloom.cc b/table/plain/plain_table_bloom.cc new file mode 100644 index 00000000000..0de541b5685 --- /dev/null +++ b/table/plain/plain_table_bloom.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/plain/plain_table_bloom.h" + +#include +#include +#include "util/dynamic_bloom.h" + +#include "memory/allocator.h" + +namespace rocksdb { + +namespace { + +uint32_t GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_blocks = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + + // Make num_blocks an odd number to make sure more bits are involved + // when determining which block. 
+ if (num_blocks % 2 == 0) { + num_blocks++; + } + + return num_blocks * (CACHE_LINE_SIZE * 8); +} +} // namespace + +PlainTableBloomV1::PlainTableBloomV1(uint32_t num_probes) + : kTotalBits(0), kNumBlocks(0), kNumProbes(num_probes), data_(nullptr) {} + +void PlainTableBloomV1::SetRawData(char* raw_data, uint32_t total_bits, + uint32_t num_blocks) { + data_ = raw_data; + kTotalBits = total_bits; + kNumBlocks = num_blocks; +} + +void PlainTableBloomV1::SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, + size_t huge_page_tlb_size, + Logger* logger) { + kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits) + : (total_bits + 7) / 8 * 8; + kNumBlocks = (locality > 0) ? (kTotalBits / (CACHE_LINE_SIZE * 8)) : 0; + + assert(kNumBlocks > 0 || kTotalBits > 0); + assert(kNumProbes > 0); + + uint32_t sz = kTotalBits / 8; + if (kNumBlocks > 0) { + sz += CACHE_LINE_SIZE - 1; + } + assert(allocator); + + char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger); + memset(raw, 0, sz); + auto cache_line_offset = reinterpret_cast(raw) % CACHE_LINE_SIZE; + if (kNumBlocks > 0 && cache_line_offset > 0) { + raw += CACHE_LINE_SIZE - cache_line_offset; + } + data_ = raw; +} + +void BloomBlockBuilder::AddKeysHashes( + const std::vector& keys_hashes) { + for (auto hash : keys_hashes) { + bloom_.AddHash(hash); + } +} + +Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); } + +const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock"; +} // namespace rocksdb diff --git a/table/plain/plain_table_bloom.h b/table/plain/plain_table_bloom.h new file mode 100644 index 00000000000..8da256b3bb8 --- /dev/null +++ b/table/plain/plain_table_bloom.h @@ -0,0 +1,135 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include + +#include "rocksdb/slice.h" + +#include "port/port.h" +#include "util/bloom_impl.h" +#include "util/hash.h" + +#include "third-party/folly/folly/ConstexprMath.h" + +#include + +namespace rocksdb { +class Slice; +class Allocator; +class Logger; + +// A legacy Bloom filter implementation used by Plain Table db format, for +// schema backward compatibility. Not for use in new filter applications. +class PlainTableBloomV1 { + public: + // allocator: pass allocator to bloom filter, hence trace the usage of memory + // total_bits: fixed total bits for the bloom + // num_probes: number of hash probes for a single key + // locality: If positive, optimize for cache line locality, 0 otherwise. + // hash_func: customized hash function + // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB + // within this page size. Need to reserve huge pages for + // it to be allocated, like: + // sysctl -w vm.nr_hugepages=20 + // See linux doc Documentation/vm/hugetlbpage.txt + explicit PlainTableBloomV1(uint32_t num_probes = 6); + void SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, size_t huge_page_tlb_size, + Logger* logger); + + ~PlainTableBloomV1() {} + + // Assuming single threaded access to this function. 
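// Illustrative sketch, not part of this patch: the cache-line rounding done by
// GetTotalBitsForLocality() above, assuming a 64-byte cache line (512 bits).
// The requested bit count is rounded up to whole cache-line blocks and the
// block count is forced to be odd (TotalBitsForLocalityDemo is a made-up
// helper name).
#include <cassert>
#include <cstdint>

static uint32_t TotalBitsForLocalityDemo(uint32_t total_bits,
                                         uint32_t cache_line_bytes = 64) {
  const uint32_t bits_per_block = cache_line_bytes * 8;
  uint32_t num_blocks = (total_bits + bits_per_block - 1) / bits_per_block;
  if (num_blocks % 2 == 0) {
    num_blocks++;  // an odd block count involves more bits in block selection
  }
  return num_blocks * bits_per_block;
}

int main() {
  assert(TotalBitsForLocalityDemo(1000) == 3 * 512);  // 2 blocks bumped to 3
  assert(TotalBitsForLocalityDemo(512) == 512);       // already a single block
  return 0;
}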
+ void AddHash(uint32_t hash); + + // Multithreaded access to this function is OK + bool MayContainHash(uint32_t hash) const; + + void Prefetch(uint32_t hash); + + uint32_t GetNumBlocks() const { return kNumBlocks; } + + Slice GetRawData() const { return Slice(data_, GetTotalBits() / 8); } + + void SetRawData(char* raw_data, uint32_t total_bits, uint32_t num_blocks = 0); + + uint32_t GetTotalBits() const { return kTotalBits; } + + bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; } + + private: + uint32_t kTotalBits; + uint32_t kNumBlocks; + const uint32_t kNumProbes; + + char* data_; + + static constexpr int LOG2_CACHE_LINE_SIZE = + folly::constexpr_log2(CACHE_LINE_SIZE); +}; + +#if defined(_MSC_VER) +#pragma warning(push) +// local variable is initialized but not referenced +#pragma warning(disable : 4189) +#endif +inline void PlainTableBloomV1::Prefetch(uint32_t h) { + if (kNumBlocks != 0) { + uint32_t ignored; + LegacyLocalityBloomImpl::PrepareHashMayMatch( + h, kNumBlocks, data_, &ignored, LOG2_CACHE_LINE_SIZE); + } +} +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +inline bool PlainTableBloomV1::MayContainHash(uint32_t h) const { + assert(IsInitialized()); + if (kNumBlocks != 0) { + return LegacyLocalityBloomImpl::HashMayMatch( + h, kNumBlocks, kNumProbes, data_, LOG2_CACHE_LINE_SIZE); + } else { + return LegacyNoLocalityBloomImpl::HashMayMatch(h, kTotalBits, kNumProbes, + data_); + } +} + +inline void PlainTableBloomV1::AddHash(uint32_t h) { + assert(IsInitialized()); + if (kNumBlocks != 0) { + LegacyLocalityBloomImpl::AddHash(h, kNumBlocks, kNumProbes, data_, + LOG2_CACHE_LINE_SIZE); + } else { + LegacyNoLocalityBloomImpl::AddHash(h, kTotalBits, kNumProbes, data_); + } +} + +class BloomBlockBuilder { + public: + static const std::string kBloomBlock; + + explicit BloomBlockBuilder(uint32_t num_probes = 6) : bloom_(num_probes) {} + + void SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, size_t huge_page_tlb_size, + Logger* logger) { + bloom_.SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, + logger); + } + + uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); } + + void AddKeysHashes(const std::vector& keys_hashes); + + Slice Finish(); + + private: + PlainTableBloomV1 bloom_; +}; + +}; // namespace rocksdb diff --git a/table/plain_table_builder.cc b/table/plain/plain_table_builder.cc similarity index 97% rename from table/plain_table_builder.cc rename to table/plain/plain_table_builder.cc index 453b6c768b5..696340525a7 100644 --- a/table/plain_table_builder.cc +++ b/table/plain/plain_table_builder.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). 
#ifndef ROCKSDB_LITE -#include "table/plain_table_builder.h" +#include "table/plain/plain_table_builder.h" #include @@ -12,21 +12,21 @@ #include #include +#include "db/dbformat.h" +#include "file/writable_file_writer.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/plain_table_factory.h" -#include "db/dbformat.h" -#include "table/block_builder.h" -#include "table/bloom_block.h" -#include "table/plain_table_index.h" +#include "table/block_based/block_builder.h" #include "table/format.h" #include "table/meta_blocks.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_index.h" #include "util/coding.h" #include "util/crc32c.h" -#include "util/file_reader_writer.h" #include "util/stop_watch.h" namespace rocksdb { diff --git a/table/plain_table_builder.h b/table/plain/plain_table_builder.h similarity index 93% rename from table/plain_table_builder.h rename to table/plain/plain_table_builder.h index ca0879a4e1d..f2cd6009eb3 100644 --- a/table/plain_table_builder.h +++ b/table/plain/plain_table_builder.h @@ -4,6 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once + #ifndef ROCKSDB_LITE #include #include @@ -12,9 +13,9 @@ #include "rocksdb/status.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" -#include "table/bloom_block.h" -#include "table/plain_table_index.h" -#include "table/plain_table_key_coding.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_index.h" +#include "table/plain/plain_table_key_coding.h" #include "table/table_builder.h" namespace rocksdb { @@ -24,6 +25,9 @@ class BlockHandle; class WritableFile; class TableBuilder; +// The builder class of PlainTable. For description of PlainTable format +// See comments of class PlainTableFactory, where instances of +// PlainTableReader are created. class PlainTableBuilder: public TableBuilder { public: // Create a builder that will store the contents of the table it is @@ -41,6 +45,9 @@ class PlainTableBuilder: public TableBuilder { const std::string& column_family_name, uint32_t num_probes = 6, size_t huge_page_tlb_size = 0, double hash_table_ratio = 0, bool store_index_in_file = false); + // No copying allowed + PlainTableBuilder(const PlainTableBuilder&) = delete; + void operator=(const PlainTableBuilder&) = delete; // REQUIRES: Either Finish() or Abandon() has been called. ~PlainTableBuilder(); @@ -127,10 +134,6 @@ class PlainTableBuilder: public TableBuilder { } bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); } - - // No copying allowed - PlainTableBuilder(const PlainTableBuilder&) = delete; - void operator=(const PlainTableBuilder&) = delete; }; } // namespace rocksdb diff --git a/table/plain_table_factory.cc b/table/plain/plain_table_factory.cc similarity index 98% rename from table/plain_table_factory.cc rename to table/plain/plain_table_factory.cc index 0dccec55242..6c6905dab1f 100644 --- a/table/plain_table_factory.cc +++ b/table/plain/plain_table_factory.cc @@ -4,7 +4,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#ifndef ROCKSDB_LITE -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include #include @@ -12,8 +12,8 @@ #include "options/options_helper.h" #include "port/port.h" #include "rocksdb/convenience.h" -#include "table/plain_table_builder.h" -#include "table/plain_table_reader.h" +#include "table/plain/plain_table_builder.h" +#include "table/plain/plain_table_reader.h" #include "util/string_util.h" namespace rocksdb { diff --git a/table/plain_table_factory.h b/table/plain/plain_table_factory.h similarity index 93% rename from table/plain_table_factory.h rename to table/plain/plain_table_factory.h index dade1566096..1bd155f93e9 100644 --- a/table/plain_table_factory.h +++ b/table/plain/plain_table_factory.h @@ -24,7 +24,19 @@ class WritableFile; class Table; class TableBuilder; -// IndexedTable requires fixed length key, configured as a constructor +// PlainTableFactory is the entry point to the PlainTable format of +// SST files. It returns instances of PlainTableBuilder as the builder +// class and PlainTableReader as the reader class, where the format is +// actually implemented. +// +// The PlainTable is designed for memory-mapped file systems, e.g. tmpfs. +// Data is not organized in blocks, which allows fast access. Because of the +// following downsides +// 1. Data compression is not supported. +// 2. Data is not checksummed. +// it is not recommended to use this format on other types of file systems. +// +// PlainTable requires fixed length key, configured as a constructor // parameter of the factory class. Output file format: // +-------------+-----------------+ // | version | user_key_length | diff --git a/table/plain_table_index.cc b/table/plain/plain_table_index.cc similarity index 98% rename from table/plain_table_index.cc rename to table/plain/plain_table_index.cc index 43740923974..b4207f348cb 100644 --- a/table/plain_table_index.cc +++ b/table/plain/plain_table_index.cc @@ -5,13 +5,9 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif +#include -#include - -#include "table/plain_table_index.h" +#include "table/plain/plain_table_index.h" #include "util/coding.h" #include "util/hash.h" diff --git a/table/plain_table_index.h b/table/plain/plain_table_index.h similarity index 85% rename from table/plain_table_index.h rename to table/plain/plain_table_index.h index 360d998279a..c4bb272282c 100644 --- a/table/plain_table_index.h +++ b/table/plain/plain_table_index.h @@ -11,15 +11,19 @@ #include #include "db/dbformat.h" +#include "memory/arena.h" #include "monitoring/histogram.h" #include "options/cf_options.h" #include "rocksdb/options.h" -#include "util/arena.h" -#include "util/hash.h" -#include "util/murmurhash.h" namespace rocksdb { +// This file contains two classes, PlainTableIndex and PlainTableIndexBuilder. +// The two classes implement the index format of PlainTable. +// For a description of the PlainTable format, see the comments of class +// PlainTableFactory +// +// // PlainTableIndex contains buckets size of index_size_, each is a // 32-bit integer. The lower 31 bits contain an offset value (explained below) // and the first bit of the integer indicates type of the offset. @@ -55,6 +59,10 @@ namespace rocksdb { // .... // record N file offset: fixedint32 // + +// The class loads the index block from a PlainTable SST file, and executes +// the index lookup. +// The class is used by the PlainTableReader class.
class PlainTableIndex { public: enum IndexSearchResult { @@ -72,11 +80,22 @@ class PlainTableIndex { index_(nullptr), sub_index_(nullptr) {} + // The function that executes the lookup in the hash table. + // The hash key is `prefix_hash`. The function fills the hash bucket + // content in `bucket_value`, which is up to the caller to interpret. IndexSearchResult GetOffset(uint32_t prefix_hash, uint32_t* bucket_value) const; - Status InitFromRawData(Slice data); + // Initialize data from `index_data`, which points to the raw data of + // the index stored in the SST file. + Status InitFromRawData(Slice index_data); + // Decode the sub-index for a specific hash bucket. + // The `offset` is the value returned as `bucket_value` by GetOffset() + // and is only valid when the return value is `kSubindex`. + // The return value is the pointer to the starting address of the + // sub-index. `upper_bound` is filled with the value indicating how many + // entries the sub-index has. const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset, uint32_t* upper_bound) const { const char* index_ptr = &sub_index_[offset]; @@ -106,9 +125,10 @@ class PlainTableIndex { // After calling Finish(), it returns Slice, which is usually // used either to initialize PlainTableIndex or // to save index to sst file. -// For more details about the index, please refer to: +// For more details about the index, please refer to: // https://github.com/facebook/rocksdb/wiki/PlainTable-Format // #wiki-in-memory-index-format +// The class is used by PlainTableBuilder class. class PlainTableIndexBuilder { public: PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, diff --git a/table/plain_table_key_coding.cc b/table/plain/plain_table_key_coding.cc similarity index 99% rename from table/plain_table_key_coding.cc rename to table/plain/plain_table_key_coding.cc index 6f5ee9b4ad2..b70ce65e675 100644 --- a/table/plain_table_key_coding.cc +++ b/table/plain/plain_table_key_coding.cc @@ -4,14 +4,14 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#include "table/plain_table_key_coding.h" +#include "table/plain/plain_table_key_coding.h" #include #include #include "db/dbformat.h" -#include "table/plain_table_reader.h" -#include "table/plain_table_factory.h" -#include "util/file_reader_writer.h" +#include "file/writable_file_writer.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_reader.h" namespace rocksdb { diff --git a/table/plain_table_key_coding.h b/table/plain/plain_table_key_coding.h similarity index 90% rename from table/plain_table_key_coding.h rename to table/plain/plain_table_key_coding.h index 9a27ad06b78..5f65d5a6560 100644 --- a/table/plain_table_key_coding.h +++ b/table/plain/plain_table_key_coding.h @@ -4,13 +4,19 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once + #ifndef ROCKSDB_LITE #include -#include "rocksdb/slice.h" #include "db/dbformat.h" -#include "table/plain_table_reader.h" +#include "rocksdb/slice.h" +#include "table/plain/plain_table_reader.h" +// The file contains three helper classes of the PlainTable format: +// PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader. +// These classes perform the lowest level of operations of PlainTable. +// Actual data format of the key is documented in comments of class +// PlainTableFactory.
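To make the GetOffset()/GetSubIndexBasePtrAndUpperBound() contract described above concrete, here is a hedged sketch of how a caller such as PlainTableReader typically interprets the three lookup outcomes. The free function is hypothetical and the enumerator names are taken from the PlainTableIndex::IndexSearchResult declaration; treat the details as assumptions rather than reader code from this patch.

#include <cstdint>

#include "table/plain/plain_table_index.h"

// Sketch only: interpret the bucket value returned by PlainTableIndex.
void LookupPrefix(const rocksdb::PlainTableIndex& index, uint32_t prefix_hash) {
  uint32_t bucket_value = 0;
  switch (index.GetOffset(prefix_hash, &bucket_value)) {
    case rocksdb::PlainTableIndex::kNoPrefixForBucket:
      // No key shares this prefix; a point lookup can stop here.
      break;
    case rocksdb::PlainTableIndex::kDirectToFile:
      // bucket_value is the file offset of the first record with the prefix.
      break;
    case rocksdb::PlainTableIndex::kSubindex: {
      // bucket_value points into the sub-index; fetch its base pointer and
      // entry count, then binary search the per-record offsets.
      uint32_t upper_bound = 0;
      const char* base =
          index.GetSubIndexBasePtrAndUpperBound(bucket_value, &upper_bound);
      (void)base;
      (void)upper_bound;
      break;
    }
  }
}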
namespace rocksdb { class WritableFile; @@ -18,8 +24,8 @@ struct ParsedInternalKey; struct PlainTableReaderFileInfo; enum PlainTableEntryType : unsigned char; -// Helper class to write out a key to an output file -// Actual data format of the key is documented in plain_table_factory.h +// Helper class for the PlainTable format to write out a key to an output file. +// The class is used by PlainTableBuilder. class PlainTableKeyEncoder { public: explicit PlainTableKeyEncoder(EncodingType encoding_type, @@ -53,6 +59,10 @@ class PlainTableKeyEncoder { IterKey pre_prefix_; }; +// The class does raw file reads for PlainTableReader. +// It hides whether it is an mmap read or a non-mmap read. +// The class is implemented in a way to favor the performance of the mmap case. +// The class is used by PlainTableReader. class PlainTableFileReader { public: explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info) @@ -122,7 +132,7 @@ class PlainTableFileReader { }; // A helper class to decode keys from input buffer -// Actual data format of the key is documented in plain_table_factory.h +// The class is used by PlainTableReader. class PlainTableKeyDecoder { public: explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info, diff --git a/table/plain_table_reader.cc b/table/plain/plain_table_reader.cc similarity index 89% rename from table/plain_table_reader.cc rename to table/plain/plain_table_reader.cc index f33afdefc38..15cd32d0b08 100644 --- a/table/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -5,7 +5,7 @@ #ifndef ROCKSDB_LITE -#include "table/plain_table_reader.h" +#include "table/plain/plain_table_reader.h" #include #include @@ -19,24 +19,23 @@ #include "rocksdb/options.h" #include "rocksdb/statistics.h" -#include "table/block.h" -#include "table/bloom_block.h" -#include "table/filter_block.h" +#include "table/block_based/block.h" +#include "table/block_based/filter_block.h" #include "table/format.h" +#include "table/get_context.h" #include "table/internal_iterator.h" #include "table/meta_blocks.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_key_coding.h" #include "table/two_level_iterator.h" -#include "table/plain_table_factory.h" -#include "table/plain_table_key_coding.h" -#include "table/get_context.h" +#include "memory/arena.h" #include "monitoring/histogram.h" #include "monitoring/perf_context_imp.h" -#include "util/arena.h" #include "util/coding.h" #include "util/dynamic_bloom.h" #include "util/hash.h" -#include "util/murmurhash.h" #include "util/stop_watch.h" #include "util/string_util.h" @@ -55,6 +54,10 @@ inline uint32_t GetFixed32Element(const char* base, size_t offset) { class PlainTableIterator : public InternalIterator { public: explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek); + // No copying allowed + PlainTableIterator(const PlainTableIterator&) = delete; + void operator=(const Iterator&) = delete; + ~PlainTableIterator() override; bool Valid() const override; @@ -86,9 +89,6 @@ class PlainTableIterator : public InternalIterator { Slice key_; Slice value_; Status status_; - // No copying allowed - PlainTableIterator(const PlainTableIterator&) = delete; - void operator=(const Iterator&) = delete; }; extern const uint64_t kPlainTableMagicNumber; @@ -127,10 +127,11 @@ Status PlainTableReader::Open( return Status::NotSupported("File is too large for PlainTableReader!"); } - TableProperties* props = nullptr; + TableProperties* props_ptr =
nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - ioptions, &props, + ioptions, &props_ptr, true /* compression_type_missing */); + std::shared_ptr props(props_ptr); if (!s.ok()) { return s; } @@ -164,7 +165,7 @@ Status PlainTableReader::Open( std::unique_ptr new_reader(new PlainTableReader( ioptions, std::move(file), env_options, internal_comparator, - encoding_type, file_size, props, prefix_extractor)); + encoding_type, file_size, props.get(), prefix_extractor)); s = new_reader->MmapDataIfNeeded(); if (!s.ok()) { @@ -172,8 +173,9 @@ Status PlainTableReader::Open( } if (!full_scan_mode) { - s = new_reader->PopulateIndex(props, bloom_bits_per_key, hash_table_ratio, - index_sparseness, huge_page_tlb_size); + s = new_reader->PopulateIndex(props.get(), bloom_bits_per_key, + hash_table_ratio, index_sparseness, + huge_page_tlb_size); if (!s.ok()) { return s; } @@ -182,6 +184,8 @@ Status PlainTableReader::Open( // can be used. new_reader->full_scan_mode_ = true; } + // PopulateIndex can add to the props, so don't store them until now + new_reader->table_properties_ = props; if (immortal_table && new_reader->file_info_.is_mmap_mode) { new_reader->dummy_cleanable_.reset(new Cleanable()); @@ -196,7 +200,11 @@ void PlainTableReader::SetupForCompaction() { InternalIterator* PlainTableReader::NewIterator( const ReadOptions& options, const SliceTransform* /* prefix_extractor */, - Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/) { + Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/, + size_t /*compaction_readahead_size*/) { + // Not necessarily used here, but make sure this has been initialized + assert(table_properties_); + bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek; if (arena == nullptr) { return new PlainTableIterator(this, use_prefix_seek); @@ -258,23 +266,19 @@ Status PlainTableReader::PopulateIndexRecordList( return s; } -void PlainTableReader::AllocateAndFillBloom( - int bloom_bits_per_key, int num_prefixes, size_t huge_page_tlb_size, - std::vector* prefix_hashes) { - if (!IsTotalOrderMode()) { - uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key; - if (bloom_total_bits > 0) { - enable_bloom_ = true; - bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, - huge_page_tlb_size, ioptions_.info_log); - FillBloom(prefix_hashes); - } +void PlainTableReader::AllocateBloom(int bloom_bits_per_key, int num_keys, + size_t huge_page_tlb_size) { + uint32_t bloom_total_bits = num_keys * bloom_bits_per_key; + if (bloom_total_bits > 0) { + enable_bloom_ = true; + bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, + huge_page_tlb_size, ioptions_.info_log); } } -void PlainTableReader::FillBloom(std::vector* prefix_hashes) { +void PlainTableReader::FillBloom(const std::vector& prefix_hashes) { assert(bloom_.IsInitialized()); - for (auto prefix_hash : *prefix_hashes) { + for (const auto prefix_hash : prefix_hashes) { bloom_.AddHash(prefix_hash); } } @@ -293,13 +297,12 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, size_t index_sparseness, size_t huge_page_tlb_size) { assert(props != nullptr); - table_properties_.reset(props); BlockContents index_block_contents; Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, file_size_, kPlainTableMagicNumber, ioptions_, PlainTableIndexBuilder::kPlainTableIndexBlock, - &index_block_contents, + BlockType::kIndex, &index_block_contents, true /* compression_type_missing */); bool 
index_in_file = s.ok(); @@ -310,7 +313,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, if (index_in_file) { s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, file_size_, kPlainTableMagicNumber, ioptions_, - BloomBlockBuilder::kBloomBlock, &bloom_block_contents, + BloomBlockBuilder::kBloomBlock, BlockType::kFilter, + &bloom_block_contents, true /* compression_type_missing */); bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0; } @@ -351,14 +355,9 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, if (!index_in_file) { // Allocate bloom filter here for total order mode. if (IsTotalOrderMode()) { - uint32_t num_bloom_bits = - static_cast(table_properties_->num_entries) * - bloom_bits_per_key; - if (num_bloom_bits > 0) { - enable_bloom_ = true; - bloom_.SetTotalBits(&arena_, num_bloom_bits, ioptions_.bloom_locality, - huge_page_tlb_size, ioptions_.info_log); - } + AllocateBloom(bloom_bits_per_key, + static_cast(props->num_entries), + huge_page_tlb_size); } } else if (bloom_in_file) { enable_bloom_ = true; @@ -373,10 +372,9 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, } } // cast away const qualifier, because bloom_ won't be changed - bloom_.SetRawData( - const_cast( - reinterpret_cast(bloom_block->data())), - static_cast(bloom_block->size()) * 8, num_blocks); + bloom_.SetRawData(const_cast(bloom_block->data()), + static_cast(bloom_block->size()) * 8, + num_blocks); } else { // Index in file but no bloom in file. Disable bloom filter in this case. enable_bloom_ = false; @@ -389,6 +387,7 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, std::vector prefix_hashes; if (!index_in_file) { + // Populates _bloom if enabled (total order mode) s = PopulateIndexRecordList(&index_builder, &prefix_hashes); if (!s.ok()) { return s; @@ -401,10 +400,15 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, } if (!index_in_file) { - // Calculated bloom filter size and allocate memory for - // bloom filter based on the number of prefixes, then fill it. - AllocateAndFillBloom(bloom_bits_per_key, index_.GetNumPrefixes(), - huge_page_tlb_size, &prefix_hashes); + if (!IsTotalOrderMode()) { + // Calculated bloom filter size and allocate memory for + // bloom filter based on the number of prefixes, then fill it. + AllocateBloom(bloom_bits_per_key, index_.GetNumPrefixes(), + huge_page_tlb_size); + if (enable_bloom_) { + FillBloom(prefix_hashes); + } + } } // Fill two table properties. 
@@ -613,7 +617,14 @@ Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, return Status::OK(); } -uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/) { +uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/, + TableReaderCaller /*caller*/) { + return 0; +} + +uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/, + const Slice& /*end*/, + TableReaderCaller /*caller*/) { return 0; } diff --git a/table/plain_table_reader.h b/table/plain/plain_table_reader.h similarity index 85% rename from table/plain_table_reader.h rename to table/plain/plain_table_reader.h index 14760f20a57..c956913a04f 100644 --- a/table/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -13,17 +13,17 @@ #include #include "db/dbformat.h" +#include "file/random_access_file_reader.h" +#include "memory/arena.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_index.h" #include "table/table_reader.h" -#include "table/plain_table_factory.h" -#include "table/plain_table_index.h" -#include "util/arena.h" -#include "util/dynamic_bloom.h" -#include "util/file_reader_writer.h" namespace rocksdb { @@ -56,16 +56,17 @@ struct PlainTableReaderFileInfo { file(std::move(_file)) {} }; +// The reader class of PlainTable. For description of PlainTable format +// See comments of class PlainTableFactory, where instances of +// PlainTableReader are created. +class PlainTableReader: public TableReader { + public: // Based on following output file format shown in plain_table_factory.h -// When opening the output file, IndexedTableReader creates a hash table -// from key prefixes to offset of the output file. IndexedTable will decide +// When opening the output file, PlainTableReader creates a hash table +// from key prefixes to offset of the output file. PlainTable will decide // whether it points to the data offset of the first key with the key prefix // or the offset of it. If there are too many keys share this prefix, it will // create a binary search-able index from the suffix to offset on disk. 
-// -// The implementation of IndexedTableReader requires output file is mmaped -class PlainTableReader: public TableReader { - public: static Status Open(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, @@ -76,11 +77,14 @@ class PlainTableReader: public TableReader { bool full_scan_mode, const bool immortal_table = false, const SliceTransform* prefix_extractor = nullptr); + // Returns new iterator over table contents + // compaction_readahead_size: its value will only be used if for_compaction = + // true InternalIterator* NewIterator(const ReadOptions&, const SliceTransform* prefix_extractor, - Arena* arena = nullptr, - bool skip_filters = false, - bool for_compaction = false) override; + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0) override; void Prepare(const Slice& target) override; @@ -88,7 +92,11 @@ class PlainTableReader: public TableReader { GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; - uint64_t ApproximateOffsetOf(const Slice& key) override; + uint64_t ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) override; + + uint64_t ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) override; uint32_t GetIndexSize() const { return index_.GetIndexSize(); } void SetupForCompaction() override; @@ -148,7 +156,7 @@ class PlainTableReader: public TableReader { // Bloom filter is used to rule out non-existent key bool enable_bloom_; - DynamicBloom bloom_; + PlainTableBloomV1 bloom_; PlainTableReaderFileInfo file_info_; Arena arena_; CacheAllocationPtr index_block_alloc_; @@ -157,7 +165,9 @@ class PlainTableReader: public TableReader { const ImmutableCFOptions& ioptions_; std::unique_ptr dummy_cleanable_; uint64_t file_size_; + protected: // for testing std::shared_ptr table_properties_; + private: bool IsFixedLength() const { return user_key_len_ != kPlainTableVariableLength; @@ -202,12 +212,11 @@ class PlainTableReader: public TableReader { Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder, std::vector* prefix_hashes); - // Internal helper function to allocate memory for bloom filter and fill it - void AllocateAndFillBloom(int bloom_bits_per_key, int num_prefixes, - size_t huge_page_tlb_size, - std::vector* prefix_hashes); + // Internal helper function to allocate memory for bloom filter + void AllocateBloom(int bloom_bits_per_key, int num_prefixes, + size_t huge_page_tlb_size); - void FillBloom(std::vector* prefix_hashes); + void FillBloom(const std::vector& prefix_hashes); // Read the key and value at `offset` to parameters for keys, the and // `seekable`. diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc index 54408bb50e9..48db1d8b41e 100644 --- a/table/sst_file_reader.cc +++ b/table/sst_file_reader.cc @@ -9,11 +9,11 @@ #include "db/db_iter.h" #include "db/dbformat.h" +#include "file/random_access_file_reader.h" #include "options/cf_options.h" #include "table/get_context.h" #include "table/table_builder.h" #include "table/table_reader.h" -#include "util/file_reader_writer.h" namespace rocksdb { @@ -65,8 +65,9 @@ Iterator* SstFileReader::NewIterator(const ReadOptions& options) { auto sequence = options.snapshot != nullptr ? 
options.snapshot->GetSequenceNumber() : kMaxSequenceNumber; - auto internal_iter = - r->table_reader->NewIterator(options, r->moptions.prefix_extractor.get()); + auto internal_iter = r->table_reader->NewIterator( + options, r->moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kSSTFileReader); return NewDBIterator(r->options.env, options, r->ioptions, r->moptions, r->ioptions.user_comparator, internal_iter, sequence, r->moptions.max_sequential_skip_in_iterations, @@ -78,8 +79,9 @@ std::shared_ptr SstFileReader::GetTableProperties() return rep_->table_reader->GetTableProperties(); } -Status SstFileReader::VerifyChecksum() { - return rep_->table_reader->VerifyChecksum(); +Status SstFileReader::VerifyChecksum(const ReadOptions& read_options) { + return rep_->table_reader->VerifyChecksum(read_options, + TableReaderCaller::kSSTFileReader); } } // namespace rocksdb diff --git a/table/sst_file_reader_test.cc b/table/sst_file_reader_test.cc index 51bc975af00..dd7a5101677 100644 --- a/table/sst_file_reader_test.cc +++ b/table/sst_file_reader_test.cc @@ -5,14 +5,14 @@ #ifndef ROCKSDB_LITE -#include +#include #include "rocksdb/db.h" #include "rocksdb/sst_file_reader.h" #include "rocksdb/sst_file_writer.h" #include "table/sst_file_writer_collectors.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index b9a7273e07d..dc2c589f21a 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -6,12 +6,13 @@ #include "rocksdb/sst_file_writer.h" #include + #include "db/dbformat.h" +#include "file/writable_file_writer.h" #include "rocksdb/table.h" -#include "table/block_based_table_builder.h" +#include "table/block_based/block_based_table_builder.h" #include "table/sst_file_writer_collectors.h" -#include "util/file_reader_writer.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/table/table_builder.h b/table/table_builder.h index 21df978c3eb..4a4b19b626c 100644 --- a/table/table_builder.h +++ b/table/table_builder.h @@ -15,10 +15,11 @@ #include #include "db/dbformat.h" #include "db/table_properties_collector.h" +#include "file/writable_file_writer.h" #include "options/cf_options.h" #include "rocksdb/options.h" #include "rocksdb/table_properties.h" -#include "util/file_reader_writer.h" +#include "trace_replay/block_cache_tracer.h" namespace rocksdb { @@ -32,10 +33,12 @@ struct TableReaderOptions { const EnvOptions& _env_options, const InternalKeyComparator& _internal_comparator, bool _skip_filters = false, bool _immortal = false, - int _level = -1) + int _level = -1, + BlockCacheTracer* const _block_cache_tracer = nullptr) : TableReaderOptions(_ioptions, _prefix_extractor, _env_options, _internal_comparator, _skip_filters, _immortal, - _level, 0 /* _largest_seqno */) {} + _level, 0 /* _largest_seqno */, + _block_cache_tracer) {} // @param skip_filters Disables loading/accessing the filter block TableReaderOptions(const ImmutableCFOptions& _ioptions, @@ -43,7 +46,8 @@ struct TableReaderOptions { const EnvOptions& _env_options, const InternalKeyComparator& _internal_comparator, bool _skip_filters, bool _immortal, int _level, - SequenceNumber _largest_seqno) + SequenceNumber _largest_seqno, + BlockCacheTracer* const _block_cache_tracer) : ioptions(_ioptions), prefix_extractor(_prefix_extractor), 
env_options(_env_options), @@ -51,7 +55,8 @@ struct TableReaderOptions { skip_filters(_skip_filters), immortal(_immortal), level(_level), - largest_seqno(_largest_seqno) {} + largest_seqno(_largest_seqno), + block_cache_tracer(_block_cache_tracer) {} const ImmutableCFOptions& ioptions; const SliceTransform* prefix_extractor; @@ -65,6 +70,7 @@ struct TableReaderOptions { int level; // largest seqno in the table SequenceNumber largest_seqno; + BlockCacheTracer* const block_cache_tracer; }; struct TableBuilderOptions { diff --git a/table/table_properties.cc b/table/table_properties.cc index 8cfa2619591..6e481798c35 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -4,10 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). #include "rocksdb/table_properties.h" + #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" -#include "table/block.h" +#include "table/block_based/block.h" #include "table/internal_iterator.h" #include "table/table_properties_internal.h" #include "util/string_util.h" diff --git a/table/table_reader.h b/table/table_reader.h index bd6071d9c67..712c20c9a53 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -14,6 +14,7 @@ #include "table/get_context.h" #include "table/internal_iterator.h" #include "table/multiget_context.h" +#include "table/table_reader_caller.h" namespace rocksdb { @@ -26,9 +27,11 @@ struct TableProperties; class GetContext; class MultiGetContext; -// A Table is a sorted map from strings to strings. Tables are -// immutable and persistent. A Table may be safely accessed from -// multiple threads without external synchronization. +// A Table (also referred to as SST) is a sorted map from strings to strings. +// Tables are immutable and persistent. A Table may be safely accessed from +// multiple threads without external synchronization. Table readers are used +// for reading various types of table formats supported by rocksdb including +// BlockBasedTable, PlainTable and CuckooTable format. class TableReader { public: virtual ~TableReader() {} @@ -42,11 +45,12 @@ class TableReader { // all the states but those allocated in arena. // skip_filters: disables checking the bloom filters even if they exist. This // option is effective only for block-based table format. - virtual InternalIterator* NewIterator(const ReadOptions&, - const SliceTransform* prefix_extractor, - Arena* arena = nullptr, - bool skip_filters = false, - bool for_compaction = false) = 0; + // compaction_readahead_size: its value will only be used if caller = + // kCompaction + virtual InternalIterator* NewIterator( + const ReadOptions&, const SliceTransform* prefix_extractor, Arena* arena, + bool skip_filters, TableReaderCaller caller, + size_t compaction_readahead_size = 0) = 0; virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& /*read_options*/) { @@ -59,7 +63,14 @@ class TableReader { // bytes, and so includes effects like compression of the underlying data. // E.g., the approximate offset of the last key in the table will // be close to the file length. - virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0; + virtual uint64_t ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) = 0; + + // Given start and end keys, return the approximate data size in the file + // between the keys. The returned value is in terms of file bytes, and so + // includes effects like compression of the underlying data. 
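The ApproximateSize() contract introduced here (declared just below) can be read as the difference of two ApproximateOffsetOf() estimates for the same caller. The following is a hedged sketch of that relationship; it illustrates the semantics rather than the implementation added in this patch, and the helper name is hypothetical.

#include <cstdint>

#include "rocksdb/slice.h"
#include "table/table_reader.h"
#include "table/table_reader_caller.h"

// Sketch only: the size between two keys as a difference of offsets.
uint64_t ApproximateSizeFromOffsets(rocksdb::TableReader* reader,
                                    const rocksdb::Slice& start,
                                    const rocksdb::Slice& end,
                                    rocksdb::TableReaderCaller caller) {
  const uint64_t begin = reader->ApproximateOffsetOf(start, caller);
  const uint64_t finish = reader->ApproximateOffsetOf(end, caller);
  return finish > begin ? finish - begin : 0;
}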
+ virtual uint64_t ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) = 0; // Set up the table for Compaction. Might change some parameters with // posix_fadvise @@ -112,17 +123,15 @@ class TableReader { } // convert db file to a human readable form - virtual Status DumpTable(WritableFile* /*out_file*/, - const SliceTransform* /*prefix_extractor*/) { + virtual Status DumpTable(WritableFile* /*out_file*/) { return Status::NotSupported("DumpTable() not supported"); } // check whether there is corruption in this db file - virtual Status VerifyChecksum() { + virtual Status VerifyChecksum(const ReadOptions& /*read_options*/, + TableReaderCaller /*caller*/) { return Status::NotSupported("VerifyChecksum() not supported"); } - - virtual void Close() {} }; } // namespace rocksdb diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index a9b75715b5f..05bb2ea25e4 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -11,21 +11,21 @@ int main() { } #else -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" +#include "file/random_access_file_reader.h" #include "monitoring/histogram.h" #include "rocksdb/db.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/get_context.h" #include "table/internal_iterator.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/table_builder.h" -#include "util/file_reader_writer.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/gflags_compat.h" -#include "util/testharness.h" -#include "util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::SetUsageMessage; @@ -175,7 +175,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, ioptions.merge_operator, ioptions.info_log, ioptions.statistics, GetContext::kNotFound, Slice(key), &value, nullptr, &merge_context, - &max_covering_tombstone_seq, env); + true, &max_covering_tombstone_seq, env); s = table_reader->Get(read_options, key, &get_context, nullptr); } else { s = db->Get(read_options, key, &result); @@ -198,7 +198,9 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, Iterator* iter = nullptr; InternalIterator* iiter = nullptr; if (!through_db) { - iiter = table_reader->NewIterator(read_options, nullptr); + iiter = table_reader->NewIterator( + read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized); } else { iter = db->NewIterator(read_options); } diff --git a/table/table_reader_caller.h b/table/table_reader_caller.h new file mode 100644 index 00000000000..90c64687197 --- /dev/null +++ b/table/table_reader_caller.h @@ -0,0 +1,39 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +namespace rocksdb { +// A list of callers for a table reader. It is used to trace the caller that +// accesses on a block. This is only used for block cache tracing and analysis. +// A user may use kUncategorized if the caller is not interesting for analysis +// or the table reader is called in the test environment, e.g., unit test, table +// reader benchmark, etc. 
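Since every TableReader entry point now takes a TableReaderCaller (the enum declared just below), call sites thread a caller tag through so block cache traces can attribute each access. Here is a hedged sketch mirroring the updated call sites elsewhere in this patch; the wrapper function is hypothetical.

#include <memory>

#include "rocksdb/options.h"
#include "table/internal_iterator.h"
#include "table/table_reader.h"
#include "table/table_reader_caller.h"

// Sketch only: a user-facing iterator carries kUserIterator so block cache
// traces can attribute the resulting block accesses.
std::unique_ptr<rocksdb::InternalIterator> NewUserIterator(
    rocksdb::TableReader* reader, const rocksdb::ReadOptions& read_options) {
  return std::unique_ptr<rocksdb::InternalIterator>(reader->NewIterator(
      read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
      /*skip_filters=*/false, rocksdb::TableReaderCaller::kUserIterator,
      /*compaction_readahead_size=*/0));
}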
+enum TableReaderCaller : char { + kUserGet = 1, + kUserMultiGet = 2, + kUserIterator = 3, + kUserApproximateSize = 4, + kUserVerifyChecksum = 5, + kSSTDumpTool = 6, + kExternalSSTIngestion = 7, + kRepair = 8, + kPrefetch = 9, + kCompaction = 10, + // A compaction job may refill the block cache with blocks in the new SST + // files if paranoid_file_checks is true. + kCompactionRefill = 11, + // After building a table, it may load all its blocks into the block cache if + // paranoid_file_checks is true. + kFlush = 12, + // sst_file_reader. + kSSTFileReader = 13, + // A list of callers that are either not interesting for analysis or are + // calling from a test environment, e.g., unit test, benchmark, etc. + kUncategorized = 14, + // All callers should be added before kMaxBlockCacheLookupCaller. + kMaxBlockCacheLookupCaller +}; +} // namespace rocksdb diff --git a/table/table_test.cc b/table/table_test.cc index a62ce4255e3..77b96259830 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -16,11 +16,13 @@ #include #include +#include "block_fetcher.h" #include "cache/lru_cache.h" #include "db/dbformat.h" #include "db/memtable.h" #include "db/write_batch_internal.h" #include "memtable/stl_wrappers.h" +#include "meta_blocks.h" #include "monitoring/statistics.h" #include "port/port.h" #include "rocksdb/cache.h" @@ -32,26 +34,24 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/statistics.h" #include "rocksdb/write_buffer_manager.h" -#include "table/block.h" -#include "table/block_based_table_builder.h" -#include "table/block_based_table_factory.h" -#include "table/block_based_table_reader.h" -#include "table/block_builder.h" -#include "table/block_fetcher.h" -#include "table/flush_block_policy.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/flush_block_policy.h" #include "table/format.h" #include "table/get_context.h" #include "table/internal_iterator.h" -#include "table/meta_blocks.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/compression.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -63,6 +63,8 @@ extern const uint64_t kPlainTableMagicNumber; namespace { +const std::string kDummyValue(10000, 'o'); + // DummyPropertiesCollector used to test BlockBasedTableProperties class DummyPropertiesCollector : public TablePropertiesCollector { public: @@ -236,7 +238,7 @@ class BlockConstructor: public Constructor { } InternalIterator* NewIterator( const SliceTransform* /*prefix_extractor*/) const override { - return block_->NewIterator(comparator_, comparator_); + return block_->NewDataIterator(comparator_, comparator_); } private: @@ -308,10 +310,13 @@ class TableConstructor: public Constructor { public: explicit TableConstructor(const Comparator* cmp, bool convert_to_internal_key = false, - int level = -1) + int level = -1, SequenceNumber largest_seqno = 0) : Constructor(cmp), + largest_seqno_(largest_seqno), 
convert_to_internal_key_(convert_to_internal_key), - level_(level) {} + level_(level) { + env_ = rocksdb::Env::Default(); + } ~TableConstructor() override { Reset(); } Status FinishImpl(const Options& options, const ImmutableCFOptions& ioptions, @@ -326,6 +331,14 @@ class TableConstructor: public Constructor { std::unique_ptr builder; std::vector> int_tbl_prop_collector_factories; + + if (largest_seqno_ != 0) { + // Pretend that it's an external file written by SstFileWriter. + int_tbl_prop_collector_factories.emplace_back( + new SstFileWriterPropertiesCollectorFactory(2 /* version */, + 0 /* global_seqno*/)); + } + std::string column_family_name; builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, internal_comparator, @@ -362,7 +375,7 @@ class TableConstructor: public Constructor { return ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, internal_comparator, !kSkipFilters, !kImmortal, - level_), + level_, largest_seqno_, &block_cache_tracer_), std::move(file_reader_), TEST_GetSink()->contents().size(), &table_reader_); } @@ -370,7 +383,9 @@ class TableConstructor: public Constructor { InternalIterator* NewIterator( const SliceTransform* prefix_extractor) const override { ReadOptions ro; - InternalIterator* iter = table_reader_->NewIterator(ro, prefix_extractor); + InternalIterator* iter = table_reader_->NewIterator( + ro, prefix_extractor, /*arena=*/nullptr, /*skip_filters=*/false, + TableReaderCaller::kUncategorized); if (convert_to_internal_key_) { return new KeyConvertingIterator(iter); } else { @@ -382,9 +397,11 @@ class TableConstructor: public Constructor { if (convert_to_internal_key_) { InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); const Slice skey = ikey.Encode(); - return table_reader_->ApproximateOffsetOf(skey); + return table_reader_->ApproximateOffsetOf( + skey, TableReaderCaller::kUncategorized); } - return table_reader_->ApproximateOffsetOf(key); + return table_reader_->ApproximateOffsetOf( + key, TableReaderCaller::kUncategorized); } virtual Status Reopen(const ImmutableCFOptions& ioptions, @@ -412,6 +429,8 @@ class TableConstructor: public Constructor { return static_cast(file_writer_->writable_file()); } + BlockCacheTracer block_cache_tracer_; + private: void Reset() { uniq_id_ = 0; @@ -424,6 +443,7 @@ class TableConstructor: public Constructor { std::unique_ptr file_writer_; std::unique_ptr file_reader_; std::unique_ptr table_reader_; + SequenceNumber largest_seqno_; bool convert_to_internal_key_; int level_; @@ -431,6 +451,7 @@ class TableConstructor: public Constructor { static uint64_t cur_uniq_id_; EnvOptions soptions; + Env* env_; }; uint64_t TableConstructor::cur_uniq_id_ = 1; @@ -1049,7 +1070,9 @@ class BlockBasedTableTest : public TableTest, virtual public ::testing::WithParamInterface { public: - BlockBasedTableTest() : format_(GetParam()) {} + BlockBasedTableTest() : format_(GetParam()) { + env_ = rocksdb::Env::Default(); + } BlockBasedTableOptions GetBlockBasedTableOptions() { BlockBasedTableOptions options; @@ -1057,11 +1080,91 @@ class BlockBasedTableTest return options; } + void SetupTracingTest(TableConstructor* c) { + test_path_ = test::PerThreadDBPath("block_based_table_tracing_test"); + EXPECT_OK(env_->CreateDir(test_path_)); + trace_file_path_ = test_path_ + "/block_cache_trace_file"; + TraceOptions trace_opt; + std::unique_ptr trace_writer; + EXPECT_OK(NewFileTraceWriter(env_, EnvOptions(), trace_file_path_, + &trace_writer)); + 
c->block_cache_tracer_.StartTrace(env_, trace_opt, std::move(trace_writer)); + { + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + c->Add(encoded_key, kDummyValue); + } + { + std::string user_key = "k02"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + c->Add(encoded_key, kDummyValue); + } + } + + void VerifyBlockAccessTrace( + TableConstructor* c, + const std::vector& expected_records) { + c->block_cache_tracer_.EndTrace(); + + std::unique_ptr trace_reader; + Status s = + NewFileTraceReader(env_, EnvOptions(), trace_file_path_, &trace_reader); + EXPECT_OK(s); + BlockCacheTraceReader reader(std::move(trace_reader)); + BlockCacheTraceHeader header; + EXPECT_OK(reader.ReadHeader(&header)); + uint32_t index = 0; + while (s.ok()) { + BlockCacheTraceRecord access; + s = reader.ReadAccess(&access); + if (!s.ok()) { + break; + } + ASSERT_LT(index, expected_records.size()); + EXPECT_NE("", access.block_key); + EXPECT_EQ(access.block_type, expected_records[index].block_type); + EXPECT_GT(access.block_size, 0); + EXPECT_EQ(access.caller, expected_records[index].caller); + EXPECT_EQ(access.no_insert, expected_records[index].no_insert); + EXPECT_EQ(access.is_cache_hit, expected_records[index].is_cache_hit); + // Get + if (access.caller == TableReaderCaller::kUserGet) { + EXPECT_EQ(access.referenced_key, + expected_records[index].referenced_key); + EXPECT_EQ(access.get_id, expected_records[index].get_id); + EXPECT_EQ(access.get_from_user_specified_snapshot, + expected_records[index].get_from_user_specified_snapshot); + if (access.block_type == TraceType::kBlockTraceDataBlock) { + EXPECT_GT(access.referenced_data_size, 0); + EXPECT_GT(access.num_keys_in_block, 0); + EXPECT_EQ(access.referenced_key_exist_in_block, + expected_records[index].referenced_key_exist_in_block); + } + } else { + EXPECT_EQ(access.referenced_key, ""); + EXPECT_EQ(access.get_id, 0); + EXPECT_TRUE(access.get_from_user_specified_snapshot == Boolean::kFalse); + EXPECT_EQ(access.referenced_data_size, 0); + EXPECT_EQ(access.num_keys_in_block, 0); + EXPECT_TRUE(access.referenced_key_exist_in_block == Boolean::kFalse); + } + index++; + } + EXPECT_EQ(index, expected_records.size()); + EXPECT_OK(env_->DeleteFile(trace_file_path_)); + EXPECT_OK(env_->DeleteDir(test_path_)); + } + protected: uint64_t IndexUncompressedHelper(bool indexCompress); private: uint32_t format_; + Env* env_; + std::string trace_file_path_; + std::string test_path_; }; class PlainTableTest : public TableTest {}; class TablePropertyTest : public testing::Test {}; @@ -1480,7 +1583,7 @@ TEST_P(BlockBasedTableTest, PrefetchTest) { TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); - for (int i = 0; i < 4; ++i) { + for (int i = 0; i <= 5; ++i) { Options options; // Make each key/value an individual block table_options.block_size = 64; @@ -1511,11 +1614,16 @@ TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { options.prefix_extractor.reset(NewFixedPrefixTransform(4)); break; case 4: - default: - // Binary search index + // Two-level index table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; options.table_factory.reset(new BlockBasedTableFactory(table_options)); break; + case 5: + // Binary search with first key + table_options.index_type = + BlockBasedTableOptions::kBinarySearchWithFirstKey; + 
options.table_factory.reset(new BlockBasedTableFactory(table_options)); + break; } TableConstructor c(BytewiseComparator(), @@ -1538,8 +1646,9 @@ TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { auto* reader = c.GetTableReader(); ReadOptions ro; ro.total_order_seek = true; - std::unique_ptr iter( - reader->NewIterator(ro, moptions.prefix_extractor.get())); + std::unique_ptr iter(reader->NewIterator( + ro, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); iter->Seek(InternalKey("b", 0, kTypeValue).Encode()); ASSERT_OK(iter->status()); @@ -1597,8 +1706,9 @@ TEST_P(BlockBasedTableTest, NoopTransformSeek) { for (int i = 0; i < 2; ++i) { ReadOptions ro; ro.total_order_seek = (i == 0); - std::unique_ptr iter( - reader->NewIterator(ro, moptions.prefix_extractor.get())); + std::unique_ptr iter(reader->NewIterator( + ro, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); iter->Seek(key.Encode()); ASSERT_OK(iter->status()); @@ -1635,8 +1745,9 @@ TEST_P(BlockBasedTableTest, SkipPrefixBloomFilter) { const MutableCFOptions new_moptions(options); c.Reopen(new_ioptions, new_moptions); auto reader = c.GetTableReader(); - std::unique_ptr db_iter( - reader->NewIterator(ReadOptions(), new_moptions.prefix_extractor.get())); + std::unique_ptr db_iter(reader->NewIterator( + ReadOptions(), new_moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); // Test point lookup // only one kv @@ -1656,10 +1767,10 @@ static std::string RandomString(Random* rnd, int len) { } void AddInternalKey(TableConstructor* c, const std::string& prefix, - int /*suffix_len*/ = 800) { + std::string value = "v", int /*suffix_len*/ = 800) { static Random rnd(1023); InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue); - c->Add(k.Encode().ToString(), "v"); + c->Add(k.Encode().ToString(), value); } void TableTest::IndexTest(BlockBasedTableOptions table_options) { @@ -1702,8 +1813,9 @@ void TableTest::IndexTest(BlockBasedTableOptions table_options) { ASSERT_EQ(5u, props->num_data_blocks); // TODO(Zhongyi): update test to use MutableCFOptions - std::unique_ptr index_iter( - reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + std::unique_ptr index_iter(reader->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); // -- Find keys do not exist, but have common prefix. 
std::vector prefixes = {"001", "003", "005", "007", "009"}; @@ -1798,6 +1910,325 @@ TEST_P(BlockBasedTableTest, PartitionIndexTest) { } } +TEST_P(BlockBasedTableTest, IndexSeekOptimizationIncomplete) { + std::unique_ptr comparator( + new InternalKeyComparator(BytewiseComparator())); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + Options options; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + + TableConstructor c(BytewiseComparator()); + AddInternalKey(&c, "pika"); + + std::vector keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + ASSERT_EQ(1, keys.size()); + + auto reader = c.GetTableReader(); + ReadOptions ropt; + ropt.read_tier = ReadTier::kBlockCacheTier; + std::unique_ptr iter(reader->NewIterator( + ropt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + auto ikey = [](Slice user_key) { + return InternalKey(user_key, 0, kTypeValue).Encode().ToString(); + }; + + iter->Seek(ikey("pika")); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsIncomplete()); + + // This used to crash at some point. + iter->Seek(ikey("pika")); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsIncomplete()); +} + +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey1) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey; + IndexTest(table_options); +} + +class CustomFlushBlockPolicy : public FlushBlockPolicyFactory, + public FlushBlockPolicy { + public: + explicit CustomFlushBlockPolicy(std::vector keys_per_block) + : keys_per_block_(keys_per_block) {} + + const char* Name() const override { return "table_test"; } + FlushBlockPolicy* NewFlushBlockPolicy(const BlockBasedTableOptions&, + const BlockBuilder&) const override { + return new CustomFlushBlockPolicy(keys_per_block_); + } + + bool Update(const Slice&, const Slice&) override { + if (keys_in_current_block_ >= keys_per_block_.at(current_block_idx_)) { + ++current_block_idx_; + keys_in_current_block_ = 1; + return true; + } + + ++keys_in_current_block_; + return false; + } + + std::vector keys_per_block_; + + int current_block_idx_ = 0; + int keys_in_current_block_ = 0; +}; + +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey2) { + for (int use_first_key = 0; use_first_key < 2; ++use_first_key) { + SCOPED_TRACE("use_first_key = " + std::to_string(use_first_key)); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = + use_first_key ? BlockBasedTableOptions::kBinarySearchWithFirstKey + : BlockBasedTableOptions::kBinarySearch; + table_options.block_cache = NewLRUCache(10000); // fits all blocks + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + table_options.flush_block_policy_factory = + std::make_shared(std::vector{2, 1, 3, 2}); + Options options; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.statistics = CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + std::unique_ptr comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + + TableConstructor c(BytewiseComparator()); + + // Block 0. 
+ AddInternalKey(&c, "aaaa", "v0"); + AddInternalKey(&c, "aaac", "v1"); + + // Block 1. + AddInternalKey(&c, "aaca", "v2"); + + // Block 2. + AddInternalKey(&c, "caaa", "v3"); + AddInternalKey(&c, "caac", "v4"); + AddInternalKey(&c, "caae", "v5"); + + // Block 3. + AddInternalKey(&c, "ccaa", "v6"); + AddInternalKey(&c, "ccac", "v7"); + + // Write the file. + std::vector keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + ASSERT_EQ(8, keys.size()); + + auto reader = c.GetTableReader(); + auto props = reader->GetTableProperties(); + ASSERT_EQ(4u, props->num_data_blocks); + std::unique_ptr iter(reader->NewIterator( + ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + // Shouldn't have read data blocks before iterator is seeked. + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + auto ikey = [](Slice user_key) { + return InternalKey(user_key, 0, kTypeValue).Encode().ToString(); + }; + + // Seek to a key between blocks. If index contains first key, we shouldn't + // read any data blocks until value is requested. + iter->Seek(ikey("aaba")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 0 : 1, + stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ("v2", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to the middle of a block. The block should be read right away. + iter->Seek(ikey("caab")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[4], iter->key().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v4", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to just before the same block and don't access value. + // The iterator should keep pinning the block contents. + iter->Seek(ikey("baaa")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[3], iter->key().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to the same block again to check that the block is still pinned. + iter->Seek(ikey("caae")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[5], iter->key().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v5", iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward and fall through to the next block. Don't access value. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[6], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward again. Block should be read. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[7], iter->key().ToString()); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ("v7", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward and reach the end. 
+ iter->Next(); + EXPECT_FALSE(iter->Valid()); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to a single-key block and step forward without accessing value. + iter->Seek(ikey("aaca")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 0 : 1, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[3], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 1 : 2, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v3", iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + // Seek between blocks and step back without accessing value. + iter->Seek(ikey("aaca")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[1], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + // All blocks are in cache now, there'll be no more misses ever. + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ("v1", iter->value().ToString()); + + // Next into the next block again. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 4, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to first and step back without accessing value. + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[0], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 5, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->Prev(); + EXPECT_FALSE(iter->Valid()); + EXPECT_EQ(use_first_key ? 2 : 5, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Do some SeekForPrev() and SeekToLast() just to cover all methods. + iter->SeekForPrev(ikey("caad")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[4], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 3 : 6, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v4", iter->value().ToString()); + EXPECT_EQ(use_first_key ? 3 : 6, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[7], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 4 : 7, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v7", iter->value().ToString()); + EXPECT_EQ(use_first_key ? 
4 : 7, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + c.ResetTableReader(); + } +} + +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKeyGlobalSeqno) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey; + table_options.block_cache = NewLRUCache(10000); + Options options; + options.statistics = CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + std::unique_ptr comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + + TableConstructor c(BytewiseComparator(), /* convert_to_internal_key */ false, + /* level */ -1, /* largest_seqno */ 42); + + c.Add(InternalKey("b", 0, kTypeValue).Encode().ToString(), "x"); + c.Add(InternalKey("c", 0, kTypeValue).Encode().ToString(), "y"); + + std::vector keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + ASSERT_EQ(2, keys.size()); + + auto reader = c.GetTableReader(); + auto props = reader->GetTableProperties(); + ASSERT_EQ(1u, props->num_data_blocks); + std::unique_ptr iter(reader->NewIterator( + ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + iter->Seek(InternalKey("a", 0, kTypeValue).Encode().ToString()); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(), + iter->key().ToString()); + EXPECT_NE(keys[0], iter->key().ToString()); + // Key should have been served from index, without reading data blocks. + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + EXPECT_EQ("x", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(), + iter->key().ToString()); + + c.ResetTableReader(); +} + // It's very hard to figure out the index block size of a block accurately. // To make sure we get the index size, we just make sure as key number // grows, the filter block size also grows. 
@@ -1869,6 +2300,187 @@ TEST_P(BlockBasedTableTest, NumBlockStat) { c.ResetTableReader(); } +TEST_P(BlockBasedTableTest, TracingGetTest) { + TableConstructor c(BytewiseComparator()); + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + table_options.block_cache = NewLRUCache(1024 * 1024, 0); + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + SetupTracingTest(&c); + std::vector keys; + stl_wrappers::KVMap kvmap; + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + for (uint32_t i = 1; i <= 2; i++) { + PinnableSlice value; + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, true, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, /*tracing_get_id=*/i); + get_perf_context()->Reset(); + ASSERT_OK(c.GetTableReader()->Get(ReadOptions(), encoded_key, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value.ToString(), kDummyValue); + } + + // Verify traces. + std::vector expected_records; + // The first two records should be prefetching index and filter blocks. + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kPrefetch; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + // Then we should have three records for one index, one filter, and one data + // block access. + record.get_id = 1; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserGet; + record.get_from_user_specified_snapshot = Boolean::kFalse; + record.referenced_key = encoded_key; + record.referenced_key_exist_in_block = Boolean::kTrue; + record.is_cache_hit = Boolean::kTrue; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + record.is_cache_hit = Boolean::kFalse; + record.block_type = TraceType::kBlockTraceDataBlock; + expected_records.push_back(record); + // The second get should all observe cache hits. 
+ record.is_cache_hit = Boolean::kTrue; + record.get_id = 2; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserGet; + record.get_from_user_specified_snapshot = Boolean::kFalse; + record.referenced_key = encoded_key; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceDataBlock; + expected_records.push_back(record); + VerifyBlockAccessTrace(&c, expected_records); + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, TracingApproximateOffsetOfTest) { + TableConstructor c(BytewiseComparator()); + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + table_options.block_cache = NewLRUCache(1024 * 1024, 0); + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + SetupTracingTest(&c); + std::vector keys; + stl_wrappers::KVMap kvmap; + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + for (uint32_t i = 1; i <= 2; i++) { + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + c.GetTableReader()->ApproximateOffsetOf( + encoded_key, TableReaderCaller::kUserApproximateSize); + } + // Verify traces. + std::vector expected_records; + // The first two records should be prefetching index and filter blocks. + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kPrefetch; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + // Then we should have two records for only index blocks. 
+ record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserApproximateSize; + record.is_cache_hit = Boolean::kTrue; + expected_records.push_back(record); + expected_records.push_back(record); + VerifyBlockAccessTrace(&c, expected_records); + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, TracingIterator) { + TableConstructor c(BytewiseComparator()); + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + table_options.block_cache = NewLRUCache(1024 * 1024, 0); + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + SetupTracingTest(&c); + std::vector keys; + stl_wrappers::KVMap kvmap; + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + for (uint32_t i = 1; i <= 2; i++) { + std::unique_ptr iter(c.GetTableReader()->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUserIterator)); + iter->SeekToFirst(); + while (iter->Valid()) { + iter->key(); + iter->value(); + iter->Next(); + } + ASSERT_OK(iter->status()); + iter.reset(); + } + + // Verify traces. + std::vector expected_records; + // The first two records should be prefetching index and filter blocks. + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kPrefetch; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + // Then we should have three records for index and two data block access. + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserIterator; + record.is_cache_hit = Boolean::kTrue; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceDataBlock; + record.is_cache_hit = Boolean::kFalse; + expected_records.push_back(record); + expected_records.push_back(record); + // When we iterate this file for the second time, we should observe all cache + // hits. + record.block_type = TraceType::kBlockTraceIndexBlock; + record.is_cache_hit = Boolean::kTrue; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceDataBlock; + expected_records.push_back(record); + expected_records.push_back(record); + VerifyBlockAccessTrace(&c, expected_records); + c.ResetTableReader(); +} + // A simple tool that takes the snapshot of block cache statistics. class BlockCachePropertiesSnapshot { public: @@ -1954,8 +2566,8 @@ TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) { // preloading filter/index blocks is enabled. 
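// --- Editor's note (not part of the diff): the tracing tests above verify the
// records emitted by block cache tracing. This is a hedged sketch of how a
// user might turn tracing on for a real DB, assuming the StartBlockCacheTrace /
// EndBlockCacheTrace entry points and NewFileTraceWriter that accompany this
// feature; the trace path is arbitrary.
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/trace_reader_writer.h"

rocksdb::Status RunWithBlockCacheTrace(rocksdb::DB* db) {
  std::unique_ptr<rocksdb::TraceWriter> writer;
  rocksdb::Status s = rocksdb::NewFileTraceWriter(
      rocksdb::Env::Default(), rocksdb::EnvOptions(),
      "/tmp/block_cache_trace", &writer);
  if (!s.ok()) return s;
  rocksdb::TraceOptions trace_opts;  // defaults record every cache access
  s = db->StartBlockCacheTrace(trace_opts, std::move(writer));
  if (!s.ok()) return s;
  // ... run the workload whose cache behaviour should be captured ...
  return db->EndBlockCacheTrace();
}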
auto reader = dynamic_cast(c.GetTableReader()); - ASSERT_TRUE(reader->TEST_filter_block_preloaded()); - ASSERT_TRUE(reader->TEST_index_reader_preloaded()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); + ASSERT_FALSE(reader->TEST_IndexBlockInCache()); { // nothing happens in the beginning @@ -1967,7 +2579,7 @@ TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) { { GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(), nullptr, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); // a hack that just to trigger BlockBasedTable::GetFilter. reader->Get(ReadOptions(), "non-exist-key", &get_context, moptions.prefix_extractor.get()); @@ -1987,7 +2599,11 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { // Enable the cache for index/filter blocks BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); - table_options.block_cache = NewLRUCache(2048, 2); + LRUCacheOptions co; + co.capacity = 2048; + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + table_options.block_cache = NewLRUCache(co); table_options.cache_index_and_filter_blocks = true; options.table_factory.reset(new BlockBasedTableFactory(table_options)); std::vector keys; @@ -2001,8 +2617,8 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is prohibited. auto* reader = dynamic_cast(c.GetTableReader()); - ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); - ASSERT_TRUE(!reader->TEST_index_reader_preloaded()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); + ASSERT_TRUE(reader->TEST_IndexBlockInCache()); // -- PART 1: Open with regular block cache. // Since block_cache is disabled, no cache activities will be involved. @@ -2017,7 +2633,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { 0, 0, 0); ASSERT_EQ(props.GetCacheBytesRead(), 0); ASSERT_EQ(props.GetCacheBytesWrite(), - table_options.block_cache->GetUsage()); + static_cast(table_options.block_cache->GetUsage())); last_cache_bytes_read = props.GetCacheBytesRead(); } @@ -2033,7 +2649,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { // Cache hit, bytes read from cache should increase ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read); ASSERT_EQ(props.GetCacheBytesWrite(), - table_options.block_cache->GetUsage()); + static_cast(table_options.block_cache->GetUsage())); last_cache_bytes_read = props.GetCacheBytesRead(); } @@ -2046,7 +2662,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { // Cache miss, Bytes read from cache should not change ASSERT_EQ(props.GetCacheBytesRead(), last_cache_bytes_read); ASSERT_EQ(props.GetCacheBytesWrite(), - table_options.block_cache->GetUsage()); + static_cast(table_options.block_cache->GetUsage())); last_cache_bytes_read = props.GetCacheBytesRead(); } @@ -2060,7 +2676,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { // Cache hit, bytes read from cache should increase ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read); ASSERT_EQ(props.GetCacheBytesWrite(), - table_options.block_cache->GetUsage()); + static_cast(table_options.block_cache->GetUsage())); } // release the iterator so that the block cache can reset correctly. 
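// --- Editor's note (not part of the diff): the hunk above switches the test
// to the LRUCacheOptions constructor so it can pin the metadata charge policy.
// A small sketch of the same pattern in application code; sizes are arbitrary.
#include "rocksdb/cache.h"

std::shared_ptr<rocksdb::Cache> MakeBlockCache() {
  rocksdb::LRUCacheOptions co;
  co.capacity = 32 << 20;   // 32 MiB
  co.num_shard_bits = 4;    // 16 shards
  // Keep cache accounting independent of per-entry metadata overhead, which
  // the exact GetUsage() comparisons in the test above rely on.
  co.metadata_charge_policy = rocksdb::kDontChargeCacheMetadata;
  return rocksdb::NewLRUCache(co);
}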
iter.reset(); @@ -2134,11 +2750,11 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { MutableCFOptions moptions4(options); ASSERT_OK(c3.Reopen(ioptions4, moptions4)); reader = dynamic_cast(c3.GetTableReader()); - ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); PinnableSlice value; GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); ASSERT_OK(reader->Get(ReadOptions(), internal_key.Encode(), &get_context, moptions4.prefix_extractor.get())); ASSERT_STREQ(value.data(), "hello"); @@ -2221,21 +2837,25 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto reader = c.GetTableReader(); PinnableSlice value; - GetContext get_context(options.comparator, nullptr, nullptr, nullptr, - GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); - get_perf_context()->Reset(); - ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, - moptions.prefix_extractor.get())); - if (index_and_filter_in_cache) { - // data, index and filter block - ASSERT_EQ(get_perf_context()->block_read_count, 3); - } else { - // just the data block - ASSERT_EQ(get_perf_context()->block_read_count, 1); + { + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, true, nullptr, nullptr); + get_perf_context()->Reset(); + ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, + moptions.prefix_extractor.get())); + if (index_and_filter_in_cache) { + // data, index and filter block + ASSERT_EQ(get_perf_context()->block_read_count, 3); + ASSERT_EQ(get_perf_context()->index_block_read_count, 1); + ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); + } else { + // just the data block + ASSERT_EQ(get_perf_context()->block_read_count, 1); + } + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_STREQ(value.data(), "hello"); } - ASSERT_EQ(get_context.State(), GetContext::kFound); - ASSERT_STREQ(value.data(), "hello"); // Get non-existing key user_key = "does-not-exist"; @@ -2243,21 +2863,26 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { encoded_key = internal_key.Encode().ToString(); value.Reset(); - get_context = GetContext(options.comparator, nullptr, nullptr, nullptr, + { + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); - get_perf_context()->Reset(); - ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, - moptions.prefix_extractor.get())); - ASSERT_EQ(get_context.State(), GetContext::kNotFound); + nullptr, true, nullptr, nullptr); + get_perf_context()->Reset(); + ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kNotFound); + } if (index_and_filter_in_cache) { if (bloom_filter_type == 0) { // with block-based, we read index and then the filter ASSERT_EQ(get_perf_context()->block_read_count, 2); + ASSERT_EQ(get_perf_context()->index_block_read_count, 1); + ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); } else { // with full-filter, we read filter first and then we stop ASSERT_EQ(get_perf_context()->block_read_count, 1); + ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); } } else { // filter is already in memory 
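// --- Editor's note (not part of the diff): the updated assertions above rely
// on the per-block-type read counters in the perf context. A sketch of how the
// same counters can be read around a Get() in application or test code; the
// surrounding DB call is elided.
#include <cstdint>
#include "rocksdb/perf_context.h"
#include "rocksdb/perf_level.h"

void SampleBlockReadCounters() {
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableCount);
  rocksdb::get_perf_context()->Reset();
  // ... issue db->Get(...) here ...
  uint64_t total_reads  = rocksdb::get_perf_context()->block_read_count;
  uint64_t index_reads  = rocksdb::get_perf_context()->index_block_read_count;
  uint64_t filter_reads = rocksdb::get_perf_context()->filter_block_read_count;
  (void)total_reads; (void)index_reads; (void)filter_reads;
}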
and it figures out that the key doesn't @@ -2268,176 +2893,6 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { } } -// A wrapper around LRICache that also keeps track of data blocks (in contrast -// with the objects) in the cache. The class is very simple and can be used only -// for trivial tests. -class MockCache : public LRUCache { - public: - MockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio) - : LRUCache(capacity, num_shard_bits, strict_capacity_limit, - high_pri_pool_ratio) {} - Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle = nullptr, - Priority priority = Priority::LOW) override { - // Replace the deleter with our own so that we keep track of data blocks - // erased from the cache - deleters_[key.ToString()] = deleter; - return ShardedCache::Insert(key, value, charge, &MockDeleter, handle, - priority); - } - // This is called by the application right after inserting a data block - void TEST_mark_as_data_block(const Slice& key, size_t charge) override { - marked_data_in_cache_[key.ToString()] = charge; - marked_size_ += charge; - } - using DeleterFunc = void (*)(const Slice& key, void* value); - static std::map deleters_; - static std::map marked_data_in_cache_; - static size_t marked_size_; - static void MockDeleter(const Slice& key, void* value) { - // If the item was marked for being data block, decrease its usage from the - // total data block usage of the cache - if (marked_data_in_cache_.find(key.ToString()) != - marked_data_in_cache_.end()) { - marked_size_ -= marked_data_in_cache_[key.ToString()]; - } - // Then call the origianl deleter - assert(deleters_.find(key.ToString()) != deleters_.end()); - auto deleter = deleters_[key.ToString()]; - deleter(key, value); - } -}; - -size_t MockCache::marked_size_ = 0; -std::map MockCache::deleters_; -std::map MockCache::marked_data_in_cache_; - -// Block cache can contain raw data blocks as well as general objects. If an -// object depends on the table to be live, it then must be destructed before the -// table is closed. This test makes sure that the only items remains in the -// cache after the table is closed are raw data blocks. -TEST_P(BlockBasedTableTest, NoObjectInCacheAfterTableClose) { - std::vector compression_types{kNoCompression}; - - // The following are the compression library versions supporting compression - // dictionaries. See the test case CacheCompressionDict in the - // DBBlockCacheTest suite. 
-#ifdef ZLIB - compression_types.push_back(kZlibCompression); -#endif // ZLIB -#if LZ4_VERSION_NUMBER >= 10400 - compression_types.push_back(kLZ4Compression); - compression_types.push_back(kLZ4HCCompression); -#endif // LZ4_VERSION_NUMBER >= 10400 -#if ZSTD_VERSION_NUMBER >= 500 - compression_types.push_back(kZSTD); -#endif // ZSTD_VERSION_NUMBER >= 500 - - for (int level: {-1, 0, 1, 10}) { - for (auto index_type : - {BlockBasedTableOptions::IndexType::kBinarySearch, - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}) { - for (bool block_based_filter : {true, false}) { - for (bool partition_filter : {true, false}) { - if (partition_filter && - (block_based_filter || - index_type != - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch)) { - continue; - } - for (bool index_and_filter_in_cache : {true, false}) { - for (bool pin_l0 : {true, false}) { - for (bool pin_top_level : {true, false}) { - if (pin_l0 && !index_and_filter_in_cache) { - continue; - } - - for (auto compression_type : compression_types) { - for (uint32_t max_dict_bytes : {0, 1 << 14}) { - if (compression_type == kNoCompression && max_dict_bytes) - continue; - - // Create a table - Options opt; - std::unique_ptr ikc; - ikc.reset(new test::PlainInternalKeyComparator( - opt.comparator)); - opt.compression = compression_type; - opt.compression_opts.max_dict_bytes = max_dict_bytes; - BlockBasedTableOptions table_options = - GetBlockBasedTableOptions(); - table_options.block_size = 1024; - table_options.index_type = index_type; - table_options.pin_l0_filter_and_index_blocks_in_cache = - pin_l0; - table_options.pin_top_level_index_and_filter = - pin_top_level; - table_options.partition_filters = partition_filter; - table_options.cache_index_and_filter_blocks = - index_and_filter_in_cache; - // big enough so we don't ever lose cached values. 
- table_options.block_cache = std::make_shared( - 16 * 1024 * 1024, 4, false, 0.0); - table_options.filter_policy.reset( - rocksdb::NewBloomFilterPolicy(10, block_based_filter)); - opt.table_factory.reset(NewBlockBasedTableFactory( - table_options)); - - bool convert_to_internal_key = false; - TableConstructor c(BytewiseComparator(), - convert_to_internal_key, level); - std::string user_key = "k01"; - std::string key = - InternalKey(user_key, 0, kTypeValue).Encode().ToString(); - c.Add(key, "hello"); - std::vector keys; - stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(opt); - const MutableCFOptions moptions(opt); - c.Finish(opt, ioptions, moptions, table_options, *ikc, - &keys, &kvmap); - - // Doing a read to make index/filter loaded into the cache - auto table_reader = - dynamic_cast(c.GetTableReader()); - PinnableSlice value; - GetContext get_context(opt.comparator, nullptr, nullptr, - nullptr, GetContext::kNotFound, user_key, &value, - nullptr, nullptr, nullptr, nullptr); - InternalKey ikey(user_key, 0, kTypeValue); - auto s = table_reader->Get(ReadOptions(), key, &get_context, - moptions.prefix_extractor.get()); - ASSERT_EQ(get_context.State(), GetContext::kFound); - ASSERT_STREQ(value.data(), "hello"); - - // Close the table - c.ResetTableReader(); - - auto usage = table_options.block_cache->GetUsage(); - auto pinned_usage = - table_options.block_cache->GetPinnedUsage(); - // The only usage must be for marked data blocks - ASSERT_EQ(usage, MockCache::marked_size_); - // There must be some pinned data since PinnableSlice has - // not released them yet - ASSERT_GT(pinned_usage, 0); - // Release pinnable slice reousrces - value.Reset(); - pinned_usage = table_options.block_cache->GetPinnedUsage(); - ASSERT_EQ(pinned_usage, 0); - } - } - } - } - } - } - } - } - } // level -} - TEST_P(BlockBasedTableTest, BlockCacheLeak) { // Check that when we reopen a table we don't lose access to blocks already // in the cache. 
This test checks whether the Table actually makes use of the @@ -2574,69 +3029,6 @@ TEST_P(BlockBasedTableTest, MemoryAllocator) { EXPECT_GT(custom_memory_allocator->numAllocations.load(), 0); } -TEST_P(BlockBasedTableTest, NewIndexIteratorLeak) { - // A regression test to avoid data race described in - // https://github.com/facebook/rocksdb/issues/1267 - TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); - std::vector keys; - stl_wrappers::KVMap kvmap; - c.Add("a1", "val1"); - Options options; - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); - table_options.index_type = BlockBasedTableOptions::kHashSearch; - table_options.cache_index_and_filter_blocks = true; - table_options.block_cache = NewLRUCache(0); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); - const MutableCFOptions moptions(options); - c.Finish(options, ioptions, moptions, table_options, - GetPlainInternalComparator(options.comparator), &keys, &kvmap); - - rocksdb::SyncPoint::GetInstance()->LoadDependencyAndMarkers( - { - {"BlockBasedTable::NewIndexIterator::thread1:1", - "BlockBasedTable::NewIndexIterator::thread2:2"}, - {"BlockBasedTable::NewIndexIterator::thread2:3", - "BlockBasedTable::NewIndexIterator::thread1:4"}, - }, - { - {"BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker", - "BlockBasedTable::NewIndexIterator::thread1:1"}, - {"BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker", - "BlockBasedTable::NewIndexIterator::thread1:4"}, - {"BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker", - "BlockBasedTable::NewIndexIterator::thread2:2"}, - {"BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker", - "BlockBasedTable::NewIndexIterator::thread2:3"}, - }); - - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - ReadOptions ro; - auto* reader = c.GetTableReader(); - - std::function func1 = [&]() { - TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker"); - // TODO(Zhongyi): update test to use MutableCFOptions - std::unique_ptr iter( - reader->NewIterator(ro, moptions.prefix_extractor.get())); - iter->Seek(InternalKey("a1", 0, kTypeValue).Encode()); - }; - - std::function func2 = [&]() { - TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker"); - std::unique_ptr iter( - reader->NewIterator(ro, moptions.prefix_extractor.get())); - }; - - auto thread1 = port::Thread(func1); - auto thread2 = port::Thread(func2); - thread1.join(); - thread2.join(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - c.ResetTableReader(); -} - // Plain table is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE TEST_F(PlainTableTest, BasicPlainTableProperties) { @@ -2912,7 +3304,8 @@ TEST_F(MemTableTest, Simple) { batch.DeleteRange(std::string("begin"), std::string("end")); ColumnFamilyMemTablesDefault cf_mems_default(memtable); ASSERT_TRUE( - WriteBatchInternal::InsertInto(&batch, &cf_mems_default, nullptr).ok()); + WriteBatchInternal::InsertInto(&batch, &cf_mems_default, nullptr, nullptr) + .ok()); for (int i = 0; i < 2; ++i) { Arena arena; @@ -3156,8 +3549,9 @@ TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { &kvmap); auto reader = c.GetTableReader(); - std::unique_ptr db_iter( - reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + std::unique_ptr db_iter(reader->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + 
/*skip_filters=*/false, TableReaderCaller::kUncategorized)); // Test point lookup for (auto& kv : kvmap) { @@ -3349,13 +3743,14 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { EnvOptions(), ikc), std::move(file_reader), ss_rw.contents().size(), &table_reader); - return table_reader->NewIterator(ReadOptions(), - moptions.prefix_extractor.get()); + return table_reader->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized); }; GetVersionAndGlobalSeqno(); - ASSERT_EQ(2, version); - ASSERT_EQ(0, global_seqno); + ASSERT_EQ(2u, version); + ASSERT_EQ(0u, global_seqno); InternalIterator* iter = GetTableInternalIter(); char current_c = 'a'; @@ -3375,8 +3770,8 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { // Update global sequence number to 10 SetGlobalSeqno(10); GetVersionAndGlobalSeqno(); - ASSERT_EQ(2, version); - ASSERT_EQ(10, global_seqno); + ASSERT_EQ(2u, version); + ASSERT_EQ(10u, global_seqno); iter = GetTableInternalIter(); current_c = 'a'; @@ -3412,8 +3807,8 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { // Update global sequence number to 3 SetGlobalSeqno(3); GetVersionAndGlobalSeqno(); - ASSERT_EQ(2, version); - ASSERT_EQ(3, global_seqno); + ASSERT_EQ(2u, version); + ASSERT_EQ(3u, global_seqno); iter = GetTableInternalIter(); current_c = 'a'; @@ -3521,7 +3916,8 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { std::move(file_reader), ss_rw.contents().size(), &table_reader)); std::unique_ptr db_iter(table_reader->NewIterator( - ReadOptions(), moptions2.prefix_extractor.get())); + ReadOptions(), moptions2.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); int expected_key = 1; for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { @@ -3590,7 +3986,7 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { ASSERT_OK(ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, &footer, kBlockBasedTableMagicNumber)); - auto BlockFetchHelper = [&](const BlockHandle& handle, + auto BlockFetchHelper = [&](const BlockHandle& handle, BlockType block_type, BlockContents* contents) { ReadOptions read_options; read_options.verify_checksums = false; @@ -3599,8 +3995,8 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { BlockFetcher block_fetcher( file, nullptr /* prefetch_buffer */, footer, read_options, handle, contents, ioptions, false /* decompress */, - false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options); + false /*maybe_compressed*/, block_type, + UncompressionDict::GetEmptyDict(), cache_options); ASSERT_OK(block_fetcher.ReadBlockContents()); }; @@ -3609,13 +4005,13 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { auto metaindex_handle = footer.metaindex_handle(); BlockContents metaindex_contents; - BlockFetchHelper(metaindex_handle, &metaindex_contents); + BlockFetchHelper(metaindex_handle, BlockType::kMetaIndex, + &metaindex_contents); Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); - std::unique_ptr meta_iter( - metaindex_block.NewIterator(BytewiseComparator(), - BytewiseComparator())); + std::unique_ptr meta_iter(metaindex_block.NewDataIterator( + BytewiseComparator(), BytewiseComparator())); bool found_properties_block = true; ASSERT_OK(SeekToPropertiesBlock(meta_iter.get(), &found_properties_block)); ASSERT_TRUE(found_properties_block); @@ -3626,11 +4022,12 @@ 
TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { ASSERT_OK(properties_handle.DecodeFrom(&v)); BlockContents properties_contents; - BlockFetchHelper(properties_handle, &properties_contents); + BlockFetchHelper(properties_handle, BlockType::kProperties, + &properties_contents); Block properties_block(std::move(properties_contents), kDisableGlobalSequenceNumber); - ASSERT_EQ(properties_block.NumRestarts(), 1); + ASSERT_EQ(properties_block.NumRestarts(), 1u); } } @@ -3685,16 +4082,16 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { BlockFetcher block_fetcher( table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(), metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, - false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - pcache_opts, nullptr /*memory_allocator*/); + false /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), pcache_opts, + nullptr /*memory_allocator*/); ASSERT_OK(block_fetcher.ReadBlockContents()); Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); // verify properties block comes last std::unique_ptr metaindex_iter{ - metaindex_block.NewIterator(options.comparator, - options.comparator)}; + metaindex_block.NewDataIterator(options.comparator, options.comparator)}; uint64_t max_offset = 0; std::string key_at_max_offset; for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); @@ -3812,8 +4209,9 @@ TEST_P(BlockBasedTableTest, DataBlockHashIndex) { auto reader = c.GetTableReader(); std::unique_ptr seek_iter; - seek_iter.reset( - reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + seek_iter.reset(reader->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); for (int i = 0; i < 2; ++i) { ReadOptions ro; // for every kv, we seek using two method: Get() and Seek() @@ -3837,7 +4235,7 @@ TEST_P(BlockBasedTableTest, DataBlockHashIndex) { std::string user_key = ExtractUserKey(kv.first).ToString(); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); ASSERT_OK(reader->Get(ro, kv.first, &get_context, moptions.prefix_extractor.get())); ASSERT_EQ(get_context.State(), GetContext::kFound); @@ -3863,7 +4261,7 @@ TEST_P(BlockBasedTableTest, DataBlockHashIndex) { PinnableSlice value; GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); ASSERT_OK(reader->Get(ro, encoded_key, &get_context, moptions.prefix_extractor.get())); ASSERT_EQ(get_context.State(), GetContext::kNotFound); @@ -3894,13 +4292,15 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnSeek) { Slice upper_bound_slice(upper_bound); read_opt.iterate_upper_bound = &upper_bound_slice; std::unique_ptr iter; - iter.reset(new KeyConvertingIterator( - reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter.reset(new KeyConvertingIterator(reader->NewIterator( + read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->SeekToFirst(); ASSERT_FALSE(iter->Valid()); ASSERT_TRUE(iter->IsOutOfBound()); - iter.reset(new KeyConvertingIterator( - reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter.reset(new KeyConvertingIterator(reader->NewIterator( + 
read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->Seek("foo"); ASSERT_FALSE(iter->Valid()); ASSERT_TRUE(iter->IsOutOfBound()); @@ -3930,8 +4330,9 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnNext) { Slice ub_slice1(ub1); read_opt.iterate_upper_bound = &ub_slice1; std::unique_ptr iter; - iter.reset(new KeyConvertingIterator( - reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter.reset(new KeyConvertingIterator(reader->NewIterator( + read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->Seek("bar"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("bar", iter->key()); @@ -3941,8 +4342,9 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnNext) { std::string ub2 = "foo_after"; Slice ub_slice2(ub2); read_opt.iterate_upper_bound = &ub_slice2; - iter.reset(new KeyConvertingIterator( - reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter.reset(new KeyConvertingIterator(reader->NewIterator( + read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->Seek("foo"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("foo", iter->key()); diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index a8f617dee29..1cb00b63928 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -9,21 +9,21 @@ #include "table/two_level_iterator.h" #include "db/pinned_iterators_manager.h" +#include "memory/arena.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/block.h" +#include "table/block_based/block.h" #include "table/format.h" -#include "util/arena.h" namespace rocksdb { namespace { -class TwoLevelIndexIterator : public InternalIteratorBase { +class TwoLevelIndexIterator : public InternalIteratorBase { public: explicit TwoLevelIndexIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter); + InternalIteratorBase* first_level_iter); ~TwoLevelIndexIterator() override { first_level_iter_.DeleteIter(false /* is_arena_mode */); @@ -43,7 +43,7 @@ class TwoLevelIndexIterator : public InternalIteratorBase { assert(Valid()); return second_level_iter_.key(); } - BlockHandle value() const override { + IndexValue value() const override { assert(Valid()); return second_level_iter_.value(); } @@ -69,12 +69,12 @@ class TwoLevelIndexIterator : public InternalIteratorBase { } void SkipEmptyDataBlocksForward(); void SkipEmptyDataBlocksBackward(); - void SetSecondLevelIterator(InternalIteratorBase* iter); + void SetSecondLevelIterator(InternalIteratorBase* iter); void InitDataBlock(); TwoLevelIteratorState* state_; - IteratorWrapperBase first_level_iter_; - IteratorWrapperBase second_level_iter_; // May be nullptr + IteratorWrapperBase first_level_iter_; + IteratorWrapperBase second_level_iter_; // May be nullptr Status status_; // If second_level_iter is non-nullptr, then "data_block_handle_" holds the // "index_value" passed to block_function_ to create the second_level_iter. 
@@ -83,7 +83,7 @@ class TwoLevelIndexIterator : public InternalIteratorBase { TwoLevelIndexIterator::TwoLevelIndexIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter) + InternalIteratorBase* first_level_iter) : state_(state), first_level_iter_(first_level_iter) {} void TwoLevelIndexIterator::Seek(const Slice& target) { @@ -177,8 +177,8 @@ void TwoLevelIndexIterator::SkipEmptyDataBlocksBackward() { } void TwoLevelIndexIterator::SetSecondLevelIterator( - InternalIteratorBase* iter) { - InternalIteratorBase* old_iter = second_level_iter_.Set(iter); + InternalIteratorBase* iter) { + InternalIteratorBase* old_iter = second_level_iter_.Set(iter); delete old_iter; } @@ -186,14 +186,14 @@ void TwoLevelIndexIterator::InitDataBlock() { if (!first_level_iter_.Valid()) { SetSecondLevelIterator(nullptr); } else { - BlockHandle handle = first_level_iter_.value(); + BlockHandle handle = first_level_iter_.value().handle; if (second_level_iter_.iter() != nullptr && !second_level_iter_.status().IsIncomplete() && handle.offset() == data_block_handle_.offset()) { // second_level_iter is already constructed with this iterator, so // no need to change anything } else { - InternalIteratorBase* iter = + InternalIteratorBase* iter = state_->NewSecondaryIterator(handle); data_block_handle_ = handle; SetSecondLevelIterator(iter); @@ -203,9 +203,9 @@ void TwoLevelIndexIterator::InitDataBlock() { } // namespace -InternalIteratorBase* NewTwoLevelIterator( +InternalIteratorBase* NewTwoLevelIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter) { + InternalIteratorBase* first_level_iter) { return new TwoLevelIndexIterator(state, first_level_iter); } } // namespace rocksdb diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index 55d5c01a4ae..545c29f493e 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -22,11 +22,10 @@ struct TwoLevelIteratorState { TwoLevelIteratorState() {} virtual ~TwoLevelIteratorState() {} - virtual InternalIteratorBase* NewSecondaryIterator( + virtual InternalIteratorBase* NewSecondaryIterator( const BlockHandle& handle) = 0; }; - // Return a new two level iterator. A two-level iterator contains an // index iterator whose values point to a sequence of blocks where // each block is itself a sequence of key,value pairs. The returned @@ -37,8 +36,8 @@ struct TwoLevelIteratorState { // Uses a supplied function to convert an index_iter value into // an iterator over the contents of the corresponding block. // Note: this function expects first_level_iter was not created using the arena -extern InternalIteratorBase* NewTwoLevelIterator( +extern InternalIteratorBase* NewTwoLevelIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter); + InternalIteratorBase* first_level_iter); } // namespace rocksdb diff --git a/util/fault_injection_test_env.cc b/test_util/fault_injection_test_env.cc similarity index 84% rename from util/fault_injection_test_env.cc rename to test_util/fault_injection_test_env.cc index 9cad23871b6..5c47b7ea455 100644 --- a/util/fault_injection_test_env.cc +++ b/test_util/fault_injection_test_env.cc @@ -11,7 +11,7 @@ // the last "sync". It then checks for data loss errors by purposely dropping // file data (or entire files) not protected by a "sync". 
-#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include #include @@ -98,6 +98,9 @@ Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const { } Status TestDirectory::Fsync() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } env_->SyncDir(dirname_); return dir_->Fsync(); } @@ -158,6 +161,53 @@ Status TestWritableFile::Sync() { return Status::OK(); } +TestRandomRWFile::TestRandomRWFile(const std::string& /*fname*/, + std::unique_ptr&& f, + FaultInjectionTestEnv* env) + : target_(std::move(f)), file_opened_(true), env_(env) { + assert(target_ != nullptr); +} + +TestRandomRWFile::~TestRandomRWFile() { + if (file_opened_) { + Close(); + } +} + +Status TestRandomRWFile::Write(uint64_t offset, const Slice& data) { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Write(offset, data); +} + +Status TestRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Read(offset, n, result, scratch); +} + +Status TestRandomRWFile::Close() { + file_opened_ = false; + return target_->Close(); +} + +Status TestRandomRWFile::Flush() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Flush(); +} + +Status TestRandomRWFile::Sync() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Sync(); +} + Status FaultInjectionTestEnv::NewDirectory(const std::string& name, std::unique_ptr* result) { std::unique_ptr r; @@ -220,6 +270,27 @@ Status FaultInjectionTestEnv::ReopenWritableFile( return s; } +Status FaultInjectionTestEnv::NewRandomRWFile( + const std::string& fname, std::unique_ptr* result, + const EnvOptions& soptions) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status s = target()->NewRandomRWFile(fname, result, soptions); + if (s.ok()) { + result->reset(new TestRandomRWFile(fname, std::move(*result), this)); + // WritableFileWriter* file is opened + // again then it will be truncated - so forget our saved state. + UntrackFile(fname); + MutexLock l(&mutex_); + open_files_.insert(fname); + auto dir_and_name = GetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list.insert(dir_and_name.second); + } + return s; +} + Status FaultInjectionTestEnv::NewRandomAccessFile( const std::string& fname, std::unique_ptr* result, const EnvOptions& soptions) { @@ -238,7 +309,6 @@ Status FaultInjectionTestEnv::DeleteFile(const std::string& f) { fprintf(stderr, "Cannot delete file %s: %s\n", f.c_str(), s.ToString().c_str()); } - assert(s.ok()); if (s.ok()) { UntrackFile(f); } diff --git a/util/fault_injection_test_env.h b/test_util/fault_injection_test_env.h similarity index 85% rename from util/fault_injection_test_env.h rename to test_util/fault_injection_test_env.h index a39e5b71e9d..b68b3faedce 100644 --- a/util/fault_injection_test_env.h +++ b/test_util/fault_injection_test_env.h @@ -19,9 +19,9 @@ #include "db/version_set.h" #include "env/mock_env.h" +#include "file/filename.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "util/filename.h" #include "util/mutexlock.h" #include "util/random.h" @@ -82,6 +82,31 @@ class TestWritableFile : public WritableFile { FaultInjectionTestEnv* env_; }; +// A wrapper around WritableFileWriter* file +// is written to or sync'ed. 
+class TestRandomRWFile : public RandomRWFile { + public: + explicit TestRandomRWFile(const std::string& fname, + std::unique_ptr&& f, + FaultInjectionTestEnv* env); + virtual ~TestRandomRWFile(); + Status Write(uint64_t offset, const Slice& data) override; + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + Status Close() override; + Status Flush() override; + Status Sync() override; + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + bool use_direct_io() const override { return target_->use_direct_io(); }; + + private: + std::unique_ptr target_; + bool file_opened_; + FaultInjectionTestEnv* env_; +}; + class TestDirectory : public Directory { public: explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname, @@ -114,6 +139,10 @@ class FaultInjectionTestEnv : public EnvWrapper { std::unique_ptr* result, const EnvOptions& soptions) override; + Status NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& soptions) override; + Status NewRandomAccessFile(const std::string& fname, std::unique_ptr* result, const EnvOptions& soptions) override; diff --git a/util/mock_time_env.h b/test_util/mock_time_env.h similarity index 100% rename from util/mock_time_env.h rename to test_util/mock_time_env.h diff --git a/util/sync_point.cc b/test_util/sync_point.cc similarity index 95% rename from util/sync_point.cc rename to test_util/sync_point.cc index 4599c256d9f..a09be9e8fa1 100644 --- a/util/sync_point.cc +++ b/test_util/sync_point.cc @@ -3,8 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/sync_point.h" -#include "util/sync_point_impl.h" +#include "test_util/sync_point.h" +#include "test_util/sync_point_impl.h" int rocksdb_kill_odds = 0; std::vector rocksdb_kill_prefix_blacklist; diff --git a/util/sync_point.h b/test_util/sync_point.h similarity index 100% rename from util/sync_point.h rename to test_util/sync_point.h diff --git a/util/sync_point_impl.cc b/test_util/sync_point_impl.cc similarity index 98% rename from util/sync_point_impl.cc rename to test_util/sync_point_impl.cc index 248c381a328..db44f472a05 100644 --- a/util/sync_point_impl.cc +++ b/test_util/sync_point_impl.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/sync_point_impl.h" +#include "test_util/sync_point_impl.h" #ifndef NDEBUG namespace rocksdb { diff --git a/util/sync_point_impl.h b/test_util/sync_point_impl.h similarity index 98% rename from util/sync_point_impl.h rename to test_util/sync_point_impl.h index 3c7e7049183..d96d7325786 100644 --- a/util/sync_point_impl.h +++ b/test_util/sync_point_impl.h @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include #include diff --git a/util/testharness.cc b/test_util/testharness.cc similarity index 97% rename from util/testharness.cc rename to test_util/testharness.cc index 8f5eb2a4d6e..62cc535a198 100644 --- a/util/testharness.cc +++ b/test_util/testharness.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
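// --- Editor's note (not part of the diff): a hedged sketch of how the
// relocated FaultInjectionTestEnv (now with RandomRWFile support) is typically
// driven in a test. SetFilesystemActive() and DropUnsyncedFileData() are
// assumed from the existing class interface; the DB path is arbitrary.
#include "rocksdb/db.h"
#include "test_util/fault_injection_test_env.h"

void FaultInjectionSketch() {
  std::unique_ptr<rocksdb::FaultInjectionTestEnv> fault_env(
      new rocksdb::FaultInjectionTestEnv(rocksdb::Env::Default()));
  rocksdb::Options options;
  options.env = fault_env.get();
  options.create_if_missing = true;
  rocksdb::DB* db = nullptr;
  rocksdb::DB::Open(options, "/tmp/fault_injection_sketch", &db);
  // ... write some data, intentionally without syncing ...
  fault_env->DropUnsyncedFileData();      // simulate losing unsynced writes
  fault_env->SetFilesystemActive(false);  // make every further I/O fail
  // ... assert that subsequent operations surface the injected error ...
  fault_env->SetFilesystemActive(true);
  delete db;
}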
-#include "util/testharness.h" +#include "test_util/testharness.h" #include #include diff --git a/util/testharness.h b/test_util/testharness.h similarity index 100% rename from util/testharness.h rename to test_util/testharness.h diff --git a/util/testutil.cc b/test_util/testutil.cc similarity index 90% rename from util/testutil.cc rename to test_util/testutil.cc index b6493258f60..7c90c14efee 100644 --- a/util/testutil.cc +++ b/test_util/testutil.cc @@ -7,20 +7,24 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/testutil.h" +#include "test_util/testutil.h" +#include #include +#include #include #include "db/memtable_list.h" +#include "file/random_access_file_reader.h" +#include "file/sequence_file_reader.h" +#include "file/writable_file_writer.h" #include "port/port.h" -#include "util/file_reader_writer.h" namespace rocksdb { namespace test { const uint32_t kDefaultFormatVersion = BlockBasedTableOptions().format_version; -const uint32_t kLatestFormatVersion = 4u; +const uint32_t kLatestFormatVersion = 5u; Slice RandomString(Random* rnd, int len, std::string* dst) { dst->resize(len); @@ -162,7 +166,11 @@ std::string RandomName(Random* rnd, const size_t len) { } CompressionType RandomCompressionType(Random* rnd) { - return static_cast(rnd->Uniform(6)); + auto ret = static_cast(rnd->Uniform(6)); + while (!CompressionTypeSupported(ret)) { + ret = static_cast((static_cast(ret) + 1) % 6); + } + return ret; } void RandomCompressionTypeVector(const size_t count, @@ -193,8 +201,12 @@ BlockBasedTableOptions RandomBlockBasedTableOptions(Random* rnd) { opt.cache_index_and_filter_blocks = rnd->Uniform(2); opt.pin_l0_filter_and_index_blocks_in_cache = rnd->Uniform(2); opt.pin_top_level_index_and_filter = rnd->Uniform(2); - opt.index_type = rnd->Uniform(2) ? 
BlockBasedTableOptions::kBinarySearch - : BlockBasedTableOptions::kHashSearch; + using IndexType = BlockBasedTableOptions::IndexType; + const std::array index_types = { + {IndexType::kBinarySearch, IndexType::kHashSearch, + IndexType::kTwoLevelIndexSearch, IndexType::kBinarySearchWithFirstKey}}; + opt.index_type = + index_types[rnd->Uniform(static_cast(index_types.size()))]; opt.hash_index_allow_collision = rnd->Uniform(2); opt.checksum = static_cast(rnd->Uniform(3)); opt.block_size = rnd->Uniform(10000000); @@ -293,7 +305,8 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) { db_opt->stats_dump_period_sec = rnd->Uniform(100000); } -void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd) { +void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options, + Random* rnd) { cf_opt->compaction_style = (CompactionStyle)(rnd->Uniform(4)); // boolean options @@ -322,6 +335,7 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd) { cf_opt->max_mem_compaction_level = rnd->Uniform(100); cf_opt->max_write_buffer_number = rnd->Uniform(100); cf_opt->max_write_buffer_number_to_maintain = rnd->Uniform(100); + cf_opt->max_write_buffer_size_to_maintain = rnd->Uniform(10000); cf_opt->min_write_buffer_number_to_merge = rnd->Uniform(100); cf_opt->num_levels = rnd->Uniform(100); cf_opt->target_file_size_multiplier = rnd->Uniform(100); @@ -345,8 +359,10 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd) { // uint64_t options static const uint64_t uint_max = static_cast(UINT_MAX); - cf_opt->ttl = uint_max + rnd->Uniform(10000); - cf_opt->periodic_compaction_seconds = uint_max + rnd->Uniform(10000); + cf_opt->ttl = + db_options.max_open_files == -1 ? uint_max + rnd->Uniform(10000) : 0; + cf_opt->periodic_compaction_seconds = + db_options.max_open_files == -1 ? 
uint_max + rnd->Uniform(10000) : 0; cf_opt->max_sequential_skip_in_iterations = uint_max + rnd->Uniform(10000); cf_opt->target_file_size_base = uint_max + rnd->Uniform(10000); cf_opt->max_compaction_bytes = @@ -414,5 +430,22 @@ bool IsDirectIOSupported(Env* env, const std::string& dir) { return s.ok(); } +size_t GetLinesCount(const std::string& fname, const std::string& pattern) { + std::stringstream ssbuf; + std::string line; + size_t count = 0; + + std::ifstream inFile(fname.c_str()); + ssbuf << inFile.rdbuf(); + + while (getline(ssbuf, line)) { + if (line.find(pattern) != std::string::npos) { + count++; + } + } + + return count; +} + } // namespace test } // namespace rocksdb diff --git a/util/testutil.h b/test_util/testutil.h similarity index 81% rename from util/testutil.h rename to test_util/testutil.h index 2aab3df72c4..716ae7d26e8 100644 --- a/util/testutil.h +++ b/test_util/testutil.h @@ -20,9 +20,9 @@ #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/table.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/internal_iterator.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "util/mutexlock.h" #include "util/random.h" @@ -492,13 +492,11 @@ inline std::string EncodeInt(uint64_t x) { return result; } -class StringEnv : public EnvWrapper { - public: class SeqStringSource : public SequentialFile { public: explicit SeqStringSource(const std::string& data) : data_(data), offset_(0) {} - ~SeqStringSource() {} + ~SeqStringSource() override {} Status Read(size_t n, Slice* result, char* scratch) override { std::string output; if (offset_ < data_.size()) { @@ -527,129 +525,136 @@ class StringEnv : public EnvWrapper { size_t offset_; }; - class StringSink : public WritableFile { + class StringEnv : public EnvWrapper { public: - explicit StringSink(std::string* contents) - : WritableFile(), contents_(contents) {} - virtual Status Truncate(uint64_t size) override { - contents_->resize(static_cast(size)); - return Status::OK(); - } - virtual Status Close() override { return Status::OK(); } - virtual Status Flush() override { return Status::OK(); } - virtual Status Sync() override { return Status::OK(); } - virtual Status Append(const Slice& slice) override { - contents_->append(slice.data(), slice.size()); - return Status::OK(); - } + class StringSink : public WritableFile { + public: + explicit StringSink(std::string* contents) + : WritableFile(), contents_(contents) {} + virtual Status Truncate(uint64_t size) override { + contents_->resize(static_cast(size)); + return Status::OK(); + } + virtual Status Close() override { return Status::OK(); } + virtual Status Flush() override { return Status::OK(); } + virtual Status Sync() override { return Status::OK(); } + virtual Status Append(const Slice& slice) override { + contents_->append(slice.data(), slice.size()); + return Status::OK(); + } - private: - std::string* contents_; - }; + private: + std::string* contents_; + }; - explicit StringEnv(Env* t) : EnvWrapper(t) {} - virtual ~StringEnv() {} + explicit StringEnv(Env* t) : EnvWrapper(t) {} + ~StringEnv() override {} - const std::string& GetContent(const std::string& f) { return files_[f]; } + const std::string& GetContent(const std::string& f) { return files_[f]; } - const Status WriteToNewFile(const std::string& file_name, - const std::string& content) { - std::unique_ptr r; - auto s = NewWritableFile(file_name, &r, EnvOptions()); - if (!s.ok()) { - 
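// --- Editor's note (not part of the diff): example use of the
// test::GetLinesCount() helper added above, e.g. counting how often a pattern
// appears in an info LOG; the path and pattern are arbitrary.
#include "test_util/testutil.h"

size_t flush_lines =
    rocksdb::test::GetLinesCount("/tmp/rocksdbtest/LOG", "flush_started");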
return s; + const Status WriteToNewFile(const std::string& file_name, + const std::string& content) { + std::unique_ptr r; + auto s = NewWritableFile(file_name, &r, EnvOptions()); + if (!s.ok()) { + return s; + } + r->Append(content); + r->Flush(); + r->Close(); + assert(files_[file_name] == content); + return Status::OK(); } - r->Append(content); - r->Flush(); - r->Close(); - assert(files_[file_name] == content); - return Status::OK(); - } - // The following text is boilerplate that forwards all methods to target() - Status NewSequentialFile(const std::string& f, - std::unique_ptr* r, + // The following text is boilerplate that forwards all methods to target() + Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& /*options*/) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return Status::NotFound("The specified file does not exist", f); + } + r->reset(new SeqStringSource(iter->second)); + return Status::OK(); + } + Status NewRandomAccessFile(const std::string& /*f*/, + std::unique_ptr* /*r*/, + const EnvOptions& /*options*/) override { + return Status::NotSupported(); + } + Status NewWritableFile(const std::string& f, + std::unique_ptr* r, const EnvOptions& /*options*/) override { - auto iter = files_.find(f); - if (iter == files_.end()) { - return Status::NotFound("The specified file does not exist", f); + auto iter = files_.find(f); + if (iter != files_.end()) { + return Status::IOError("The specified file already exists", f); + } + r->reset(new StringSink(&files_[f])); + return Status::OK(); } - r->reset(new SeqStringSource(iter->second)); - return Status::OK(); - } - Status NewRandomAccessFile(const std::string& /*f*/, - std::unique_ptr* /*r*/, - const EnvOptions& /*options*/) override { - return Status::NotSupported(); - } - Status NewWritableFile(const std::string& f, std::unique_ptr* r, - const EnvOptions& /*options*/) override { - auto iter = files_.find(f); - if (iter != files_.end()) { - return Status::IOError("The specified file already exists", f); + virtual Status NewDirectory( + const std::string& /*name*/, + std::unique_ptr* /*result*/) override { + return Status::NotSupported(); } - r->reset(new StringSink(&files_[f])); - return Status::OK(); - } - virtual Status NewDirectory(const std::string& /*name*/, - std::unique_ptr* /*result*/) override { - return Status::NotSupported(); - } - Status FileExists(const std::string& f) override { - if (files_.find(f) == files_.end()) { - return Status::NotFound(); + Status FileExists(const std::string& f) override { + if (files_.find(f) == files_.end()) { + return Status::NotFound(); + } + return Status::OK(); } - return Status::OK(); - } - Status GetChildren(const std::string& /*dir*/, - std::vector* /*r*/) override { - return Status::NotSupported(); - } - Status DeleteFile(const std::string& f) override { - files_.erase(f); - return Status::OK(); - } - Status CreateDir(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status CreateDirIfMissing(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status DeleteDir(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status GetFileSize(const std::string& f, uint64_t* s) override { - auto iter = files_.find(f); - if (iter == files_.end()) { - return Status::NotFound("The specified file does not exist:", f); + Status GetChildren(const std::string& /*dir*/, + std::vector* /*r*/) override { + return Status::NotSupported(); + } + Status DeleteFile(const 
std::string& f) override { + files_.erase(f); + return Status::OK(); + } + Status CreateDir(const std::string& /*d*/) override { + return Status::NotSupported(); + } + Status CreateDirIfMissing(const std::string& /*d*/) override { + return Status::NotSupported(); + } + Status DeleteDir(const std::string& /*d*/) override { + return Status::NotSupported(); + } + Status GetFileSize(const std::string& f, uint64_t* s) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return Status::NotFound("The specified file does not exist:", f); + } + *s = iter->second.size(); + return Status::OK(); } - *s = iter->second.size(); - return Status::OK(); - } - Status GetFileModificationTime(const std::string& /*fname*/, - uint64_t* /*file_mtime*/) override { - return Status::NotSupported(); - } + Status GetFileModificationTime(const std::string& /*fname*/, + uint64_t* /*file_mtime*/) override { + return Status::NotSupported(); + } - Status RenameFile(const std::string& /*s*/, - const std::string& /*t*/) override { - return Status::NotSupported(); - } + Status RenameFile(const std::string& /*s*/, + const std::string& /*t*/) override { + return Status::NotSupported(); + } - Status LinkFile(const std::string& /*s*/, const std::string& /*t*/) override { - return Status::NotSupported(); - } + Status LinkFile(const std::string& /*s*/, + const std::string& /*t*/) override { + return Status::NotSupported(); + } - Status LockFile(const std::string& /*f*/, FileLock** /*l*/) override { - return Status::NotSupported(); - } + Status LockFile(const std::string& /*f*/, FileLock** /*l*/) override { + return Status::NotSupported(); + } - Status UnlockFile(FileLock* /*l*/) override { return Status::NotSupported(); } + Status UnlockFile(FileLock* /*l*/) override { + return Status::NotSupported(); + } - protected: - std::unordered_map files_; -}; + protected: + std::unordered_map files_; + }; // Randomly initialize the given DBOptions void RandomInitDBOptions(DBOptions* db_opt, Random* rnd); @@ -657,7 +662,7 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd); // Randomly initialize the given ColumnFamilyOptions // Note that the caller is responsible for releasing non-null // cf_opt->compaction_filter. -void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd); +void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions&, Random* rnd); // A dummy merge operator which can change its name class ChanglingMergeOperator : public MergeOperator { @@ -750,5 +755,8 @@ Status DestroyDir(Env* env, const std::string& dir); bool IsDirectIOSupported(Env* env, const std::string& dir); +// Return the number of lines where a given pattern was found in a file. +size_t GetLinesCount(const std::string& fname, const std::string& pattern); + } // namespace test } // namespace rocksdb diff --git a/util/transaction_test_util.cc b/test_util/transaction_test_util.cc similarity index 97% rename from util/transaction_test_util.cc rename to test_util/transaction_test_util.cc index 30cff11e14d..7043eb2d789 100644 --- a/util/transaction_test_util.cc +++ b/test_util/transaction_test_util.cc @@ -4,14 +4,10 @@ // (found in the LICENSE.Apache file in the root directory). 
#ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif +#include "test_util/transaction_test_util.h" -#include "util/transaction_test_util.h" - -#include #include +#include #include #include #include @@ -24,7 +20,7 @@ #include "db/dbformat.h" #include "db/snapshot_impl.h" -#include "util/logging.h" +#include "logging/logging.h" #include "util/random.h" #include "util/string_util.h" @@ -205,6 +201,12 @@ bool RandomTransactionInserter::DoInsert(DB* db, Transaction* txn, ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, "Prepare of %" PRIu64 " %s (%s)", txn->GetId(), s.ToString().c_str(), txn->GetName().c_str()); + if (rand_->OneIn(20)) { + // This currently only tests the mechanics of writing commit time + // write batch so the exact values would not matter. + s = txn_->GetCommitTimeWriteBatch()->Put("cat", "dog"); + assert(s.ok()); + } db->GetDBOptions().env->SleepForMicroseconds( static_cast(cmt_delay_ms_ * 1000)); } diff --git a/util/transaction_test_util.h b/test_util/transaction_test_util.h similarity index 100% rename from util/transaction_test_util.h rename to test_util/transaction_test_util.h diff --git a/third-party/folly/folly/CPortability.h b/third-party/folly/folly/CPortability.h new file mode 100644 index 00000000000..56cb6b1a58c --- /dev/null +++ b/third-party/folly/folly/CPortability.h @@ -0,0 +1,27 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +/** + * Macro for marking functions as having public visibility. + */ +#if defined(__GNUC__) +#define FOLLY_EXPORT __attribute__((__visibility__("default"))) +#else +#define FOLLY_EXPORT +#endif + +#if defined(__has_feature) +#define FOLLY_HAS_FEATURE(...) __has_feature(__VA_ARGS__) +#else +#define FOLLY_HAS_FEATURE(...) 0 +#endif + +#if FOLLY_HAS_FEATURE(thread_sanitizer) || __SANITIZE_THREAD__ +#ifndef FOLLY_SANITIZE_THREAD +#define FOLLY_SANITIZE_THREAD 1 +#endif +#endif diff --git a/third-party/folly/folly/ConstexprMath.h b/third-party/folly/folly/ConstexprMath.h new file mode 100644 index 00000000000..f09167e0d42 --- /dev/null +++ b/third-party/folly/folly/ConstexprMath.h @@ -0,0 +1,45 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +namespace folly { +template +constexpr T constexpr_max(T a) { + return a; +} +template +constexpr T constexpr_max(T a, T b, Ts... ts) { + return b < a ? constexpr_max(a, ts...) : constexpr_max(b, ts...); +} + +namespace detail { +template +constexpr T constexpr_log2_(T a, T e) { + return e == T(1) ? a : constexpr_log2_(a + T(1), e / T(2)); +} + +template +constexpr T constexpr_log2_ceil_(T l2, T t) { + return l2 + T(T(1) << l2 < t ? 
1 : 0); +} + +template +constexpr T constexpr_square_(T t) { + return t * t; +} +} // namespace detail + +template +constexpr T constexpr_log2(T t) { + return detail::constexpr_log2_(T(0), t); +} + +template +constexpr T constexpr_log2_ceil(T t) { + return detail::constexpr_log2_ceil_(constexpr_log2(t), t); +} + +} // namespace folly diff --git a/third-party/folly/folly/Indestructible.h b/third-party/folly/folly/Indestructible.h new file mode 100644 index 00000000000..68249d86512 --- /dev/null +++ b/third-party/folly/folly/Indestructible.h @@ -0,0 +1,166 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include + +namespace folly { + +/*** + * Indestructible + * + * When you need a Meyers singleton that will not get destructed, even at + * shutdown, and you also want the object stored inline. + * + * Use like: + * + * void doSomethingWithExpensiveData(); + * + * void doSomethingWithExpensiveData() { + * static const Indestructible> data{ + * map{{"key1", 17}, {"key2", 19}, {"key3", 23}}, + * }; + * callSomethingTakingAMapByRef(*data); + * } + * + * This should be used only for Meyers singletons, and, even then, only when + * the instance does not need to be destructed ever. + * + * This should not be used more generally, e.g., as member fields, etc. + * + * This is designed as an alternative, but with one fewer allocation at + * construction time and one fewer pointer dereference at access time, to the + * Meyers singleton pattern of: + * + * void doSomethingWithExpensiveData() { + * static const auto data = // never `delete`d + * new map{{"key1", 17}, {"key2", 19}, {"key3", 23}}; + * callSomethingTakingAMapByRef(*data); + * } + */ + +template +class Indestructible final { + public: + template + constexpr Indestructible() noexcept(noexcept(T())) {} + + /** + * Constructor accepting a single argument by forwarding reference, this + * allows using list initialzation without the overhead of things like + * in_place, etc and also works with std::initializer_list constructors + * which can't be deduced, the default parameter helps there. + * + * auto i = folly::Indestructible>{{{1, 2}}}; + * + * This provides convenience + * + * There are two versions of this constructor - one for when the element is + * implicitly constructible from the given argument and one for when the + * type is explicitly but not implicitly constructible from the given + * argument. + */ + template < + typename U = T, + _t::value>>* = nullptr, + _t, remove_cvref_t>::value>>* = + nullptr, + _t::value>>* = nullptr> + explicit constexpr Indestructible(U&& u) noexcept( + noexcept(T(std::declval()))) + : storage_(std::forward(u)) {} + template < + typename U = T, + _t::value>>* = nullptr, + _t, remove_cvref_t>::value>>* = + nullptr, + _t::value>>* = nullptr> + /* implicit */ constexpr Indestructible(U&& u) noexcept( + noexcept(T(std::declval()))) + : storage_(std::forward(u)) {} + + template ()...))> + explicit constexpr Indestructible(Args&&... args) noexcept( + noexcept(T(std::declval()...))) + : storage_(std::forward(args)...) {} + template < + typename U, + typename... Args, + typename = decltype( + T(std::declval&>(), + std::declval()...))> + explicit constexpr Indestructible(std::initializer_list il, Args... 
args) noexcept( + noexcept( + T(std::declval&>(), + std::declval()...))) + : storage_(il, std::forward(args)...) {} + + ~Indestructible() = default; + + Indestructible(Indestructible const&) = delete; + Indestructible& operator=(Indestructible const&) = delete; + + Indestructible(Indestructible&& other) noexcept( + noexcept(T(std::declval()))) + : storage_(std::move(other.storage_.value)) { + other.erased_ = true; + } + Indestructible& operator=(Indestructible&& other) noexcept( + noexcept(T(std::declval()))) { + storage_.value = std::move(other.storage_.value); + other.erased_ = true; + } + + T* get() noexcept { + check(); + return &storage_.value; + } + T const* get() const noexcept { + check(); + return &storage_.value; + } + T& operator*() noexcept { + return *get(); + } + T const& operator*() const noexcept { + return *get(); + } + T* operator->() noexcept { + return get(); + } + T const* operator->() const noexcept { + return get(); + } + + private: + void check() const noexcept { + assert(!erased_); + } + + union Storage { + T value; + + template + constexpr Storage() noexcept(noexcept(T())) : value() {} + + template ()...))> + explicit constexpr Storage(Args&&... args) noexcept( + noexcept(T(std::declval()...))) + : value(std::forward(args)...) {} + + ~Storage() {} + }; + + Storage storage_{}; + bool erased_{false}; +}; +} // namespace folly diff --git a/third-party/folly/folly/Optional.h b/third-party/folly/folly/Optional.h new file mode 100644 index 00000000000..ee12467dda7 --- /dev/null +++ b/third-party/folly/folly/Optional.h @@ -0,0 +1,570 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +/* + * Optional - For conditional initialization of values, like boost::optional, + * but with support for move semantics and emplacement. Reference type support + * has not been included due to limited use cases and potential confusion with + * semantics of assignment: Assigning to an optional reference could quite + * reasonably copy its value or redirect the reference. 
+ * + * Optional can be useful when a variable might or might not be needed: + * + * Optional maybeLogger = ...; + * if (maybeLogger) { + * maybeLogger->log("hello"); + * } + * + * Optional enables a 'null' value for types which do not otherwise have + * nullability, especially useful for parameter passing: + * + * void testIterator(const unique_ptr& it, + * initializer_list idsExpected, + * Optional> ranksExpected = none) { + * for (int i = 0; it->next(); ++i) { + * EXPECT_EQ(it->doc().id(), idsExpected[i]); + * if (ranksExpected) { + * EXPECT_EQ(it->doc().rank(), (*ranksExpected)[i]); + * } + * } + * } + * + * Optional models OptionalPointee, so calling 'get_pointer(opt)' will return a + * pointer to nullptr if the 'opt' is empty, and a pointer to the value if it is + * not: + * + * Optional maybeInt = ...; + * if (int* v = get_pointer(maybeInt)) { + * cout << *v << endl; + * } + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace folly { + +template +class Optional; + +namespace detail { +template +struct OptionalPromiseReturn; +} // namespace detail + +struct None { + enum class _secret { _token }; + + /** + * No default constructor to support both `op = {}` and `op = none` + * as syntax for clearing an Optional, just like std::nullopt_t. + */ + constexpr explicit None(_secret) {} +}; +constexpr None none{None::_secret::_token}; + +class FOLLY_EXPORT OptionalEmptyException : public std::runtime_error { + public: + OptionalEmptyException() + : std::runtime_error("Empty Optional cannot be unwrapped") {} +}; + +template +class Optional { + public: + typedef Value value_type; + + static_assert( + !std::is_reference::value, + "Optional may not be used with reference types"); + static_assert( + !std::is_abstract::value, + "Optional may not be used with abstract types"); + + Optional() noexcept {} + + Optional(const Optional& src) noexcept( + std::is_nothrow_copy_constructible::value) { + if (src.hasValue()) { + construct(src.value()); + } + } + + Optional(Optional&& src) noexcept( + std::is_nothrow_move_constructible::value) { + if (src.hasValue()) { + construct(std::move(src.value())); + src.clear(); + } + } + + /* implicit */ Optional(const None&) noexcept {} + + /* implicit */ Optional(Value&& newValue) noexcept( + std::is_nothrow_move_constructible::value) { + construct(std::move(newValue)); + } + + /* implicit */ Optional(const Value& newValue) noexcept( + std::is_nothrow_copy_constructible::value) { + construct(newValue); + } + + template + explicit Optional(in_place_t, Args&&... args) noexcept( + std::is_nothrow_constructible::value) + : Optional{PrivateConstructor{}, std::forward(args)...} {} + + template + explicit Optional( + in_place_t, + std::initializer_list il, + Args&&... 
args) noexcept(std:: + is_nothrow_constructible< + Value, + std::initializer_list, + Args...>::value) + : Optional{PrivateConstructor{}, il, std::forward(args)...} {} + + // Used only when an Optional is used with coroutines on MSVC + /* implicit */ Optional(const detail::OptionalPromiseReturn& p) + : Optional{} { + p.promise_->value_ = this; + } + + void assign(const None&) { + clear(); + } + + void assign(Optional&& src) { + if (this != &src) { + if (src.hasValue()) { + assign(std::move(src.value())); + src.clear(); + } else { + clear(); + } + } + } + + void assign(const Optional& src) { + if (src.hasValue()) { + assign(src.value()); + } else { + clear(); + } + } + + void assign(Value&& newValue) { + if (hasValue()) { + storage_.value = std::move(newValue); + } else { + construct(std::move(newValue)); + } + } + + void assign(const Value& newValue) { + if (hasValue()) { + storage_.value = newValue; + } else { + construct(newValue); + } + } + + Optional& operator=(None) noexcept { + reset(); + return *this; + } + + template + Optional& operator=(Arg&& arg) { + assign(std::forward(arg)); + return *this; + } + + Optional& operator=(Optional&& other) noexcept( + std::is_nothrow_move_assignable::value) { + assign(std::move(other)); + return *this; + } + + Optional& operator=(const Optional& other) noexcept( + std::is_nothrow_copy_assignable::value) { + assign(other); + return *this; + } + + template + Value& emplace(Args&&... args) { + clear(); + construct(std::forward(args)...); + return value(); + } + + template + typename std::enable_if< + std::is_constructible&, Args&&...>::value, + Value&>::type + emplace(std::initializer_list ilist, Args&&... args) { + clear(); + construct(ilist, std::forward(args)...); + return value(); + } + + void reset() noexcept { + storage_.clear(); + } + + void clear() noexcept { + reset(); + } + + void swap(Optional& that) noexcept(IsNothrowSwappable::value) { + if (hasValue() && that.hasValue()) { + using std::swap; + swap(value(), that.value()); + } else if (hasValue()) { + that.emplace(std::move(value())); + reset(); + } else if (that.hasValue()) { + emplace(std::move(that.value())); + that.reset(); + } + } + + const Value& value() const& { + require_value(); + return storage_.value; + } + + Value& value() & { + require_value(); + return storage_.value; + } + + Value&& value() && { + require_value(); + return std::move(storage_.value); + } + + const Value&& value() const&& { + require_value(); + return std::move(storage_.value); + } + + const Value* get_pointer() const& { + return storage_.hasValue ? &storage_.value : nullptr; + } + Value* get_pointer() & { + return storage_.hasValue ? &storage_.value : nullptr; + } + Value* get_pointer() && = delete; + + bool has_value() const noexcept { + return storage_.hasValue; + } + + bool hasValue() const noexcept { + return has_value(); + } + + explicit operator bool() const noexcept { + return has_value(); + } + + const Value& operator*() const& { + return value(); + } + Value& operator*() & { + return value(); + } + const Value&& operator*() const&& { + return std::move(value()); + } + Value&& operator*() && { + return std::move(value()); + } + + const Value* operator->() const { + return &value(); + } + Value* operator->() { + return &value(); + } + + // Return a copy of the value if set, or a given default if not. 
+ template + Value value_or(U&& dflt) const& { + if (storage_.hasValue) { + return storage_.value; + } + + return std::forward(dflt); + } + + template + Value value_or(U&& dflt) && { + if (storage_.hasValue) { + return std::move(storage_.value); + } + + return std::forward(dflt); + } + + private: + template + friend Optional<_t>> make_optional(T&&); + template + friend Optional make_optional(Args&&... args); + template + friend Optional make_optional(std::initializer_list, As&&...); + + /** + * Construct the optional in place, this is duplicated as a non-explicit + * constructor to allow returning values that are non-movable from + * make_optional using list initialization. + * + * Until C++17, at which point this will become unnecessary because of + * specified prvalue elision. + */ + struct PrivateConstructor { + explicit PrivateConstructor() = default; + }; + template + Optional(PrivateConstructor, Args&&... args) noexcept( + std::is_constructible::value) { + construct(std::forward(args)...); + } + + void require_value() const { + if (!storage_.hasValue) { + throw OptionalEmptyException{}; + } + } + + template + void construct(Args&&... args) { + const void* ptr = &storage_.value; + // For supporting const types. + new (const_cast(ptr)) Value(std::forward(args)...); + storage_.hasValue = true; + } + + struct StorageTriviallyDestructible { + union { + char emptyState; + Value value; + }; + bool hasValue; + + StorageTriviallyDestructible() + : emptyState('\0'), hasValue{false} {} + void clear() { + hasValue = false; + } + }; + + struct StorageNonTriviallyDestructible { + union { + char emptyState; + Value value; + }; + bool hasValue; + + StorageNonTriviallyDestructible() : hasValue{false} {} + ~StorageNonTriviallyDestructible() { + clear(); + } + + void clear() { + if (hasValue) { + hasValue = false; + value.~Value(); + } + } + }; + + using Storage = typename std::conditional< + std::is_trivially_destructible::value, + StorageTriviallyDestructible, + StorageNonTriviallyDestructible>::type; + + Storage storage_; +}; + +template +const T* get_pointer(const Optional& opt) { + return opt.get_pointer(); +} + +template +T* get_pointer(Optional& opt) { + return opt.get_pointer(); +} + +template +void swap(Optional& a, Optional& b) noexcept(noexcept(a.swap(b))) { + a.swap(b); +} + +template +Optional<_t>> make_optional(T&& v) { + using PrivateConstructor = + typename folly::Optional<_t>>::PrivateConstructor; + return {PrivateConstructor{}, std::forward(v)}; +} + +template +folly::Optional make_optional(Args&&... args) { + using PrivateConstructor = typename folly::Optional::PrivateConstructor; + return {PrivateConstructor{}, std::forward(args)...}; +} + +template +folly::Optional make_optional( + std::initializer_list il, + Args&&... args) { + using PrivateConstructor = typename folly::Optional::PrivateConstructor; + return {PrivateConstructor{}, il, std::forward(args)...}; +} + +/////////////////////////////////////////////////////////////////////////////// +// Comparisons. 
+ +template +bool operator==(const Optional& a, const V& b) { + return a.hasValue() && a.value() == b; +} + +template +bool operator!=(const Optional& a, const V& b) { + return !(a == b); +} + +template +bool operator==(const U& a, const Optional& b) { + return b.hasValue() && b.value() == a; +} + +template +bool operator!=(const U& a, const Optional& b) { + return !(a == b); +} + +template +bool operator==(const Optional& a, const Optional& b) { + if (a.hasValue() != b.hasValue()) { + return false; + } + if (a.hasValue()) { + return a.value() == b.value(); + } + return true; +} + +template +bool operator!=(const Optional& a, const Optional& b) { + return !(a == b); +} + +template +bool operator<(const Optional& a, const Optional& b) { + if (a.hasValue() != b.hasValue()) { + return a.hasValue() < b.hasValue(); + } + if (a.hasValue()) { + return a.value() < b.value(); + } + return false; +} + +template +bool operator>(const Optional& a, const Optional& b) { + return b < a; +} + +template +bool operator<=(const Optional& a, const Optional& b) { + return !(b < a); +} + +template +bool operator>=(const Optional& a, const Optional& b) { + return !(a < b); +} + +// Suppress comparability of Optional with T, despite implicit conversion. +template +bool operator<(const Optional&, const V& other) = delete; +template +bool operator<=(const Optional&, const V& other) = delete; +template +bool operator>=(const Optional&, const V& other) = delete; +template +bool operator>(const Optional&, const V& other) = delete; +template +bool operator<(const V& other, const Optional&) = delete; +template +bool operator<=(const V& other, const Optional&) = delete; +template +bool operator>=(const V& other, const Optional&) = delete; +template +bool operator>(const V& other, const Optional&) = delete; + +// Comparisons with none +template +bool operator==(const Optional& a, None) noexcept { + return !a.hasValue(); +} +template +bool operator==(None, const Optional& a) noexcept { + return !a.hasValue(); +} +template +bool operator<(const Optional&, None) noexcept { + return false; +} +template +bool operator<(None, const Optional& a) noexcept { + return a.hasValue(); +} +template +bool operator>(const Optional& a, None) noexcept { + return a.hasValue(); +} +template +bool operator>(None, const Optional&) noexcept { + return false; +} +template +bool operator<=(None, const Optional&) noexcept { + return true; +} +template +bool operator<=(const Optional& a, None) noexcept { + return !a.hasValue(); +} +template +bool operator>=(const Optional&, None) noexcept { + return true; +} +template +bool operator>=(None, const Optional& a) noexcept { + return !a.hasValue(); +} + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace folly diff --git a/third-party/folly/folly/Portability.h b/third-party/folly/folly/Portability.h new file mode 100644 index 00000000000..61c05ff2254 --- /dev/null +++ b/third-party/folly/folly/Portability.h @@ -0,0 +1,84 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
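A short usage sketch of the Optional added above; the include path is an assumption about how third-party/folly lands on the include path, and the function is hypothetical.

#include <iostream>
#include <string>
#include "folly/Optional.h"  // assumed include path

folly::Optional<std::string> FindUser(int id) {
  if (id == 42) {
    return std::string("rocksdb");  // implicit construction from a value
  }
  return folly::none;               // empty state, analogous to std::nullopt
}

int main() {
  folly::Optional<std::string> u = FindUser(42);
  if (u) {                          // explicit operator bool
    std::cout << *u << "\n";        // operator* returns the contained value
  }
  std::cout << FindUser(7).value_or("unknown") << "\n";  // default when empty
  u = folly::none;                  // clears the Optional
  return 0;
}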
+ +#pragma once + +#include + +#if defined(__arm__) +#define FOLLY_ARM 1 +#else +#define FOLLY_ARM 0 +#endif + +#if defined(__x86_64__) || defined(_M_X64) +#define FOLLY_X64 1 +#else +#define FOLLY_X64 0 +#endif + +#if defined(__aarch64__) +#define FOLLY_AARCH64 1 +#else +#define FOLLY_AARCH64 0 +#endif + +#if defined(__powerpc64__) +#define FOLLY_PPC64 1 +#else +#define FOLLY_PPC64 0 +#endif + +#if defined(__has_builtin) +#define FOLLY_HAS_BUILTIN(...) __has_builtin(__VA_ARGS__) +#else +#define FOLLY_HAS_BUILTIN(...) 0 +#endif + +#if defined(__has_cpp_attribute) +#if __has_cpp_attribute(nodiscard) +#define FOLLY_NODISCARD [[nodiscard]] +#endif +#endif +#if !defined FOLLY_NODISCARD +#if defined(_MSC_VER) && (_MSC_VER >= 1700) +#define FOLLY_NODISCARD _Check_return_ +#elif defined(__GNUC__) +#define FOLLY_NODISCARD __attribute__((__warn_unused_result__)) +#else +#define FOLLY_NODISCARD +#endif +#endif + +namespace folly { +constexpr bool kIsArchArm = FOLLY_ARM == 1; +constexpr bool kIsArchAmd64 = FOLLY_X64 == 1; +constexpr bool kIsArchAArch64 = FOLLY_AARCH64 == 1; +constexpr bool kIsArchPPC64 = FOLLY_PPC64 == 1; +} // namespace folly + +namespace folly { +#ifdef NDEBUG +constexpr auto kIsDebug = false; +#else +constexpr auto kIsDebug = true; +#endif +} // namespace folly + +namespace folly { +#if defined(_MSC_VER) +constexpr bool kIsMsvc = true; +#else +constexpr bool kIsMsvc = false; +#endif +} // namespace folly + +namespace folly { +#if FOLLY_SANITIZE_THREAD +constexpr bool kIsSanitizeThread = true; +#else +constexpr bool kIsSanitizeThread = false; +#endif +} // namespace folly diff --git a/third-party/folly/folly/ScopeGuard.h b/third-party/folly/folly/ScopeGuard.h new file mode 100644 index 00000000000..71134406303 --- /dev/null +++ b/third-party/folly/folly/ScopeGuard.h @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include +#include + +namespace folly { +namespace scope_guard_detail { +template +class ScopeGuardImpl { + public: + explicit ScopeGuardImpl(F&& f) : f_{std::forward(f)} {} + ~ScopeGuardImpl() { + f_(); + } + + private: + F f_; +}; + +enum class ScopeGuardEnum {}; +template >> +ScopeGuardImpl operator+(ScopeGuardEnum, Func&& func) { + return ScopeGuardImpl{std::forward(func)}; +} +} // namespace scope_guard_detail +} // namespace folly + +/** + * FB_ANONYMOUS_VARIABLE(str) introduces an identifier starting with + * str and ending with a number that varies with the line. + */ +#ifndef FB_ANONYMOUS_VARIABLE +#define FB_CONCATENATE_IMPL(s1, s2) s1##s2 +#define FB_CONCATENATE(s1, s2) FB_CONCATENATE_IMPL(s1, s2) +#ifdef __COUNTER__ +#define FB_ANONYMOUS_VARIABLE(str) \ + FB_CONCATENATE(FB_CONCATENATE(FB_CONCATENATE(str, __COUNTER__), _), __LINE__) +#else +#define FB_ANONYMOUS_VARIABLE(str) FB_CONCATENATE(str, __LINE__) +#endif +#endif + +#ifndef SCOPE_EXIT +#define SCOPE_EXIT \ + auto FB_ANONYMOUS_VARIABLE(SCOPE_EXIT_STATE) = \ + ::folly::scope_guard_detail::ScopeGuardEnum{} + [&]() noexcept +#endif diff --git a/third-party/folly/folly/Traits.h b/third-party/folly/folly/Traits.h new file mode 100644 index 00000000000..ea7e1eb1c05 --- /dev/null +++ b/third-party/folly/folly/Traits.h @@ -0,0 +1,152 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
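For the ScopeGuard.h header above, a small sketch of how SCOPE_EXIT is used in practice: the lambda body runs when the enclosing scope is left, on any exit path, and FB_ANONYMOUS_VARIABLE gives the guard object a unique name. The include path and the example function are assumptions.

#include <cstdio>
#include "folly/ScopeGuard.h"  // assumed include path

void WriteWithCleanup(const char* path) {
  std::FILE* f = std::fopen(path, "w");
  if (f == nullptr) {
    return;
  }
  SCOPE_EXIT {
    std::fclose(f);  // runs when this scope exits, on every path
  };
  std::fputs("hello\n", f);
  // Early returns past this point still close the file via the guard.
}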
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +namespace folly { + +#if !defined(_MSC_VER) +template +struct is_trivially_copyable + : std::integral_constant {}; +#else +template +using is_trivially_copyable = std::is_trivially_copyable; +#endif + +/*** + * _t + * + * Instead of: + * + * using decayed = typename std::decay::type; + * + * With the C++14 standard trait aliases, we could use: + * + * using decayed = std::decay_t; + * + * Without them, we could use: + * + * using decayed = _t>; + * + * Also useful for any other library with template types having dependent + * member types named `type`, like the standard trait types. + */ +template +using _t = typename T::type; + +/** + * type_t + * + * A type alias for the first template type argument. `type_t` is useful for + * controlling class-template and function-template partial specialization. + * + * Example: + * + * template + * class Container { + * public: + * template + * Container( + * type_t()...))>, + * Args&&...); + * }; + * + * void_t + * + * A type alias for `void`. `void_t` is useful for controling class-template + * and function-template partial specialization. + * + * Example: + * + * // has_value_type::value is true if T has a nested type `value_type` + * template + * struct has_value_type + * : std::false_type {}; + * + * template + * struct has_value_type> + * : std::true_type {}; + */ + +/** + * There is a bug in libstdc++, libc++, and MSVC's STL that causes it to + * ignore unused template parameter arguments in template aliases and does not + * cause substitution failures. This defect has been recorded here: + * http://open-std.org/JTC1/SC22/WG21/docs/cwg_defects.html#1558. + * + * This causes the implementation of std::void_t to be buggy, as it is likely + * defined as something like the following: + * + * template + * using void_t = void; + * + * This causes the compiler to ignore all the template arguments and does not + * help when one wants to cause substitution failures. Rather declarations + * which have void_t in orthogonal specializations are treated as the same. + * For example, assuming the possible `T` types are only allowed to have + * either the alias `one` or `two` and never both or none: + * + * template ::one>* = nullptr> + * void foo(T&&) {} + * template ::two>* = nullptr> + * void foo(T&&) {} + * + * The second foo() will be a redefinition because it conflicts with the first + * one; void_t does not cause substitution failures - the template types are + * just ignored. 
+ */ + +namespace traits_detail { +template +struct type_t_ { + using type = T; +}; +} // namespace traits_detail + +template +using type_t = typename traits_detail::type_t_::type; +template +using void_t = type_t; + +/** + * A type trait to remove all const volatile and reference qualifiers on a + * type T + */ +template +struct remove_cvref { + using type = + typename std::remove_cv::type>::type; +}; +template +using remove_cvref_t = typename remove_cvref::type; + +template +struct IsNothrowSwappable + : std::integral_constant< + bool, + std::is_nothrow_move_constructible::value&& noexcept( + std::swap(std::declval(), std::declval()))> {}; + +template +struct Conjunction : std::true_type {}; +template +struct Conjunction : T {}; +template +struct Conjunction + : std::conditional, T>::type {}; + +template +struct Negation : std::integral_constant {}; + +template +using index_constant = std::integral_constant; + +} // namespace folly diff --git a/third-party/folly/folly/Unit.h b/third-party/folly/folly/Unit.h new file mode 100644 index 00000000000..c8cb77e2c37 --- /dev/null +++ b/third-party/folly/folly/Unit.h @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +namespace folly { + +/// In functional programming, the degenerate case is often called "unit". In +/// C++, "void" is often the best analogue. However, because of the syntactic +/// special-casing required for void, it is frequently a liability for template +/// metaprogramming. So, instead of writing specializations to handle cases like +/// SomeContainer, a library author may instead rule that out and simply +/// have library users use SomeContainer. Contained values may be ignored. +/// Much easier. +/// +/// "void" is the type that admits of no values at all. It is not possible to +/// construct a value of this type. +/// "unit" is the type that admits of precisely one unique value. It is +/// possible to construct a value of this type, but it is always the same value +/// every time, so it is uninteresting. +struct Unit { + constexpr bool operator==(const Unit& /*other*/) const { + return true; + } + constexpr bool operator!=(const Unit& /*other*/) const { + return false; + } +}; + +constexpr Unit unit{}; + +template +struct lift_unit { + using type = T; +}; +template <> +struct lift_unit { + using type = Unit; +}; +template +using lift_unit_t = typename lift_unit::type; + +template +struct drop_unit { + using type = T; +}; +template <> +struct drop_unit { + using type = void; +}; +template +using drop_unit_t = typename drop_unit::type; + +} // namespace folly + diff --git a/third-party/folly/folly/Utility.h b/third-party/folly/folly/Utility.h new file mode 100644 index 00000000000..7e43bdc2f17 --- /dev/null +++ b/third-party/folly/folly/Utility.h @@ -0,0 +1,141 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
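A sketch of the detection idiom that the void_t discussion above is about, mirroring the has_value_type example from the header comment; because folly::void_t is routed through traits_detail::type_t_, the nested-type check participates in substitution and SFINAE behaves as intended. The include path is an assumption.

#include <type_traits>
#include <vector>
#include "folly/Traits.h"  // assumed include path

// true iff T has a nested `value_type`.
template <typename T, typename = void>
struct has_value_type : std::false_type {};

template <typename T>
struct has_value_type<T, folly::void_t<typename T::value_type>>
    : std::true_type {};

static_assert(has_value_type<std::vector<int>>::value, "");
static_assert(!has_value_type<int>::value, "");

// remove_cvref strips reference and cv-qualifiers in one step.
static_assert(
    std::is_same<folly::remove_cvref_t<const int&>, int>::value, "");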
+ +#pragma once + +#include +#include + +namespace folly { + +/** + * Backports from C++17 of: + * std::in_place_t + * std::in_place_type_t + * std::in_place_index_t + * std::in_place + * std::in_place_type + * std::in_place_index + */ + +struct in_place_tag {}; +template +struct in_place_type_tag {}; +template +struct in_place_index_tag {}; + +using in_place_t = in_place_tag (&)(in_place_tag); +template +using in_place_type_t = in_place_type_tag (&)(in_place_type_tag); +template +using in_place_index_t = in_place_index_tag (&)(in_place_index_tag); + +inline in_place_tag in_place(in_place_tag = {}) { + return {}; +} +template +inline in_place_type_tag in_place_type(in_place_type_tag = {}) { + return {}; +} +template +inline in_place_index_tag in_place_index(in_place_index_tag = {}) { + return {}; +} + +template +T exchange(T& obj, U&& new_value) { + T old_value = std::move(obj); + obj = std::forward(new_value); + return old_value; +} + +namespace utility_detail { +template +struct make_seq_cat; +template < + template class S, + typename T, + T... Ta, + T... Tb, + T... Tc> +struct make_seq_cat, S, S> { + using type = + S; +}; + +// Not parameterizing by `template class, typename` because +// clang precisely v4.0 fails to compile that. Note that clang v3.9 and v5.0 +// handle that code correctly. +// +// For this to work, `S0` is required to be `Sequence` and `S1` is required +// to be `Sequence`. + +template +struct make_seq { + template + using apply = typename make_seq_cat< + typename make_seq::template apply, + typename make_seq::template apply, + typename make_seq::template apply>::type; +}; +template <> +struct make_seq<1> { + template + using apply = S1; +}; +template <> +struct make_seq<0> { + template + using apply = S0; +}; +} // namespace utility_detail + +// TODO: Remove after upgrading to C++14 baseline + +template +struct integer_sequence { + using value_type = T; + + static constexpr std::size_t size() noexcept { + return sizeof...(Ints); + } +}; + +template +using index_sequence = integer_sequence; + +template +using make_integer_sequence = typename utility_detail::make_seq< + Size>::template apply, integer_sequence>; + +template +using make_index_sequence = make_integer_sequence; +template +using index_sequence_for = make_index_sequence; + +/** + * A simple helper for getting a constant reference to an object. + * + * Example: + * + * std::vector v{1,2,3}; + * // The following two lines are equivalent: + * auto a = const_cast&>(v).begin(); + * auto b = folly::as_const(v).begin(); + * + * Like C++17's std::as_const. See http://wg21.link/p0007 + */ +template +T const& as_const(T& t) noexcept { + return t; +} + +template +void as_const(T const&&) = delete; + +} // namespace folly diff --git a/third-party/folly/folly/chrono/Hardware.h b/third-party/folly/folly/chrono/Hardware.h new file mode 100644 index 00000000000..ec7be82e8be --- /dev/null +++ b/third-party/folly/folly/chrono/Hardware.h @@ -0,0 +1,33 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
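A minimal sketch of two of the backports in Utility.h above, assuming the same include-path convention as the other folly headers: folly::exchange swaps in a new value and hands back the old one, and folly::as_const yields a const reference without spelling out a const_cast.

#include <cassert>
#include <string>
#include <vector>
#include "folly/Utility.h"  // assumed include path

int main() {
  // folly::exchange: set a new value, get the old one back (like std::exchange).
  std::string cur = "old";
  std::string prev = folly::exchange(cur, std::string("new"));
  assert(prev == "old" && cur == "new");

  // folly::as_const: a const view of a mutable object.
  std::vector<int> v{1, 2, 3};
  const std::vector<int>& cv = folly::as_const(v);
  assert(cv.size() == 3);
  return 0;
}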
+ +#pragma once + +#include + +#include +#include + +#if _MSC_VER +extern "C" std::uint64_t __rdtsc(); +#pragma intrinsic(__rdtsc) +#endif + +namespace folly { + +inline std::uint64_t hardware_timestamp() { +#if _MSC_VER + return __rdtsc(); +#elif __GNUC__ && (__i386__ || FOLLY_X64) + return __builtin_ia32_rdtsc(); +#else + // use steady_clock::now() as an approximation for the timestamp counter on + // non-x86 systems + return std::chrono::steady_clock::now().time_since_epoch().count(); +#endif +} + +} // namespace folly + diff --git a/third-party/folly/folly/container/Array.h b/third-party/folly/folly/container/Array.h new file mode 100644 index 00000000000..bb3167b9793 --- /dev/null +++ b/third-party/folly/folly/container/Array.h @@ -0,0 +1,74 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include +#include + +namespace folly { + +namespace array_detail { +template +struct is_ref_wrapper : std::false_type {}; +template +struct is_ref_wrapper> : std::true_type {}; + +template +using not_ref_wrapper = + folly::Negation::type>>; + +template +struct return_type_helper { + using type = D; +}; +template +struct return_type_helper { + static_assert( + folly::Conjunction...>::value, + "TList cannot contain reference_wrappers when D is void"); + using type = typename std::common_type::type; +}; + +template +using return_type = std:: + array::type, sizeof...(TList)>; +} // namespace array_detail + +template +constexpr array_detail::return_type make_array(TList&&... t) { + using value_type = + typename array_detail::return_type_helper::type; + return {{static_cast(std::forward(t))...}}; +} + +namespace array_detail { +template +inline constexpr auto make_array_with( + MakeItem const& make, + folly::index_sequence) + -> std::array { + return std::array{{make(Index)...}}; +} +} // namespace array_detail + +// make_array_with +// +// Constructs a std::array<..., Size> with elements m(i) for i in [0, Size). +template +constexpr auto make_array_with(MakeItem const& make) + -> decltype(array_detail::make_array_with( + make, + folly::make_index_sequence{})) { + return array_detail::make_array_with( + make, + folly::make_index_sequence{}); +} + +} // namespace folly diff --git a/third-party/folly/folly/detail/Futex-inl.h b/third-party/folly/folly/detail/Futex-inl.h new file mode 100644 index 00000000000..3b2a412bfb6 --- /dev/null +++ b/third-party/folly/folly/detail/Futex-inl.h @@ -0,0 +1,117 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +namespace folly { +namespace detail { + +/** Optimal when TargetClock is the same type as Clock. + * + * Otherwise, both Clock::now() and TargetClock::now() must be invoked. 
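A short sketch of the two helpers in container/Array.h above: make_array deduces the element type through std::common_type unless it is named explicitly, and make_array_with builds the array from a callable applied to each index. The include path is an assumption.

#include <cstddef>
#include <string>
#include "folly/container/Array.h"  // assumed include path

int main() {
  // Element type deduced as double via std::common_type of the arguments.
  auto nums = folly::make_array(1.0, 2, 3.5f);               // std::array<double, 3>

  // Element type named explicitly.
  auto strs = folly::make_array<std::string>("foo", "bar");  // std::array<std::string, 2>

  // make_array_with<N>(f) builds {f(0), f(1), ..., f(N - 1)}.
  auto squares =
      folly::make_array_with<4>([](std::size_t i) { return i * i; });

  (void)nums;
  (void)strs;
  (void)squares;
  return 0;
}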
*/ +template +typename TargetClock::time_point time_point_conv( + std::chrono::time_point const& time) { + using std::chrono::duration_cast; + using TimePoint = std::chrono::time_point; + using TargetDuration = typename TargetClock::duration; + using TargetTimePoint = typename TargetClock::time_point; + if (time == TimePoint::max()) { + return TargetTimePoint::max(); + } else if (std::is_same::value) { + // in place of time_point_cast, which cannot compile without if-constexpr + auto const delta = time.time_since_epoch(); + return TargetTimePoint(duration_cast(delta)); + } else { + // different clocks with different epochs, so non-optimal case + auto const delta = time - Clock::now(); + return TargetClock::now() + duration_cast(delta); + } +} + +/** + * Available overloads, with definitions elsewhere + * + * These functions are treated as ADL-extension points, the templates above + * call these functions without them having being pre-declared. This works + * because ADL lookup finds the definitions of these functions when you pass + * the relevant arguments + */ +int futexWakeImpl( + const Futex* futex, + int count, + uint32_t wakeMask); +FutexResult futexWaitImpl( + const Futex* futex, + uint32_t expected, + std::chrono::system_clock::time_point const* absSystemTime, + std::chrono::steady_clock::time_point const* absSteadyTime, + uint32_t waitMask); + +int futexWakeImpl( + const Futex* futex, + int count, + uint32_t wakeMask); +FutexResult futexWaitImpl( + const Futex* futex, + uint32_t expected, + std::chrono::system_clock::time_point const* absSystemTime, + std::chrono::steady_clock::time_point const* absSteadyTime, + uint32_t waitMask); + +template +typename std::enable_if::type +futexWaitImpl( + Futex* futex, + uint32_t expected, + Deadline const& deadline, + uint32_t waitMask) { + return futexWaitImpl(futex, expected, nullptr, &deadline, waitMask); +} + +template +typename std::enable_if::type +futexWaitImpl( + Futex* futex, + uint32_t expected, + Deadline const& deadline, + uint32_t waitMask) { + return futexWaitImpl(futex, expected, &deadline, nullptr, waitMask); +} + +template +FutexResult +futexWait(const Futex* futex, uint32_t expected, uint32_t waitMask) { + auto rv = futexWaitImpl(futex, expected, nullptr, nullptr, waitMask); + assert(rv != FutexResult::TIMEDOUT); + return rv; +} + +template +int futexWake(const Futex* futex, int count, uint32_t wakeMask) { + return futexWakeImpl(futex, count, wakeMask); +} + +template +FutexResult futexWaitUntil( + const Futex* futex, + uint32_t expected, + std::chrono::time_point const& deadline, + uint32_t waitMask) { + using Target = typename std::conditional< + Clock::is_steady, + std::chrono::steady_clock, + std::chrono::system_clock>::type; + auto const converted = time_point_conv(deadline); + return converted == Target::time_point::max() + ? futexWaitImpl(futex, expected, nullptr, nullptr, waitMask) + : futexWaitImpl(futex, expected, converted, waitMask); +} + +} // namespace detail +} // namespace folly diff --git a/third-party/folly/folly/detail/Futex.cpp b/third-party/folly/folly/detail/Futex.cpp new file mode 100644 index 00000000000..62d6ea2b201 --- /dev/null +++ b/third-party/folly/folly/detail/Futex.cpp @@ -0,0 +1,263 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
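A hedged sketch of a one-shot event built on the wrappers defined in Futex-inl.h above. It assumes the Futex alias resolves to std::atomic over a 32-bit unsigned integer for the default Atom (as in upstream folly), that an all-ones wait mask matches any wake mask, and that the header is reached as folly/detail/Futex.h; these functions live in the internal folly::detail namespace and are shown only to illustrate the call shape.

#include <atomic>
#include <cstdint>
#include <thread>
#include "folly/detail/Futex.h"  // assumed include path

int main() {
  folly::detail::Futex<std::atomic> ready{0};         // an atomic 32-bit value
  constexpr std::uint32_t kMask = ~std::uint32_t(0);  // match any wake mask

  std::thread waiter([&] {
    // Re-check in a loop: the wait can return early on a spurious wake, a
    // signal, or when the value no longer matches `expected`.
    while (ready.load(std::memory_order_acquire) == 0) {
      folly::detail::futexWait(&ready, /*expected=*/0, kMask);
    }
  });

  ready.store(1, std::memory_order_release);
  folly::detail::futexWake(&ready, /*count=*/1, kMask);
  waiter.join();
  return 0;
}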
+ +#include +#include +#include +#include +#include +#include + +#include + +#ifdef __linux__ +#include +#endif + +#ifndef _WIN32 +#include +#endif + +using namespace std::chrono; + +namespace folly { +namespace detail { + +namespace { + +//////////////////////////////////////////////////// +// native implementation using the futex() syscall + +#ifdef __linux__ + +/// Certain toolchains (like Android's) don't include the full futex API in +/// their headers even though they support it. Make sure we have our constants +/// even if the headers don't have them. +#ifndef FUTEX_WAIT_BITSET +#define FUTEX_WAIT_BITSET 9 +#endif +#ifndef FUTEX_WAKE_BITSET +#define FUTEX_WAKE_BITSET 10 +#endif +#ifndef FUTEX_PRIVATE_FLAG +#define FUTEX_PRIVATE_FLAG 128 +#endif +#ifndef FUTEX_CLOCK_REALTIME +#define FUTEX_CLOCK_REALTIME 256 +#endif + +int nativeFutexWake(const void* addr, int count, uint32_t wakeMask) { + long rv = syscall( + __NR_futex, + addr, /* addr1 */ + FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG, /* op */ + count, /* val */ + nullptr, /* timeout */ + nullptr, /* addr2 */ + wakeMask); /* val3 */ + + /* NOTE: we ignore errors on wake for the case of a futex + guarding its own destruction, similar to this + glibc bug with sem_post/sem_wait: + https://sourceware.org/bugzilla/show_bug.cgi?id=12674 */ + if (rv < 0) { + return 0; + } + return static_cast(rv); +} + +template +struct timespec timeSpecFromTimePoint(time_point absTime) { + auto epoch = absTime.time_since_epoch(); + if (epoch.count() < 0) { + // kernel timespec_valid requires non-negative seconds and nanos in [0,1G) + epoch = Clock::duration::zero(); + } + + // timespec-safe seconds and nanoseconds; + // chrono::{nano,}seconds are `long long int` + // whereas timespec uses smaller types + using time_t_seconds = duration; + using long_nanos = duration; + + auto secs = duration_cast(epoch); + auto nanos = duration_cast(epoch - secs); + struct timespec result = {secs.count(), nanos.count()}; + return result; +} + +FutexResult nativeFutexWaitImpl( + const void* addr, + uint32_t expected, + system_clock::time_point const* absSystemTime, + steady_clock::time_point const* absSteadyTime, + uint32_t waitMask) { + assert(absSystemTime == nullptr || absSteadyTime == nullptr); + + int op = FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG; + struct timespec ts; + struct timespec* timeout = nullptr; + + if (absSystemTime != nullptr) { + op |= FUTEX_CLOCK_REALTIME; + ts = timeSpecFromTimePoint(*absSystemTime); + timeout = &ts; + } else if (absSteadyTime != nullptr) { + ts = timeSpecFromTimePoint(*absSteadyTime); + timeout = &ts; + } + + // Unlike FUTEX_WAIT, FUTEX_WAIT_BITSET requires an absolute timeout + // value - http://locklessinc.com/articles/futex_cheat_sheet/ + long rv = syscall( + __NR_futex, + addr, /* addr1 */ + op, /* op */ + expected, /* val */ + timeout, /* timeout */ + nullptr, /* addr2 */ + waitMask); /* val3 */ + + if (rv == 0) { + return FutexResult::AWOKEN; + } else { + switch (errno) { + case ETIMEDOUT: + assert(timeout != nullptr); + return FutexResult::TIMEDOUT; + case EINTR: + return FutexResult::INTERRUPTED; + case EWOULDBLOCK: + return FutexResult::VALUE_CHANGED; + default: + assert(false); + // EINVAL, EACCESS, or EFAULT. EINVAL means there was an invalid + // op (should be impossible) or an invalid timeout (should have + // been sanitized by timeSpecFromTimePoint). EACCESS or EFAULT + // means *addr points to invalid memory, which is unlikely because + // the caller should have segfaulted already. 
We can either + // crash, or return a value that lets the process continue for + // a bit. We choose the latter. VALUE_CHANGED probably turns the + // caller into a spin lock. + return FutexResult::VALUE_CHANGED; + } + } +} + +#endif // __linux__ + +/////////////////////////////////////////////////////// +// compatibility implementation using standard C++ API + +using Lot = ParkingLot; +Lot parkingLot; + +int emulatedFutexWake(const void* addr, int count, uint32_t waitMask) { + int woken = 0; + parkingLot.unpark(addr, [&](const uint32_t& mask) { + if ((mask & waitMask) == 0) { + return UnparkControl::RetainContinue; + } + assert(count > 0); + count--; + woken++; + return count > 0 ? UnparkControl::RemoveContinue + : UnparkControl::RemoveBreak; + }); + return woken; +} + +template +FutexResult emulatedFutexWaitImpl( + F* futex, + uint32_t expected, + system_clock::time_point const* absSystemTime, + steady_clock::time_point const* absSteadyTime, + uint32_t waitMask) { + static_assert( + std::is_same>::value || + std::is_same>::value, + "Type F must be either Futex or Futex"); + ParkResult res; + if (absSystemTime) { + res = parkingLot.park_until( + futex, + waitMask, + [&] { return *futex == expected; }, + [] {}, + *absSystemTime); + } else if (absSteadyTime) { + res = parkingLot.park_until( + futex, + waitMask, + [&] { return *futex == expected; }, + [] {}, + *absSteadyTime); + } else { + res = parkingLot.park( + futex, waitMask, [&] { return *futex == expected; }, [] {}); + } + switch (res) { + case ParkResult::Skip: + return FutexResult::VALUE_CHANGED; + case ParkResult::Unpark: + return FutexResult::AWOKEN; + case ParkResult::Timeout: + return FutexResult::TIMEDOUT; + } + + return FutexResult::INTERRUPTED; +} + +} // namespace + +///////////////////////////////// +// Futex<> overloads + +int futexWakeImpl( + const Futex* futex, + int count, + uint32_t wakeMask) { +#ifdef __linux__ + return nativeFutexWake(futex, count, wakeMask); +#else + return emulatedFutexWake(futex, count, wakeMask); +#endif +} + +int futexWakeImpl( + const Futex* futex, + int count, + uint32_t wakeMask) { + return emulatedFutexWake(futex, count, wakeMask); +} + +FutexResult futexWaitImpl( + const Futex* futex, + uint32_t expected, + system_clock::time_point const* absSystemTime, + steady_clock::time_point const* absSteadyTime, + uint32_t waitMask) { +#ifdef __linux__ + return nativeFutexWaitImpl( + futex, expected, absSystemTime, absSteadyTime, waitMask); +#else + return emulatedFutexWaitImpl( + futex, expected, absSystemTime, absSteadyTime, waitMask); +#endif +} + +FutexResult futexWaitImpl( + const Futex* futex, + uint32_t expected, + system_clock::time_point const* absSystemTime, + steady_clock::time_point const* absSteadyTime, + uint32_t waitMask) { + return emulatedFutexWaitImpl( + futex, expected, absSystemTime, absSteadyTime, waitMask); +} + +} // namespace detail +} // namespace folly diff --git a/third-party/folly/folly/detail/Futex.h b/third-party/folly/folly/detail/Futex.h new file mode 100644 index 00000000000..987a1b89574 --- /dev/null +++ b/third-party/folly/folly/detail/Futex.h @@ -0,0 +1,96 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
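One more hedged sketch, this time of a timed wait with futexWaitUntil from Futex-inl.h above, handling the FutexResult values declared just below. A steady_clock deadline is forwarded to the steady-clock overload via time_point_conv; the include path and mask value are assumptions.

#include <atomic>
#include <chrono>
#include <cstdint>
#include "folly/detail/Futex.h"  // assumed include path

// Returns true if `flag` became non-zero before the deadline expired.
bool WaitBriefly(folly::detail::Futex<std::atomic>& flag) {
  constexpr std::uint32_t kMask = ~std::uint32_t(0);
  auto deadline =
      std::chrono::steady_clock::now() + std::chrono::milliseconds(10);
  while (flag.load(std::memory_order_acquire) == 0) {
    auto res =
        folly::detail::futexWaitUntil(&flag, /*expected=*/0, deadline, kMask);
    if (res == folly::detail::FutexResult::TIMEDOUT) {
      return false;
    }
    // AWOKEN, INTERRUPTED, or VALUE_CHANGED: re-check the value, maybe wait again.
  }
  return true;
}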
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace folly { +namespace detail { + +enum class FutexResult { + VALUE_CHANGED, /* futex value didn't match expected */ + AWOKEN, /* wakeup by matching futex wake, or spurious wakeup */ + INTERRUPTED, /* wakeup by interrupting signal */ + TIMEDOUT, /* wakeup by expiring deadline */ +}; + +/** + * Futex is an atomic 32 bit unsigned integer that provides access to the + * futex() syscall on that value. It is templated in such a way that it + * can interact properly with DeterministicSchedule testing. + * + * If you don't know how to use futex(), you probably shouldn't be using + * this class. Even if you do know how, you should have a good reason + * (and benchmarks to back you up). + * + * Because of the semantics of the futex syscall, the futex family of + * functions are available as free functions rather than member functions + */ +template