diff --git a/CMakeLists.txt b/CMakeLists.txt index d2e12b1e917..41ecf112d94 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -666,7 +666,29 @@ set(SOURCES utilities/ttl/db_ttl_impl.cc utilities/write_batch_with_index/write_batch_with_index.cc utilities/write_batch_with_index/write_batch_with_index_internal.cc - $) + utilities/titandb/blob_file_builder.cc + utilities/titandb/blob_file_cache.cc + utilities/titandb/blob_file_iterator.cc + utilities/titandb/blob_file_reader.cc + utilities/titandb/blob_file_size_collector.cc + utilities/titandb/blob_format.cc + utilities/titandb/blob_gc.cc + utilities/titandb/blob_gc_job.cc + utilities/titandb/blob_gc_picker.cc + utilities/titandb/db.cc + utilities/titandb/db_impl.cc + utilities/titandb/db_impl_files.cc + utilities/titandb/db_impl_gc.cc + utilities/titandb/options.cc + utilities/titandb/table_builder.cc + utilities/titandb/table_factory.cc + utilities/titandb/util.cc + utilities/titandb/version.cc + utilities/titandb/version_builder.cc + utilities/titandb/version_edit.cc + utilities/titandb/version_set.cc + $ + ) if(HAVE_SSE42 AND NOT MSVC) set_source_files_properties( @@ -965,6 +987,16 @@ if(WITH_TESTS) utilities/transactions/write_unprepared_transaction_test.cc utilities/ttl/ttl_test.cc utilities/write_batch_with_index/write_batch_with_index_test.cc + utilities/titandb/blob_file_iterator_test.cc + utilities/titandb/blob_file_size_collector_test.cc + utilities/titandb/blob_file_test.cc + utilities/titandb/blob_format_test.cc + utilities/titandb/blob_gc_job_test.cc + utilities/titandb/blob_gc_picker_test.cc + utilities/titandb/table_builder_test.cc + utilities/titandb/titan_db_test.cc + utilities/titandb/util_test.cc + utilities/titandb/version_test.cc ) if(WITH_LIBRADOS) list(APPEND TESTS utilities/env_librados_test.cc) diff --git a/Makefile b/Makefile index 9d1fcdd5c80..eac01c8f384 100644 --- a/Makefile +++ b/Makefile @@ -1006,7 +1006,7 @@ rocksdb.h rocksdb.cc: build_tools/amalgamate.py Makefile $(LIB_SOURCES) 
unity.cc build_tools/amalgamate.py -I. -i./include unity.cc -x include/rocksdb/c.h -H rocksdb.h -o rocksdb.cc clean: - rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(LIBRARY) $(SHARED) + rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(LIBRARY) $(SHARED) $(TITANDB_TESTS) rm -rf $(CLEAN_FILES) ios-x86 ios-arm scan_build_report $(FIND) . -name "*.[oda]" -exec rm -f {} \; $(FIND) . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; @@ -1530,6 +1530,64 @@ range_del_aggregator_test: db/range_del_aggregator_test.o db/db_test_util.o $(LI blob_db_test: utilities/blob_db/blob_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +TITANDB_TESTS = \ + titandb_blob_file_iterator_test \ + titandb_blob_file_size_collector_test \ + titandb_blob_file_test \ + titandb_blob_format_test \ + titandb_blob_gc_job_test \ + titandb_blob_gc_picker_test \ + titandb_db_test \ + titandb_table_builder_test \ + titandb_util_test \ + titandb_version_test \ + +titandb_blob_file_iterator_test: utilities/titandb/blob_file_iterator_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +titandb_blob_file_size_collector_test: utilities/titandb/blob_file_size_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +titandb_blob_file_test: utilities/titandb/blob_file_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +titandb_blob_format_test: utilities/titandb/blob_format_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +titandb_blob_gc_job_test: utilities/titandb/blob_gc_job_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +titandb_blob_gc_picker_test: utilities/titandb/blob_gc_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +titandb_db_test: utilities/titandb/titan_db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +titandb_table_builder_test: utilities/titandb/table_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +titandb_util_test: utilities/titandb/util_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +titandb_version_test: utilities/titandb/version_test.o $(LIBOBJECTS) 
$(TESTHARNESS) + $(AM_LINK) + +titandb_check: $(TITANDB_TESTS) + for t in $(TITANDB_TESTS); \ + do \ + echo "======== Running $$t ========"; \ + ./$$t || exit 1; \ + done; + +titandb_valgrind_check: $(TITANDB_TESTS) + for t in $(TITANDB_TESTS); do \ + $(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \ + code=$$?; \ + if [ $$code -ne 0 ]; then \ + exit $$code; \ + fi; \ + done; + #------------------------------------------------- # make install related stuff INSTALL_PATH ?= /usr/local diff --git a/src.mk b/src.mk index 4b6bb170f9f..22d1827047a 100644 --- a/src.mk +++ b/src.mk @@ -213,6 +213,27 @@ LIB_SOURCES = \ utilities/ttl/db_ttl_impl.cc \ utilities/write_batch_with_index/write_batch_with_index.cc \ utilities/write_batch_with_index/write_batch_with_index_internal.cc \ + utilities/titandb/blob_file_builder.cc \ + utilities/titandb/blob_file_cache.cc \ + utilities/titandb/blob_file_iterator.cc \ + utilities/titandb/blob_file_reader.cc \ + utilities/titandb/blob_file_size_collector.cc \ + utilities/titandb/blob_format.cc \ + utilities/titandb/blob_gc.cc \ + utilities/titandb/blob_gc_job.cc \ + utilities/titandb/blob_gc_picker.cc \ + utilities/titandb/db.cc \ + utilities/titandb/db_impl.cc \ + utilities/titandb/db_impl_files.cc \ + utilities/titandb/db_impl_gc.cc \ + utilities/titandb/options.cc \ + utilities/titandb/table_builder.cc \ + utilities/titandb/table_factory.cc \ + utilities/titandb/util.cc \ + utilities/titandb/version.cc \ + utilities/titandb/version_builder.cc \ + utilities/titandb/version_edit.cc \ + utilities/titandb/version_set.cc \ ifeq (,$(shell $(CXX) -fsyntax-only -maltivec -xc /dev/null 2>&1)) LIB_SOURCES_ASM =\ diff --git a/utilities/titandb/blob_file_builder.cc b/utilities/titandb/blob_file_builder.cc new file mode 100644 index 00000000000..e6be9d9b840 --- /dev/null +++ b/utilities/titandb/blob_file_builder.cc @@ -0,0 +1,57 @@ +#include "utilities/titandb/blob_file_builder.h" + +#include "util/crc32c.h" +#include "utilities/titandb/util.h" + +namespace 
rocksdb { +namespace titandb { + +// HEADER: 8 bytes length +// BODY: variable length +// TAIL: 5 bytes length +void BlobFileBuilder::Add(const BlobRecord& record, BlobHandle* handle) { + if (!ok()) return; + + buffer_.clear(); + assert(!record.key.empty()); + assert(!record.value.empty()); + record.EncodeTo(&buffer_); + + CompressionType compression = options_.blob_file_compression; + auto output = Compress(&compression, buffer_, &compressed_buffer_); + + uint64_t body_length = output.size(); + status_ = file_->Append( + Slice{reinterpret_cast(&body_length), kBlobHeaderSize}); + if (!ok()) return; + + handle->offset = file_->GetFileSize(); + handle->size = output.size(); + + status_ = file_->Append(output); + if (ok()) { + char tailer[kBlobTailerSize]; + tailer[0] = compression; + EncodeFixed32(tailer+1, crc32c::Value(output.data(), output.size())); + status_ = file_->Append(Slice(tailer, sizeof(tailer))); + } +} + +Status BlobFileBuilder::Finish() { + if (!ok()) return status(); + + BlobFileFooter footer; + buffer_.clear(); + footer.EncodeTo(&buffer_); + + status_ = file_->Append(buffer_); + if (ok()) { + status_ = file_->Flush(); + } + return status(); +} + +void BlobFileBuilder::Abandon() {} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/blob_file_builder.h b/utilities/titandb/blob_file_builder.h new file mode 100644 index 00000000000..d009b11914b --- /dev/null +++ b/utilities/titandb/blob_file_builder.h @@ -0,0 +1,69 @@ +#pragma once + +#include "util/file_reader_writer.h" +#include "utilities/titandb/blob_format.h" +#include "utilities/titandb/options.h" + +namespace rocksdb { +namespace titandb { + +// Blob file format: +// +// +// [blob record 1] +// [blob record 2] +// ... +// [blob record N] +// [meta block 1] +// [meta block 2] +// ... +// [meta block K] +// [meta index block] +// [footer] +// +// +// 1. The sequence of blob records in the file are stored in sorted +// order. 
These records come one after another at the beginning of the +// file, and are compressed according to the compression options. +// +// 2. After the blob records we store a bunch of meta blocks, and a +// meta index block with block handles pointed to the meta blocks. The +// meta block and the meta index block are formatted the same as the +// BlockBasedTable. + +class BlobFileBuilder { + public: + // Constructs a builder that will store the contents of the file it + // is building in "*file". Does not close the file. It is up to the + // caller to sync and close the file after calling Finish(). + BlobFileBuilder(const TitanCFOptions& options, WritableFileWriter* file) + : options_(options), file_(file) {} + + // Adds the record to the file and points the handle to it. + void Add(const BlobRecord& record, BlobHandle* handle); + + // Returns non-ok iff some error has been detected. + Status status() const { return status_; } + + // Finishes building the table. + // REQUIRES: Finish(), Abandon() have not been called. + Status Finish(); + + // Abandons building the table. If the caller is not going to call + // Finish(), it must call Abandon() before destroying this builder. + // REQUIRES: Finish(), Abandon() have not been called. 
+ void Abandon(); + + private: + bool ok() const { return status().ok(); } + + TitanCFOptions options_; + WritableFileWriter* file_; + + Status status_; + std::string buffer_; + std::string compressed_buffer_; +}; + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/blob_file_cache.cc b/utilities/titandb/blob_file_cache.cc new file mode 100644 index 00000000000..8647bca1019 --- /dev/null +++ b/utilities/titandb/blob_file_cache.cc @@ -0,0 +1,89 @@ +#include "utilities/titandb/blob_file_cache.h" + +#include "util/filename.h" +#include "util/file_reader_writer.h" +#include "utilities/titandb/util.h" + +namespace rocksdb { +namespace titandb { + +namespace { + +Slice EncodeFileNumber(const uint64_t* number) { + return Slice(reinterpret_cast(number), sizeof(*number)); +} + +} // namespace + +BlobFileCache::BlobFileCache(const TitanDBOptions& db_options, + const TitanCFOptions& cf_options, + std::shared_ptr cache) + : env_(db_options.env), + env_options_(db_options), + db_options_(db_options), + cf_options_(cf_options), + cache_(cache) {} + +Status BlobFileCache::Get(const ReadOptions& options, uint64_t file_number, + uint64_t file_size, const BlobHandle& handle, + BlobRecord* record, PinnableSlice* buffer) { + Cache::Handle* cache_handle = nullptr; + Status s = FindFile(file_number, file_size, &cache_handle); + if (!s.ok()) return s; + + auto reader = reinterpret_cast(cache_->Value(cache_handle)); + s = reader->Get(options, handle, record, buffer); + cache_->Release(cache_handle); + return s; +} + +Status BlobFileCache::NewPrefetcher( + uint64_t file_number, + uint64_t file_size, + std::unique_ptr* result) { + Cache::Handle* cache_handle = nullptr; + Status s = FindFile(file_number, file_size, &cache_handle); + if (!s.ok()) return s; + + auto reader = reinterpret_cast(cache_->Value(cache_handle)); + auto prefetcher = new BlobFilePrefetcher(reader); + prefetcher->RegisterCleanup(&UnrefCacheHandle, cache_.get(), cache_handle); + 
result->reset(prefetcher);
+  return s;
+}
+
+void BlobFileCache::Evict(uint64_t file_number) {
+  cache_->Erase(EncodeFileNumber(&file_number));
+}
+
+// Finds the reader for "file_number" in the cache, opening the blob file
+// and caching a new reader on a miss.
+//
+// On success returns OK and sets "*handle" to a cache handle that the
+// caller must release. On any failure (file cannot be opened, footer
+// cannot be decoded, or the cache refuses the entry) returns the error
+// and leaves "*handle" null.
+Status BlobFileCache::FindFile(uint64_t file_number,
+                               uint64_t file_size,
+                               Cache::Handle** handle) {
+  Status s;
+  Slice cache_key = EncodeFileNumber(&file_number);
+  *handle = cache_->Lookup(cache_key);
+  if (*handle) return s;
+
+  std::unique_ptr<RandomAccessFileReader> file;
+  {
+    std::unique_ptr<RandomAccessFile> f;
+    auto file_name = BlobFileName(db_options_.dirname, file_number);
+    s = env_->NewRandomAccessFile(file_name, &f, env_options_);
+    if (!s.ok()) return s;
+    if (db_options_.advise_random_on_open) {
+      f->Hint(RandomAccessFile::RANDOM);
+    }
+    file.reset(new RandomAccessFileReader(std::move(f), file_name));
+  }
+
+  std::unique_ptr<BlobFileReader> reader;
+  s = BlobFileReader::Open(cf_options_, std::move(file), file_size, &reader);
+  if (!s.ok()) return s;
+
+  // Insert() reports failure through its Status, and when a handle is
+  // requested the cache does not take ownership of the value on failure.
+  // Keep the reader owned by the unique_ptr until the insert has
+  // succeeded so it is neither leaked nor handed out with a null handle.
+  s = cache_->Insert(cache_key, reader.get(), 1,
+                     &DeleteCacheValue<BlobFileReader>, handle);
+  if (s.ok()) {
+    reader.release();
+  } else {
+    *handle = nullptr;
+  }
+  return s;
+}
+
+}  // namespace titandb
+}  // namespace rocksdb
diff --git a/utilities/titandb/blob_file_cache.h b/utilities/titandb/blob_file_cache.h
new file mode 100644
index 00000000000..c74f9194125
--- /dev/null
+++ b/utilities/titandb/blob_file_cache.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include "rocksdb/options.h"
+#include "utilities/titandb/blob_file_reader.h"
+#include "utilities/titandb/blob_format.h"
+#include "utilities/titandb/options.h"
+
+namespace rocksdb {
+namespace titandb {
+
+class BlobFileCache {
+ public:
+  // Constructs a blob file cache to cache opened files.
+  BlobFileCache(const TitanDBOptions& db_options,
+                const TitanCFOptions& cf_options, std::shared_ptr cache);
+
+  // Gets the blob record pointed by the handle in the specified file
+  // number. The corresponding file size must be exactly "file_size"
+  // bytes. The provided buffer is used to store the record data, so
+  // the buffer must be valid when the record is used.
+ Status Get(const ReadOptions& options, + uint64_t file_number, + uint64_t file_size, + const BlobHandle& handle, + BlobRecord* record, PinnableSlice* buffer); + + // Creates a prefetcher for the specified file number. + Status NewPrefetcher(uint64_t file_number, uint64_t file_size, + std::unique_ptr* result); + + // Evicts the file cache for the specified file number. + void Evict(uint64_t file_number); + + private: + // Finds the file for the specified file number. Opens the file if + // the file is not found in the cache and caches it. + // If successful, sets "*handle" to the cached file. + Status FindFile(uint64_t file_number, + uint64_t file_size, + Cache::Handle** handle); + + Env* env_; + EnvOptions env_options_; + TitanDBOptions db_options_; + TitanCFOptions cf_options_; + std::shared_ptr cache_; +}; + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/blob_file_iterator.cc b/utilities/titandb/blob_file_iterator.cc new file mode 100644 index 00000000000..666b0287d23 --- /dev/null +++ b/utilities/titandb/blob_file_iterator.cc @@ -0,0 +1,193 @@ +#include "utilities/titandb/blob_file_iterator.h" + +#include "util/crc32c.h" +#include "utilities/titandb/util.h" + +namespace rocksdb { +namespace titandb { + +BlobFileIterator::BlobFileIterator( + std::unique_ptr&& file, uint64_t file_name, + uint64_t file_size, const TitanCFOptions& titan_cf_options) + : file_(std::move(file)), + file_number_(file_name), + file_size_(file_size), + titan_cf_options_(titan_cf_options) {} + +BlobFileIterator::~BlobFileIterator() {} + +bool BlobFileIterator::Init() { + char buf[BlobFileFooter::kEncodedLength]; + Slice slice; + status_ = file_->Read(file_size_ - BlobFileFooter::kEncodedLength, + BlobFileFooter::kEncodedLength, &slice, buf); + if (!status_.ok()) return false; + BlobFileFooter blob_file_footer; + blob_file_footer.DecodeFrom(&slice); + total_blocks_size_ = file_size_ - BlobFileFooter::kEncodedLength - + 
blob_file_footer.meta_index_handle.size();
+  assert(total_blocks_size_ > 0);
+  init_ = true;
+  return true;
+}
+
+void BlobFileIterator::SeekToFirst() {
+  if (!init_) Init();
+  iterate_offset_ = 0;
+  PrefetchAndGet();
+}
+
+bool BlobFileIterator::Valid() const { return valid_; }
+
+void BlobFileIterator::Next() {
+  assert(init_);
+  PrefetchAndGet();
+}
+
+Slice BlobFileIterator::key() const { return cur_blob_record_.key; }
+
+Slice BlobFileIterator::value() const { return cur_blob_record_.value; }
+
+// Positions the iterator on the record that contains "offset" (or that
+// starts exactly at it) by walking the record headers from the beginning
+// of the file. The iterator is left invalid; call Next() to load that
+// record. On failure status() holds InvalidArgument when "offset" lies
+// beyond the record area, or the read / corruption error otherwise.
+void BlobFileIterator::IterateForPrev(uint64_t offset) {
+  // Whatever record was loaded before is no longer current, including on
+  // the error paths below.
+  valid_ = false;
+  if (!init_ && !Init()) return;  // status_ already holds the footer error
+
+  if (offset >= total_blocks_size_) {
+    iterate_offset_ = offset;
+    status_ = Status::InvalidArgument("Out of bound");
+    return;
+  }
+
+  Slice slice;
+  uint64_t body_length = 0;
+  uint64_t total_length = 0;
+  for (iterate_offset_ = 0; iterate_offset_ < offset;
+       iterate_offset_ += total_length) {
+    Status s = file_->Read(iterate_offset_, kBlobHeaderSize, &slice,
+                           reinterpret_cast<char*>(&body_length));
+    if (s.ok() && slice.size() < kBlobHeaderSize) {
+      s = Status::Corruption("BlobRecord", "truncated header");
+    }
+    if (!s.ok()) {
+      status_ = s;
+      return;
+    }
+    // Decode the length from the returned slice, as GetBlobRecord() does:
+    // Read() is not required to place the data in the scratch buffer.
+    body_length = *reinterpret_cast<const uint64_t*>(slice.data());
+    total_length = kBlobHeaderSize + body_length + kBlobTailerSize;
+  }
+
+  // The walk overshoots when "offset" falls inside a record; step back to
+  // the start of that record.
+  if (iterate_offset_ > offset) iterate_offset_ -= total_length;
+}
+
+// Reads and decodes the record whose header starts at iterate_offset_.
+// On success the record is exposed through key()/value(), the fields read
+// by GetBlobIndex() describe its body, iterate_offset_ moves past its
+// tailer and the iterator becomes valid. On failure status_ holds the
+// error and the iterator is invalid.
+void BlobFileIterator::GetBlobRecord() {
+  valid_ = false;
+
+  // read header
+  Slice slice;
+  uint64_t body_length = 0;
+  status_ = file_->Read(iterate_offset_, kBlobHeaderSize, &slice,
+                        reinterpret_cast<char*>(&body_length));
+  if (!status_.ok()) return;
+  if (slice.size() < kBlobHeaderSize) {
+    status_ = Status::Corruption("BlobRecord", "truncated header");
+    return;
+  }
+  body_length = *reinterpret_cast<const uint64_t*>(slice.data());
+  assert(body_length > 0);
+  iterate_offset_ += kBlobHeaderSize;
+
+  // read body and tailer
+  uint64_t left_size = body_length + kBlobTailerSize;
+  // resize(), not reserve(): Read() stores up to left_size bytes through
+  // data(), and only elements inside size() may be written or read.
+  buffer_.resize(left_size);
+  status_ = file_->Read(iterate_offset_, left_size, &slice, buffer_.data());
+  if (!status_.ok()) return;
+  if (slice.size() < left_size) {
+    status_ = Status::Corruption("BlobRecord", "truncated record");
+    return;
+  }
+
+  // parse body and tailer from the slice that Read() returned, which is
+  // not guaranteed to alias the scratch buffer
+  const char* body = slice.data();
+  const char* tailer = body + body_length;
+  auto checksum = DecodeFixed32(tailer + 1);
+  if (crc32c::Value(body, body_length) != checksum) {
+    status_ = Status::Corruption("BlobRecord", "checksum");
+    return;
+  }
+  auto compression = static_cast<CompressionType>(*tailer);
+  std::unique_ptr<char[]> uncompressed;
+  if (compression == kNoCompression) {
+    slice = {body, body_length};
+  } else {
+    status_ = Uncompress(compression, {body, body_length}, &slice,
+                         &uncompressed);
+    if (!status_.ok()) return;
+    // NOTE(review): presumably "slice" now points into "uncompressed",
+    // which dies when this function returns, while cur_blob_record_ keeps
+    // slices into the decoded bytes -- confirm against Uncompress() and
+    // DecodeInto(). Move the bytes into buffer_ so key()/value() stay
+    // readable until the next record is loaded. A temporary is used so
+    // the copy is safe even if the source aliases buffer_.
+    decltype(buffer_) owned(slice.data(), slice.data() + slice.size());
+    buffer_.swap(owned);
+    slice = {buffer_.data(), buffer_.size()};
+  }
+  status_ = DecodeInto(slice, &cur_blob_record_);
+  if (!status_.ok()) return;
+
+  cur_record_offset_ = iterate_offset_;
+  cur_record_size_ = body_length;
+  iterate_offset_ += left_size;
+  valid_ = true;
+}
+
+void BlobFileIterator::PrefetchAndGet() {
+  if (iterate_offset_ >= total_blocks_size_) {
+    valid_ = false;
+    return;
+  }
+
+  if (readahead_begin_offset_ > iterate_offset_ ||
+      readahead_end_offset_ < iterate_offset_) {
+    // alignment
+    readahead_begin_offset_ =
+        iterate_offset_ - (iterate_offset_ & (kDefaultPageSize - 1));
+    readahead_end_offset_ = readahead_begin_offset_;
+    readahead_size_ = kMinReadaheadSize;
+  }
+  auto min_blob_size =
+      iterate_offset_ + kBlobFixedSize + titan_cf_options_.min_blob_size;
+  if (readahead_end_offset_ <= min_blob_size) {
+    while (readahead_end_offset_ + readahead_size_ <= min_blob_size &&
+           readahead_size_ < kMaxReadaheadSize)
+      readahead_size_ <<= 1;
+    file_->Prefetch(readahead_end_offset_, readahead_size_);
+    readahead_end_offset_ += readahead_size_;
+    readahead_size_ = std::min(kMaxReadaheadSize, readahead_size_ << 1);
+  }
+
+  GetBlobRecord();
+
+  if (readahead_end_offset_ < iterate_offset_) {
+    readahead_end_offset_ = iterate_offset_;
+  }
+}
+
+BlobFileMergeIterator::BlobFileMergeIterator(
+    std::vector>&& blob_file_iterators)
+    : blob_file_iterators_(std::move(blob_file_iterators)) {}
+
+bool BlobFileMergeIterator::Valid() const {
+  if (current_ == nullptr) return false;
+  return current_->Valid();
+}
+
+void BlobFileMergeIterator::SeekToFirst() {
+  for (auto& iter : blob_file_iterators_) {
+    iter->SeekToFirst();
+    if (iter->status().ok() && iter->Valid()) min_heap_.push(iter.get());
+  }
+  if (!min_heap_.empty()) {
+    current_ =
min_heap_.top();
+    min_heap_.pop();
+  } else {
+    status_ = Status::Aborted("No iterator is valid");
+  }
+}
+
+// Advances to the record with the next smallest key across all files.
+// REQUIRES: Valid().
+// When every file is exhausted the iterator becomes invalid. When the
+// file that was just advanced reports an error, iteration stops, the
+// iterator becomes invalid and status() returns that error.
+void BlobFileMergeIterator::Next() {
+  assert(current_ != nullptr);
+  current_->Next();
+  if (!current_->status().ok()) {
+    // Remember the failure before dropping the iterator; otherwise it is
+    // lost as soon as current_ points at another file.
+    status_ = current_->status();
+    current_ = nullptr;
+    return;
+  }
+  if (current_->Valid()) min_heap_.push(current_);
+  if (min_heap_.empty()) {
+    // Nothing left to merge. top()/pop() on an empty priority_queue is
+    // undefined behavior, so mark the end explicitly instead.
+    current_ = nullptr;
+    return;
+  }
+  current_ = min_heap_.top();
+  min_heap_.pop();
+}
+
+Slice BlobFileMergeIterator::key() const {
+  assert(current_ != nullptr);
+  return current_->key();
+}
+
+Slice BlobFileMergeIterator::value() const {
+  assert(current_ != nullptr);
+  return current_->value();
+}
+
+}  // namespace titandb
+}  // namespace rocksdb
diff --git a/utilities/titandb/blob_file_iterator.h b/utilities/titandb/blob_file_iterator.h
new file mode 100644
index 00000000000..324d8ac35fb
--- /dev/null
+++ b/utilities/titandb/blob_file_iterator.h
@@ -0,0 +1,114 @@
+#ifndef ROCKSDB_BLOB_FILE_ITERATOR_H
+#define ROCKSDB_BLOB_FILE_ITERATOR_H
+
+#include
+#include
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "table/internal_iterator.h"
+#include "util/file_reader_writer.h"
+#include "utilities/titandb/blob_format.h"
+#include "utilities/titandb/options.h"
+
+namespace rocksdb {
+namespace titandb {
+
+class BlobFileIterator {
+ public:
+  const uint64_t kMinReadaheadSize = 4 << 10;
+  const uint64_t kMaxReadaheadSize = 256 << 10;
+
+  BlobFileIterator(std::unique_ptr&& file,
+                   uint64_t file_name, uint64_t file_size,
+                   const TitanCFOptions& titan_cf_options);
+  ~BlobFileIterator();
+
+  bool Init();
+  bool Valid() const;
+  void SeekToFirst();
+  void Next();
+  Slice key() const;
+  Slice value() const;
+  Status status() const { return status_; }
+
+  void IterateForPrev(uint64_t);
+
+  BlobIndex GetBlobIndex() {
+    BlobIndex blob_index;
+    blob_index.file_number = file_number_;
+    blob_index.blob_handle.offset = cur_record_offset_;
+    blob_index.blob_handle.size = cur_record_size_;
+    return blob_index;
+  }
+
+ private:
+  // Blob file info
+  const std::unique_ptr file_;
+  const uint64_t file_number_;
+  const uint64_t
file_size_; + TitanCFOptions titan_cf_options_; + + bool init_{false}; + uint64_t total_blocks_size_{0}; + + // Iterator status + Status status_; + bool valid_{false}; + + uint64_t iterate_offset_{0}; + std::vector buffer_; + BlobRecord cur_blob_record_; + uint64_t cur_record_offset_; + uint64_t cur_record_size_; + + uint64_t readahead_begin_offset_{0}; + uint64_t readahead_end_offset_{0}; + uint64_t readahead_size_{kMinReadaheadSize}; + + void PrefetchAndGet(); + void GetBlobRecord(); +}; + +class BlobFileMergeIterator { + public: + explicit BlobFileMergeIterator( + std::vector>&&); + + ~BlobFileMergeIterator() = default; + + bool Valid() const; + void SeekToFirst(); + void Next(); + Slice key() const; + Slice value() const; + Status status() const { + if (current_ != nullptr && !current_->status().ok()) + return current_->status(); + return status_; + } + + BlobIndex GetBlobIndex() { return current_->GetBlobIndex(); } + + private: + class IternalComparator { + public: + // Smaller value get Higher priority + bool operator()(const BlobFileIterator* iter1, + const BlobFileIterator* iter2) { + return BytewiseComparator()->Compare(iter1->key(), iter2->key()) > 0; + } + }; + + Status status_; + std::vector> blob_file_iterators_; + std::priority_queue, + IternalComparator> + min_heap_; + BlobFileIterator* current_ = nullptr; +}; + +} // namespace titandb +} // namespace rocksdb + +#endif // ROCKSDB_BLOB_FILE_ITERATOR_H diff --git a/utilities/titandb/blob_file_iterator_test.cc b/utilities/titandb/blob_file_iterator_test.cc new file mode 100644 index 00000000000..e535f5d75d4 --- /dev/null +++ b/utilities/titandb/blob_file_iterator_test.cc @@ -0,0 +1,225 @@ +#include "utilities/titandb/blob_file_iterator.h" + +#include + +#include "util/filename.h" +#include "util/testharness.h" +#include "utilities/titandb/blob_file_builder.h" +#include "utilities/titandb/blob_file_cache.h" +#include "utilities/titandb/blob_file_reader.h" + +namespace rocksdb { +namespace titandb { + 
+class BlobFileIteratorTest : public testing::Test { + public: + Env* env_{Env::Default()}; + TitanOptions titan_options_; + EnvOptions env_options_; + std::string dirname_; + std::string file_name_; + uint64_t file_number_; + std::unique_ptr builder_; + std::unique_ptr writable_file_; + std::unique_ptr blob_file_iterator_; + std::unique_ptr readable_file_; + + BlobFileIteratorTest() : dirname_(test::TmpDir(env_)) { + titan_options_.dirname = dirname_; + file_number_ = Random::GetTLSInstance()->Next(); + file_name_ = BlobFileName(dirname_, file_number_); + } + + ~BlobFileIteratorTest() { + env_->DeleteFile(file_name_); + env_->DeleteDir(dirname_); + } + + std::string GenKey(uint64_t i) { + char buf[64]; + snprintf(buf, sizeof(buf), "k-%08" PRIu64, i); + return buf; + } + + std::string GenValue(uint64_t k) { + if (k % 2 == 0) { + return std::string(titan_options_.min_blob_size - 1, 'v'); + } else { + return std::string(titan_options_.min_blob_size + 1, 'v'); + } + } + + void NewBuiler() { + TitanDBOptions db_options(titan_options_); + TitanCFOptions cf_options(titan_options_); + BlobFileCache cache(db_options, cf_options, {NewLRUCache(128)}); + + { + std::unique_ptr f; + ASSERT_OK(env_->NewWritableFile(file_name_, &f, env_options_)); + writable_file_.reset(new WritableFileWriter(std::move(f), env_options_)); + } + builder_.reset(new BlobFileBuilder(cf_options, writable_file_.get())); + } + + void AddKeyValue(const std::string& key, const std::string& value, + BlobHandle* blob_handle) { + BlobRecord record; + record.key = key; + record.value = value; + builder_->Add(record, blob_handle); + ASSERT_OK(builder_->status()); + } + + void FinishBuiler() { + ASSERT_OK(builder_->Finish()); + ASSERT_OK(builder_->status()); + } + + void NewBlobFileIterator() { + uint64_t file_size = 0; + ASSERT_OK(env_->GetFileSize(file_name_, &file_size)); + NewBlobFileReader(file_number_, 0, titan_options_, env_options_, env_, + &readable_file_); + blob_file_iterator_.reset(new 
BlobFileIterator{ + std::move(readable_file_), file_number_, file_size, TitanCFOptions()}); + } + + void TestBlobFileIterator() { + NewBuiler(); + + const int n = 1000; + std::vector handles(n); + for (int i = 0; i < n; i++) { + auto id = std::to_string(i); + AddKeyValue(id, id, &handles[i]); + } + + FinishBuiler(); + + NewBlobFileIterator(); + + blob_file_iterator_->SeekToFirst(); + for (int i = 0; i < n; blob_file_iterator_->Next(), i++) { + ASSERT_OK(blob_file_iterator_->status()); + ASSERT_EQ(blob_file_iterator_->Valid(), true); + auto id = std::to_string(i); + ASSERT_EQ(id, blob_file_iterator_->key()); + ASSERT_EQ(id, blob_file_iterator_->value()); + BlobIndex blob_index = blob_file_iterator_->GetBlobIndex(); + ASSERT_EQ(handles[i], blob_index.blob_handle); + } + } +}; + +TEST_F(BlobFileIteratorTest, Basic) { + TitanOptions options; + TestBlobFileIterator(); +} + +TEST_F(BlobFileIteratorTest, IterateForPrev) { + NewBuiler(); + const int n = 1000; + std::vector handles(n); + for (int i = 0; i < n; i++) { + auto id = std::to_string(i); + AddKeyValue(id, id, &handles[i]); + } + + FinishBuiler(); + + NewBlobFileIterator(); + + int i = n / 2; + blob_file_iterator_->IterateForPrev(handles[i].offset); + ASSERT_OK(blob_file_iterator_->status()); + for (blob_file_iterator_->Next(); i < n; i++, blob_file_iterator_->Next()) { + ASSERT_OK(blob_file_iterator_->status()); + ASSERT_EQ(blob_file_iterator_->Valid(), true); + BlobIndex blob_index; + blob_index = blob_file_iterator_->GetBlobIndex(); + ASSERT_EQ(handles[i], blob_index.blob_handle); + auto id = std::to_string(i); + ASSERT_EQ(id, blob_file_iterator_->key()); + ASSERT_EQ(id, blob_file_iterator_->value()); + } + + auto idx = Random::GetTLSInstance()->Uniform(n); + blob_file_iterator_->IterateForPrev(handles[idx].offset); + ASSERT_OK(blob_file_iterator_->status()); + blob_file_iterator_->Next(); + ASSERT_OK(blob_file_iterator_->status()); + ASSERT_TRUE(blob_file_iterator_->Valid()); + BlobIndex blob_index; + 
blob_index = blob_file_iterator_->GetBlobIndex(); + ASSERT_EQ(handles[idx], blob_index.blob_handle); + + while ((idx = Random::GetTLSInstance()->Uniform(n)) == 0) + ; + blob_file_iterator_->IterateForPrev(handles[idx].offset - kBlobHeaderSize - + 1); + ASSERT_OK(blob_file_iterator_->status()); + blob_file_iterator_->Next(); + ASSERT_OK(blob_file_iterator_->status()); + ASSERT_TRUE(blob_file_iterator_->Valid()); + blob_index = blob_file_iterator_->GetBlobIndex(); + ASSERT_EQ(handles[idx - 1], blob_index.blob_handle); + + idx = Random::GetTLSInstance()->Uniform(n); + blob_file_iterator_->IterateForPrev(handles[idx].offset + 1); + ASSERT_OK(blob_file_iterator_->status()); + blob_file_iterator_->Next(); + ASSERT_OK(blob_file_iterator_->status()); + ASSERT_TRUE(blob_file_iterator_->Valid()); + blob_index = blob_file_iterator_->GetBlobIndex(); + ASSERT_EQ(handles[idx], blob_index.blob_handle); +} + +TEST_F(BlobFileIteratorTest, MergeIterator) { + const int kMaxKeyNum = 1000; + std::vector handles(kMaxKeyNum); + std::vector> iters; + NewBuiler(); + for (int i = 1; i < kMaxKeyNum; i++) { + AddKeyValue(GenKey(i), GenValue(i), &handles[i]); + if (i % 100 == 0) { + FinishBuiler(); + uint64_t file_size = 0; + ASSERT_OK(env_->GetFileSize(file_name_, &file_size)); + NewBlobFileReader(file_number_, 0, titan_options_, env_options_, env_, + &readable_file_); + iters.emplace_back(std::unique_ptr( + new BlobFileIterator{std::move(readable_file_), file_number_, + file_size, TitanCFOptions()})); + file_number_ = Random::GetTLSInstance()->Next(); + file_name_ = BlobFileName(dirname_, file_number_); + NewBuiler(); + } + } + + FinishBuiler(); + uint64_t file_size = 0; + ASSERT_OK(env_->GetFileSize(file_name_, &file_size)); + NewBlobFileReader(file_number_, 0, titan_options_, env_options_, env_, + &readable_file_); + iters.emplace_back(std::unique_ptr(new BlobFileIterator{ + std::move(readable_file_), file_number_, file_size, TitanCFOptions()})); + BlobFileMergeIterator 
iter(std::move(iters)); + + iter.SeekToFirst(); + for (int i = 1; i < kMaxKeyNum; i++, iter.Next()) { + ASSERT_OK(iter.status()); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(iter.key(), GenKey(i)); + ASSERT_EQ(iter.value(), GenValue(i)); + ASSERT_EQ(iter.GetBlobIndex().blob_handle, handles[i]); + } +} + +} // namespace titandb +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/utilities/titandb/blob_file_manager.h b/utilities/titandb/blob_file_manager.h new file mode 100644 index 00000000000..f5664afb9a8 --- /dev/null +++ b/utilities/titandb/blob_file_manager.h @@ -0,0 +1,71 @@ +#pragma once + +#include "util/file_reader_writer.h" +#include "utilities/titandb/blob_format.h" + +namespace rocksdb { +namespace titandb { + +// Contains information to complete a blob file creation. +class BlobFileHandle { + public: + virtual ~BlobFileHandle() {} + + virtual uint64_t GetNumber() const = 0; + + virtual const std::string& GetName() const = 0; + + virtual WritableFileWriter* GetFile() const = 0; +}; + +// Manages the process of blob files creation. +class BlobFileManager { + public: + virtual ~BlobFileManager() {} + + // Creates a new file. The new file should not be accessed until + // FinishFile() has been called. + // If successful, sets "*handle" to the new file handle. + virtual Status NewFile(std::unique_ptr* handle) = 0; + + // Finishes the file with the provided metadata. Stops writing to + // the file anymore. + // REQUIRES: FinishFile(), DeleteFile() have not been called. 
+ virtual Status FinishFile(uint32_t cf_id, std::shared_ptr file, + std::unique_ptr&& handle) { + std::vector, + std::unique_ptr>> + tmp; + tmp.emplace_back(std::make_pair(file, std::move(handle))); + return BatchFinishFiles(cf_id, tmp); + } + + // Batch version of FinishFile + virtual Status BatchFinishFiles( + uint32_t cf_id, + const std::vector, + std::unique_ptr>>& files) { + (void)cf_id; + (void)files; + return Status::OK(); + }; + + // Deletes the file. If the caller is not going to call + // FinishFile(), it must call DeleteFile() to release the handle. + // REQUIRES: FinishFile(), DeleteFile() have not been called. + virtual Status DeleteFile(std::unique_ptr&& handle) { + std::vector> tmp; + tmp.emplace_back(std::move(handle)); + return BatchDeleteFiles(tmp); + } + + // Batch version of DeleteFile + virtual Status BatchDeleteFiles( + const std::vector>& handles) { + (void)handles; + return Status::OK(); + } +}; + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/blob_file_reader.cc b/utilities/titandb/blob_file_reader.cc new file mode 100644 index 00000000000..848b63b2047 --- /dev/null +++ b/utilities/titandb/blob_file_reader.cc @@ -0,0 +1,192 @@ +#include "utilities/titandb/blob_file_reader.h" + +#include "util/crc32c.h" +#include "util/filename.h" +#include "utilities/titandb/util.h" + +namespace rocksdb { +namespace titandb { + +Status NewBlobFileReader(uint64_t file_number, uint64_t readahead_size, + const TitanDBOptions& db_options, + const EnvOptions& env_options, Env* env, + std::unique_ptr* result) { + std::unique_ptr file; + auto file_name = BlobFileName(db_options.dirname, file_number); + Status s = env->NewRandomAccessFile(file_name, &file, env_options); + if (!s.ok()) return s; + + if (readahead_size > 0) { + file = NewReadaheadRandomAccessFile(std::move(file), readahead_size); + } + result->reset(new RandomAccessFileReader(std::move(file), file_name)); + return s; +} + +const uint64_t kMinReadaheadSize = 4 << 10; 
+const uint64_t kMaxReadaheadSize = 256 << 10; + +// Represents a buffer cached in the blob cache. +class BlobBuffer { + public: + Slice slice() const { return Slice(data_.get(), size_); } + + size_t cache_size() const { return size_ + sizeof(size_); } + + void assign(std::unique_ptr data, size_t size) { + data_ = std::move(data); + size_ = size; + } + + void release(PinnableSlice* buffer) { + buffer->PinSlice(slice(), cleanup, data_.release(), nullptr); + } + + static void cleanup(void *arg1, void* /*arg2*/) { + delete[] reinterpret_cast(arg1); + } + + private: + std::unique_ptr data_; + size_t size_; +}; + +namespace { + +void GenerateCachePrefix(std::string* dst, Cache* cc, RandomAccessFile* file) { + char buffer[kMaxVarint64Length * 3 + 1]; + auto size = file->GetUniqueId(buffer, sizeof(buffer)); + if (size == 0) { + auto end = EncodeVarint64(buffer, cc->NewId()); + size = end - buffer; + } + dst->assign(buffer, size); +} + +void EncodeBlobCache(std::string* dst, const Slice& prefix, uint64_t offset) { + dst->assign(prefix.data(), prefix.size()); + PutVarint64(dst, offset); +} + +} + +Status BlobFileReader::Open(const TitanCFOptions& options, + std::unique_ptr file, + uint64_t file_size, + std::unique_ptr* result) { + if (file_size < BlobFileFooter::kEncodedLength) { + return Status::Corruption("file is too short to be a blob file"); + } + + char footer_space[BlobFileFooter::kEncodedLength]; + Slice footer_input; + Status s = file->Read(file_size - BlobFileFooter::kEncodedLength, + BlobFileFooter::kEncodedLength, + &footer_input, footer_space); + if (!s.ok()) return s; + + BlobFileFooter footer; + s = DecodeInto(footer_input, &footer); + if (!s.ok()) return s; + + auto reader = new BlobFileReader(options, std::move(file)); + reader->footer_ = footer; + result->reset(reader); + return s; +} + +BlobFileReader::BlobFileReader(const TitanCFOptions& options, + std::unique_ptr file) + : options_(options), + file_(std::move(file)), + cache_(options.blob_cache) { + if 
(cache_) { + GenerateCachePrefix(&cache_prefix_, cache_.get(), file_->file()); + } +} + +Status BlobFileReader::Get(const ReadOptions& /*options*/, + const BlobHandle& handle, + BlobRecord* record,PinnableSlice* buffer) { + Status s; + std::string cache_key; + Cache::Handle* cache_handle = nullptr; + if (cache_) { + EncodeBlobCache(&cache_key, cache_prefix_, handle.offset); + cache_handle = cache_->Lookup(cache_key); + if (cache_handle) { + auto blob = reinterpret_cast(cache_->Value(cache_handle)); + auto slice = blob->slice(); + buffer->PinSlice(slice, UnrefCacheHandle, cache_.get(), cache_handle); + return DecodeInto(slice, record); + } + } + + BlobBuffer blob; + s = ReadBlob(handle, &blob); + if (!s.ok()) return s; + s = DecodeInto(blob.slice(), record); + if (!s.ok()) return s; + + if (cache_) { + auto cache_value = new BlobBuffer(std::move(blob)); + cache_->Insert(cache_key, cache_value, cache_value->cache_size(), + &DeleteCacheValue, &cache_handle); + buffer->PinSlice(cache_value->slice(), UnrefCacheHandle, + cache_.get(), cache_handle); + } else { + blob.release(buffer); + } + + return s; +} + +Status BlobFileReader::ReadBlob(const BlobHandle& handle, BlobBuffer* buffer) { + Slice blob; + size_t blob_size =handle.size+ kBlobTailerSize; + std::unique_ptr compressed(new char[blob_size]); + Status s = file_->Read(handle.offset, blob_size, + &blob, compressed.get()); + if (!s.ok()) return s; + + auto tailer = blob.data() + handle.size; + auto checksum = DecodeFixed32(tailer + 1); + if (crc32c::Value(blob.data(), handle.size) != checksum) { + return Status::Corruption("BlobRecord", "checksum"); + } + auto compression = static_cast(*tailer); + if (compression == kNoCompression) { + buffer->assign(std::move(compressed), handle.size); + } else { + Slice output; + std::unique_ptr uncompressed; + s = Uncompress(compression, blob, &output, &uncompressed); + if (!s.ok()) return s; + buffer->assign(std::move(uncompressed), output.size()); + } + + return s; +} + +Status 
BlobFilePrefetcher::Get(const ReadOptions& options, + const BlobHandle& handle, + BlobRecord* record, PinnableSlice* buffer) { + if (handle.offset == last_offset_) { + last_offset_ = handle.offset + handle.size; + if (handle.offset + handle.size > readahead_limit_) { + readahead_size_ = std::max(handle.size, readahead_size_); + reader_->file_->Prefetch(handle.offset, readahead_size_); + readahead_limit_ = handle.offset + readahead_size_; + readahead_size_ = std::min(kMaxReadaheadSize, readahead_size_ * 2); + } + } else { + last_offset_ = handle.offset + handle.size; + readahead_size_ = 0; + readahead_limit_ = 0; + } + + return reader_->Get(options, handle, record, buffer); +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/blob_file_reader.h b/utilities/titandb/blob_file_reader.h new file mode 100644 index 00000000000..5817ca9f531 --- /dev/null +++ b/utilities/titandb/blob_file_reader.h @@ -0,0 +1,70 @@ +#pragma once + +#include "util/file_reader_writer.h" +#include "utilities/titandb/blob_format.h" +#include "utilities/titandb/options.h" + +namespace rocksdb { +namespace titandb { + +class BlobBuffer; + +Status NewBlobFileReader(uint64_t file_number, uint64_t readahead_size, + const TitanDBOptions& db_options, + const EnvOptions& env_options, Env* env, + std::unique_ptr* result); + +class BlobFileReader { + public: + // Opens a blob file and read the necessary metadata from it. + // If successful, sets "*result" to the newly opened file reader. + static Status Open(const TitanCFOptions& options, + std::unique_ptr file, + uint64_t file_size, + std::unique_ptr* result); + + // Gets the blob record pointed by the handle in this file. The data + // of the record is stored in the provided buffer, so the buffer + // must be valid when the record is used. 
+ Status Get(const ReadOptions& options, const BlobHandle& handle, + BlobRecord* record, PinnableSlice* buffer); + + private: + friend class BlobFilePrefetcher; + + BlobFileReader(const TitanCFOptions& options, + std::unique_ptr file); + + Status ReadBlob(const BlobHandle& handle, BlobBuffer* buffer); + + TitanCFOptions options_; + std::unique_ptr file_; + + std::shared_ptr cache_; + std::string cache_prefix_; + + // Information read from the file. + BlobFileFooter footer_; +}; + +// Performs readahead on continuous reads. +class BlobFilePrefetcher : public Cleanable { + public: + // Constructs a prefetcher with the blob file reader. + // "*reader" must be valid when the prefetcher is used. + BlobFilePrefetcher(BlobFileReader* reader) + : reader_(reader) {} + + Status Get(const ReadOptions& options, + const BlobHandle& handle, + BlobRecord* record, PinnableSlice* buffer); + + private: + BlobFileReader* reader_; + uint64_t last_offset_ {0}; + uint64_t readahead_size_ {0}; + uint64_t readahead_limit_ {0}; +}; + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/blob_file_size_collector.cc b/utilities/titandb/blob_file_size_collector.cc new file mode 100644 index 00000000000..cdb4604adf5 --- /dev/null +++ b/utilities/titandb/blob_file_size_collector.cc @@ -0,0 +1,138 @@ +#include "utilities/titandb/blob_file_size_collector.h" + +namespace rocksdb { +namespace titandb { + +TablePropertiesCollector* +BlobFileSizeCollectorFactory::CreateTablePropertiesCollector( + rocksdb::TablePropertiesCollectorFactory::Context /* context */) { + return new BlobFileSizeCollector(); +} + +const std::string BlobFileSizeCollector::kPropertiesName = + "TitanDB.blob_discardable_size"; + +bool BlobFileSizeCollector::Encode( + const std::map& blob_files_size, std::string* result) { + PutVarint32(result, static_cast(blob_files_size.size())); + for (const auto& bfs : blob_files_size) { + PutVarint64(result, bfs.first); + PutVarint64(result, bfs.second); + } + return 
true; +} +bool BlobFileSizeCollector::Decode( + Slice* slice, std::map* blob_files_size) { + uint32_t num = 0; + if (!GetVarint32(slice, &num)) { + return false; + } + uint64_t file_number; + uint64_t size; + for (uint32_t i = 0; i < num; ++i) { + if (!GetVarint64(slice, &file_number)) { + return false; + } + if (!GetVarint64(slice, &size)) { + return false; + } + (*blob_files_size)[file_number] = size; + } + return true; +} + +Status BlobFileSizeCollector::AddUserKey(const Slice& /* key */, + const Slice& value, EntryType type, + SequenceNumber /* seq */, + uint64_t /* file_size */) { + if (type != kEntryBlobIndex) { + return Status::OK(); + } + + BlobIndex index; + auto s = index.DecodeFrom(const_cast(&value)); + if (!s.ok()) { + return s; + } + + auto iter = blob_files_size_.find(index.file_number); + if (iter == blob_files_size_.end()) { + blob_files_size_[index.file_number] = index.blob_handle.size; + } else { + iter->second += index.blob_handle.size; + } + + return Status::OK(); +} + +Status BlobFileSizeCollector::Finish(UserCollectedProperties* properties) { + std::string res; + Encode(blob_files_size_, &res); + *properties = UserCollectedProperties{{kPropertiesName, res}}; + return Status::OK(); +} + +BlobDiscardableSizeListener::BlobDiscardableSizeListener(TitanDBImpl* db, + port::Mutex* db_mutex, + VersionSet* versions) + : db_(db), db_mutex_(db_mutex), versions_(versions) {} + +BlobDiscardableSizeListener::~BlobDiscardableSizeListener() {} + +void BlobDiscardableSizeListener::OnCompactionCompleted( + rocksdb::DB* /* db */, const CompactionJobInfo& ci) { + std::map blob_files_size; + auto calc_bfs = [&ci, &blob_files_size](const std::vector& files, + int coefficient) { + for (const auto& file : files) { + auto tp_iter = ci.table_properties.find(file); + if (tp_iter == ci.table_properties.end()) { + continue; + } + auto ucp_iter = tp_iter->second->user_collected_properties.find( + BlobFileSizeCollector::kPropertiesName); + if (ucp_iter == 
tp_iter->second->user_collected_properties.end()) { + continue; + } + std::map input_blob_files_size; + std::string s = ucp_iter->second; + Slice slice{s}; + BlobFileSizeCollector::Decode(&slice, &input_blob_files_size); + for (const auto& input_bfs : input_blob_files_size) { + auto bfs_iter = blob_files_size.find(input_bfs.first); + if (bfs_iter == blob_files_size.end()) { + blob_files_size[input_bfs.first] = coefficient * input_bfs.second; + } else { + bfs_iter->second += coefficient * input_bfs.second; + } + } + } + }; + + calc_bfs(ci.input_files, -1); + calc_bfs(ci.output_files, 1); + + { + MutexLock l(db_mutex_); + Version* current = versions_->current(); + current->Ref(); + auto bs = current->GetBlobStorage(ci.cf_id).lock(); + for (const auto& bfs : blob_files_size) { + // blob file size < 0 means discardable size > 0 + if (bfs.second > 0) { + continue; + } + auto file = bs->FindFile(bfs.first).lock(); + file->discardable_size += static_cast(-bfs.second); + } + bs->ComputeGCScore(); + current->Unref(); + if (db_ != nullptr) { + db_->AddToGCQueue(ci.cf_id); + db_->MaybeScheduleGC(); + } + } +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/blob_file_size_collector.h b/utilities/titandb/blob_file_size_collector.h new file mode 100644 index 00000000000..c5479c6b574 --- /dev/null +++ b/utilities/titandb/blob_file_size_collector.h @@ -0,0 +1,61 @@ +#ifndef ROCKSDB_BLOB_GC_STATISTIS_H +#define ROCKSDB_BLOB_GC_STATISTIS_H + +#include "rocksdb/listener.h" +#include "rocksdb/table_properties.h" +#include "util/coding.h" +#include "utilities/titandb/db_impl.h" +#include "utilities/titandb/version.h" +#include "utilities/titandb/version_set.h" + +namespace rocksdb { +namespace titandb { + +class BlobFileSizeCollectorFactory final + : public TablePropertiesCollectorFactory { + public: + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) override; + + const char* Name() const 
override { return "BlobFileSizeCollector"; } +}; + +class BlobFileSizeCollector final : public TablePropertiesCollector { + public: + const static std::string kPropertiesName; + + static bool Encode(const std::map& blob_files_size, + std::string* result); + static bool Decode(Slice* slice, + std::map* blob_files_size); + + Status AddUserKey(const Slice& key, const Slice& value, EntryType type, + SequenceNumber seq, uint64_t file_size) override; + Status Finish(UserCollectedProperties* properties) override; + UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties(); + } + const char* Name() const override { return "BlobFileSizeCollector"; } + + private: + std::map blob_files_size_; +}; + +class BlobDiscardableSizeListener final : public EventListener { + public: + BlobDiscardableSizeListener(TitanDBImpl* db, port::Mutex* db_mutex, + VersionSet* versions); + ~BlobDiscardableSizeListener(); + + void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override; + + private: + TitanDBImpl* db_; + port::Mutex* db_mutex_; + VersionSet* versions_; +}; + +} // namespace titandb +} // namespace rocksdb + +#endif // ROCKSDB_BLOB_GC_STATISTIS_H diff --git a/utilities/titandb/blob_file_size_collector_test.cc b/utilities/titandb/blob_file_size_collector_test.cc new file mode 100644 index 00000000000..65874daac97 --- /dev/null +++ b/utilities/titandb/blob_file_size_collector_test.cc @@ -0,0 +1,96 @@ +#include "utilities/titandb/blob_file_size_collector.h" + +#include "util/filename.h" +#include "util/testharness.h" +#include "utilities/titandb/blob_file_builder.h" +#include "utilities/titandb/blob_file_cache.h" +#include "utilities/titandb/blob_gc_picker.h" +#include "utilities/titandb/version_set.h" + +namespace rocksdb { +namespace titandb { + +const static uint32_t kDefauleColumnFamilyID = 0x77; + +class BlobFileSizeCollectorTest : public testing::Test { + public: + port::Mutex mutex_; + std::unique_ptr vset_; + + 
BlobFileSizeCollectorTest() {} + ~BlobFileSizeCollectorTest() {} + + void NewVersionSet(const TitanDBOptions& titan_db_options, + const TitanCFOptions& titan_cf_options) { + auto blob_file_cache = std::make_shared( + titan_db_options, titan_cf_options, NewLRUCache(128)); + auto v = new Version(vset_.get()); + auto storage = + std::make_shared(TitanCFOptions(), blob_file_cache); + v->column_families_.emplace(kDefauleColumnFamilyID, storage); + vset_.reset(new VersionSet(titan_db_options)); + vset_->versions_.Append(v); + } + + void AddBlobFile(uint64_t file_number, uint64_t file_size, + uint64_t discardable_size, bool being_gc = false) { + vset_->current() + ->column_families_[kDefauleColumnFamilyID] + ->files_[file_number] = std::make_shared( + file_number, file_size, discardable_size, being_gc); + } + + void TestBasic() { + NewVersionSet(TitanDBOptions(), TitanCFOptions()); + CompactionJobInfo cji; + cji.cf_id = kDefauleColumnFamilyID; + AddBlobFile(1, 100, 5); + auto file = vset_->current()->GetBlobStorage(kDefauleColumnFamilyID).lock()->files_[1]; + ASSERT_EQ(file->discardable_size, 5); + TablePropertiesCollectorFactory::Context context; + context.column_family_id = kDefauleColumnFamilyID; + BlobFileSizeCollectorFactory factory; + std::shared_ptr c( + factory.CreateTablePropertiesCollector(context)); + BlobIndex bi; + bi.file_number = 1; + bi.blob_handle.size = 80; + std::string tmp; + bi.EncodeTo(&tmp); + ASSERT_OK(c->AddUserKey("random", tmp, EntryType::kEntryBlobIndex, 0, 0)); + std::shared_ptr tp = std::make_shared(); + UserCollectedProperties u; + c->Finish(&u); + tp->user_collected_properties.insert(u.begin(), u.end()); + cji.table_properties["1"] = tp; + cji.input_files.emplace_back("1"); + c.reset(factory.CreateTablePropertiesCollector(context)); + bi.file_number = 1; + bi.blob_handle.size = 60; + tmp.clear(); + bi.EncodeTo(&tmp); + ASSERT_OK(c->AddUserKey("random2", tmp, EntryType::kEntryBlobIndex, 0, 0)); + u.clear(); + c->Finish(&u); + std::shared_ptr 
tp2 = std::make_shared(); + tp2->user_collected_properties.insert(u.begin(), u.end()); + cji.table_properties["2"] = tp2; + cji.output_files.emplace_back("2"); + port::Mutex mutex; + BlobDiscardableSizeListener listener(nullptr, &mutex, vset_.get()); + listener.OnCompactionCompleted(nullptr, cji); + ASSERT_EQ(file->discardable_size, 25); + } +}; + +TEST_F(BlobFileSizeCollectorTest, Basic) { + TestBasic(); +} + +} // namespace titandb +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/utilities/titandb/blob_file_test.cc b/utilities/titandb/blob_file_test.cc new file mode 100644 index 00000000000..09af41beb0c --- /dev/null +++ b/utilities/titandb/blob_file_test.cc @@ -0,0 +1,101 @@ +#include "util/filename.h" +#include "util/testharness.h" +#include "utilities/titandb/blob_file_builder.h" +#include "utilities/titandb/blob_file_cache.h" +#include "utilities/titandb/blob_file_reader.h" + +namespace rocksdb { +namespace titandb { + +class BlobFileTest : public testing::Test { + public: + BlobFileTest() : dirname_(test::TmpDir(env_)) { + file_name_ = BlobFileName(dirname_, file_number_); + } + + ~BlobFileTest() { + env_->DeleteFile(file_name_); + env_->DeleteDir(dirname_); + } + + void TestBlobFile(TitanOptions options) { + options.dirname = dirname_; + TitanDBOptions db_options(options); + TitanCFOptions cf_options(options); + BlobFileCache cache(db_options, cf_options, {NewLRUCache(128)}); + + const int n = 100; + std::vector handles(n); + + std::unique_ptr file; + { + std::unique_ptr f; + ASSERT_OK(env_->NewWritableFile(file_name_, &f, env_options_)); + file.reset(new WritableFileWriter(std::move(f), env_options_)); + } + std::unique_ptr builder( + new BlobFileBuilder(cf_options, file.get())); + + for (int i = 0; i < n; i++) { + auto id = std::to_string(i); + BlobRecord record; + record.key = id; + record.value = id; + builder->Add(record, &handles[i]); + 
ASSERT_OK(builder->status()); + } + ASSERT_OK(builder->Finish()); + ASSERT_OK(builder->status()); + + uint64_t file_size = 0; + ASSERT_OK(env_->GetFileSize(file_name_, &file_size)); + + ReadOptions ro; + std::unique_ptr prefetcher; + ASSERT_OK(cache.NewPrefetcher(file_number_, file_size, &prefetcher)); + for (int i = 0; i < n; i++) { + auto id = std::to_string(i); + BlobRecord expect; + expect.key = id; + expect.value = id; + BlobRecord record; + PinnableSlice buffer; + ASSERT_OK(cache.Get(ro, file_number_, file_size, handles[i], + &record, &buffer)); + ASSERT_EQ(record, expect); + buffer.Reset(); + ASSERT_OK( + cache.Get(ro, file_number_, file_size, handles[i], &record, &buffer)); + ASSERT_EQ(record, expect); + buffer.Reset(); + ASSERT_OK(prefetcher->Get(ro, handles[i], &record, &buffer)); + ASSERT_EQ(record, expect); + buffer.Reset(); + ASSERT_OK(prefetcher->Get(ro, handles[i], &record, &buffer)); + ASSERT_EQ(record, expect); + } + } + + Env* env_{Env::Default()}; + EnvOptions env_options_; + std::string dirname_; + std::string file_name_; + uint64_t file_number_{1}; +}; + +TEST_F(BlobFileTest, Basic) { + TitanOptions options; + TestBlobFile(options); + options.blob_cache = NewLRUCache(1 << 20); + TestBlobFile(options); + options.blob_file_compression = kZSTD; + TestBlobFile(options); +} + +} // namespace titandb +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/utilities/titandb/blob_format.cc b/utilities/titandb/blob_format.cc new file mode 100644 index 00000000000..8b5b7622c13 --- /dev/null +++ b/utilities/titandb/blob_format.cc @@ -0,0 +1,130 @@ +#include "utilities/titandb/blob_format.h" + +#include "util/crc32c.h" + +namespace rocksdb { +namespace titandb { + +namespace { + +bool GetChar(Slice* src, unsigned char* value) { + if (src->size() < 1) return false; + *value = *src->data(); + src->remove_prefix(1); + return true; +} + +} // namespace + +void 
BlobRecord::EncodeTo(std::string* dst) const { + PutLengthPrefixedSlice(dst, key); + PutLengthPrefixedSlice(dst, value); +} + +Status BlobRecord::DecodeFrom(Slice* src) { + if (!GetLengthPrefixedSlice(src, &key) || + !GetLengthPrefixedSlice(src, &value)) { + return Status::Corruption("BlobRecord"); + } + return Status::OK(); +} + +bool operator==(const BlobRecord& lhs, const BlobRecord& rhs) { + return lhs.key == rhs.key && lhs.value == rhs.value; +} + +void BlobHandle::EncodeTo(std::string* dst) const { + PutVarint64(dst, offset); + PutVarint64(dst, size); +} + +Status BlobHandle::DecodeFrom(Slice* src) { + if (!GetVarint64(src, &offset) || + !GetVarint64(src, &size)) { + return Status::Corruption("BlobHandle"); + } + return Status::OK(); +} + +bool operator==(const BlobHandle& lhs, const BlobHandle& rhs) { + return lhs.offset == rhs.offset && lhs.size == rhs.size; +} + +void BlobIndex::EncodeTo(std::string* dst) const { + dst->push_back(kBlobRecord); + PutVarint64(dst, file_number); + blob_handle.EncodeTo(dst); +} + +Status BlobIndex::DecodeFrom(Slice* src) { + unsigned char type; + if (!GetChar(src, &type) || type != kBlobRecord || + !GetVarint64(src, &file_number)) { + return Status::Corruption("BlobIndex"); + } + Status s = blob_handle.DecodeFrom(src); + if (!s.ok()) { + return Status::Corruption("BlobIndex", s.ToString()); + } + return s; +} + +bool operator==(const BlobIndex& lhs, const BlobIndex& rhs) { + return (lhs.file_number == rhs.file_number && + lhs.blob_handle == rhs.blob_handle); +} + +void BlobFileMeta::EncodeTo(std::string* dst) const { + PutVarint64(dst, file_number); + PutVarint64(dst, file_size); +} + +Status BlobFileMeta::DecodeFrom(Slice* src) { + if (!GetVarint64(src, &file_number) || !GetVarint64(src, &file_size)) { + return Status::Corruption("BlobFileMeta"); + } + return Status::OK(); +} + +bool operator==(const BlobFileMeta& lhs, const BlobFileMeta& rhs) { + return (lhs.file_number == rhs.file_number && lhs.file_size == rhs.file_size); 
+} + +void BlobFileFooter::EncodeTo(std::string* dst) const { + auto size = dst->size(); + meta_index_handle.EncodeTo(dst); + // Add padding to make a fixed size footer. + dst->resize(size + kEncodedLength - 12); + PutFixed64(dst, kMagicNumber); + Slice encoded(dst->data() + size, dst->size() - size); + PutFixed32(dst, crc32c::Value(encoded.data(), encoded.size())); +} + +Status BlobFileFooter::DecodeFrom(Slice* src) { + auto data = src->data(); + Status s = meta_index_handle.DecodeFrom(src); + if (!s.ok()) { + return Status::Corruption("BlobFileFooter", s.ToString()); + } + // Remove padding. + src->remove_prefix(data + kEncodedLength - 12 - src->data()); + uint64_t magic_number = 0; + if (!GetFixed64(src, &magic_number) || magic_number != kMagicNumber) { + return Status::Corruption("BlobFileFooter", "magic number"); + } + Slice decoded(data, src->data() - data); + uint32_t checksum = 0; + if (!GetFixed32(src, &checksum) || + crc32c::Value(decoded.data(), decoded.size()) != checksum) { + return Status::Corruption("BlobFileFooter", "checksum"); + } + return Status::OK(); +} + +bool operator==(const BlobFileFooter& lhs, const BlobFileFooter& rhs) { + return (lhs.meta_index_handle.offset() == rhs.meta_index_handle.offset() && + lhs.meta_index_handle.size() == rhs.meta_index_handle.size()); +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/blob_format.h b/utilities/titandb/blob_format.h new file mode 100644 index 00000000000..8a140c3807a --- /dev/null +++ b/utilities/titandb/blob_format.h @@ -0,0 +1,134 @@ +#pragma once + +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/options.h" +#include "table/format.h" + +namespace rocksdb { +namespace titandb { + +// 8 bytes body length +const uint32_t kBlobHeaderSize = 8; + +// compression : char +// checksum : fixed32 +const uint32_t kBlobTailerSize = 5; + +const uint32_t kBlobFixedSize = kBlobHeaderSize + kBlobTailerSize; + +// Blob record format: +// +// key : 
varint64 length + length bytes +// value : varint64 length + length bytes +struct BlobRecord { + Slice key; + Slice value; + struct MetaData { + SequenceNumber seq_num; + } metadata; + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(Slice* src); + + friend bool operator==(const BlobRecord& lhs, const BlobRecord& rhs); +}; + +// Blob handle format: +// +// offset : varint64 +// size : varint64 +struct BlobHandle { + uint64_t offset{0}; + uint64_t size{0}; + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(Slice* src); + + friend bool operator==(const BlobHandle& lhs, const BlobHandle& rhs); +}; + +// Blob index format: +// +// type : char +// file_number : varint64 +// blob_handle : varint64 offset + varint64 size +struct BlobIndex { + enum Type : unsigned char { + kBlobRecord = 1, + }; + uint64_t file_number{0}; + BlobHandle blob_handle; + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(Slice* src); + + friend bool operator==(const BlobIndex& lhs, const BlobIndex& rhs); +}; + +// Blob file meta format: +// +// file_number : varint64 +// file_size : varint64 +struct BlobFileMeta { + BlobFileMeta(){}; + BlobFileMeta(uint64_t _file_number, uint64_t _file_size, + uint64_t _discardable_size = 0, bool _being_gc = false, + bool _marked_for_sample = true) + : file_number(_file_number), + file_size(_file_size), + discardable_size(_discardable_size), + marked_for_sample(_marked_for_sample), + being_gc(_being_gc) {} + + // Persistent field, we should never modify it. 
+ uint64_t file_number{0}; + uint64_t file_size{0}; + + // Not persistent field + // These fields maybe are mutate, need to be protected by db.mutex_ + uint64_t discardable_size{0}; + bool marked_for_gc = false; + bool marked_for_sample = true; + + // This field can be modified concurrently + std::atomic_bool being_gc{false}; + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(Slice* src); + + friend bool operator==(const BlobFileMeta& lhs, const BlobFileMeta& rhs); +}; + +// Blob file footer format: +// +// meta_index_handle : varint64 offset + varint64 size +// : [... kEncodedLength - 12] bytes +// magic_number : fixed64 +// checksum : fixed32 +struct BlobFileFooter { + // The first 64bits from $(echo titandb/blob | sha1sum). + static const uint64_t kMagicNumber {0xcd3f52ea0fe14511ull}; + static const uint64_t kEncodedLength{BlockHandle::kMaxEncodedLength + 8 + 4}; + + BlockHandle meta_index_handle {BlockHandle::NullBlockHandle()}; + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(Slice* src); + + friend bool operator==(const BlobFileFooter& lhs, const BlobFileFooter& rhs); +}; + +// A convenient template to decode a const slice. 
+template +Status DecodeInto(const Slice& src, T* target) { + auto tmp = src; + auto s = target->DecodeFrom(&tmp); + if (s.ok() && !tmp.empty()) { + s = Status::Corruption(Slice()); + } + return s; +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/blob_format_test.cc b/utilities/titandb/blob_format_test.cc new file mode 100644 index 00000000000..4d6bc647432 --- /dev/null +++ b/utilities/titandb/blob_format_test.cc @@ -0,0 +1,57 @@ +#include "util/testharness.h" +#include "utilities/titandb/util.h" +#include "utilities/titandb/blob_format.h" + +namespace rocksdb { +namespace titandb { + +class BlobFormatTest : public testing::Test {}; + +TEST(BlobFormatTest, BlobRecord) { + BlobRecord input; + CheckCodec(input); + input.key = "hello"; + input.value = "world"; + CheckCodec(input); +} + +TEST(BlobFormatTest, BlobHandle) { + BlobHandle input; + CheckCodec(input); + input.offset = 2; + input.size = 3; + CheckCodec(input); +} + +TEST(BlobFormatTest, BlobIndex) { + BlobIndex input; + CheckCodec(input); + input.file_number = 1; + input.blob_handle.offset = 2; + input.blob_handle.size = 3; + CheckCodec(input); +} + +TEST(BlobFormatTest, BlobFileMeta) { + BlobFileMeta input; + CheckCodec(input); + input.file_number = 2; + input.file_size = 3; + CheckCodec(input); +} + +TEST(BlobFormatTest, BlobFileFooter) { + BlobFileFooter input; + CheckCodec(input); + input.meta_index_handle.set_offset(123); + input.meta_index_handle.set_size(321); + CheckCodec(input); +} + +} // namespace titandb +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/utilities/titandb/blob_gc.cc b/utilities/titandb/blob_gc.cc new file mode 100644 index 00000000000..490496554df --- /dev/null +++ b/utilities/titandb/blob_gc.cc @@ -0,0 +1,12 @@ +#include "utilities/titandb/blob_gc.h" + +namespace rocksdb { +namespace titandb { + +BlobGC::BlobGC(std::vector&& blob_files) + : 
candidate_files_(std::move(blob_files)) {} + +BlobGC::~BlobGC() {} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/blob_gc.h b/utilities/titandb/blob_gc.h new file mode 100644 index 00000000000..c64c1008ba7 --- /dev/null +++ b/utilities/titandb/blob_gc.h @@ -0,0 +1,42 @@ +#ifndef ROCKSDB_BLOB_GC_H +#define ROCKSDB_BLOB_GC_H + +#include + +#include "utilities/titandb/blob_format.h" + +namespace rocksdb { +namespace titandb { + +// A BlobGC encapsulates information about a blob gc. +class BlobGC { + public: + BlobGC(std::vector&& blob_files); + ~BlobGC(); + + const std::vector& candidate_files() { + return candidate_files_; + } + + void set_selected_files(std::vector&& files) { + selected_files_ = std::move(files); + } + + const std::vector& selected_files() { return selected_files_; } + + void ClearSelectedFiles() { selected_files_.clear(); } + + private: + std::vector candidate_files_; + std::vector selected_files_; +}; + +struct GCScore { + uint64_t file_number; + double score; +}; + +} // namespace titandb +} // namespace rocksdb + +#endif // ROCKSDB_BLOB_GC_H diff --git a/utilities/titandb/blob_gc_job.cc b/utilities/titandb/blob_gc_job.cc new file mode 100644 index 00000000000..51144e12640 --- /dev/null +++ b/utilities/titandb/blob_gc_job.cc @@ -0,0 +1,371 @@ +#include "utilities/titandb/blob_gc_job.h" + +#include "db/column_family.h" +#include "db/db_impl.h" +#include "table/internal_iterator.h" +#include "table/merging_iterator.h" +#include "utilities/titandb/blob_file_builder.h" +#include "utilities/titandb/blob_file_iterator.h" +#include "utilities/titandb/blob_file_manager.h" +#include "utilities/titandb/blob_file_reader.h" +#include "utilities/titandb/version.h" +#include "utilities/titandb/version_edit.h" + +namespace rocksdb { +namespace titandb { + +// Write callback for garbage collection to check if key has been updated +// since last read. Similar to how OptimisticTransaction works. 
+class BlobGCJob::GarbageCollectionWriteCallback : public WriteCallback { + public: + GarbageCollectionWriteCallback(ColumnFamilyData* cfd, std::string&& key, + SequenceNumber upper_bound) + : cfd_(cfd), key_(std::move(key)), upper_bound_(upper_bound) {} + + virtual Status Callback(DB* db) override { + auto* db_impl = reinterpret_cast(db); + auto* sv = db_impl->GetAndRefSuperVersion(cfd_); + SequenceNumber latest_seq = 0; + bool found_record_for_key = false; + bool is_blob_index = false; + Status s = db_impl->GetLatestSequenceForKey( + sv, key_, false /*cache_only*/, &latest_seq, &found_record_for_key, + &is_blob_index); + db_impl->ReturnAndCleanupSuperVersion(cfd_, sv); + if (!s.ok() && !s.IsNotFound()) { + // Error. + assert(!s.IsBusy()); + return s; + } + if (s.IsNotFound()) { + // Deleted + assert(!found_record_for_key); + return Status::Busy("Key deleted"); + } + assert(found_record_for_key); + if (!is_blob_index || latest_seq > upper_bound_) { + return Status::Busy("Key overwritten"); + } + return s; + } + + virtual bool AllowWriteBatching() override { return false; } + + private: + ColumnFamilyData* cfd_; + // Key to check + std::string key_; + // Upper bound of sequence number to proceed. 
+ SequenceNumber upper_bound_; +}; + +BlobGCJob::BlobGCJob(BlobGC* blob_gc, DB* db, ColumnFamilyHandle* cfh, + port::Mutex* mutex, const TitanDBOptions& titan_db_options, + const TitanCFOptions& titan_cf_options, Env* env, + const EnvOptions& env_options, + BlobFileManager* blob_file_manager, + VersionSet* version_set) + : blob_gc_(blob_gc), + base_db_(db), + base_db_impl_(reinterpret_cast(base_db_)), + cfh_(cfh), + tdb_mutex_(mutex), + titan_db_options_(titan_db_options), + titan_cf_options_(titan_cf_options), + env_(env), + env_options_(env_options), + blob_file_manager_(blob_file_manager), + version_set_(version_set) {} + +BlobGCJob::~BlobGCJob() { + if (cmp_) delete cmp_; +} + +Status BlobGCJob::Prepare() { return Status::OK(); } + +Status BlobGCJob::Run() { + Status s; + + s = SampleCandidateFiles(); + if (!s.ok()) return s; + + s = DoRunGC(); + if (!s.ok()) return s; + + return Status::OK(); +} + +Status BlobGCJob::SampleCandidateFiles() { + std::vector result; + for (const auto& file : blob_gc_->candidate_files()) { + if (!file->marked_for_sample || DoSample(file)) { + result.push_back(file); + } + } + + if (result.empty()) return Status::Aborted("No blob file need to be gc"); + + blob_gc_->set_selected_files(std::move(result)); + + return Status::OK(); +} + +bool BlobGCJob::DoSample(const BlobFileMeta* file) { + Status s; + uint64_t sample_size_window = static_cast( + file->file_size * titan_cf_options_.sample_flie_size_ratio); + Random64 random64(file->file_size); + uint64_t sample_begin_offset = + random64.Uniform(file->file_size - sample_size_window); + + std::unique_ptr file_reader; + s = NewBlobFileReader(file->file_number, 0, titan_db_options_, env_options_, + env_, &file_reader); + assert(s.ok()); + BlobFileIterator iter(std::move(file_reader), file->file_number, + file->file_size, titan_cf_options_); + iter.IterateForPrev(sample_begin_offset); + assert(iter.status().ok()); + + uint64_t iterated_size{0}; + uint64_t discardable_size{0}; + for 
(iter.Next(); + iterated_size < sample_size_window && iter.status().ok() && iter.Valid(); + iter.Next()) { + BlobIndex blob_index = iter.GetBlobIndex(); + uint64_t total_length = blob_index.blob_handle.size + kBlobFixedSize; + iterated_size += total_length; + if (DiscardEntry(iter.key(), blob_index)) { + discardable_size += total_length; + } + } + assert(iter.status().ok()); + + return discardable_size >= + sample_size_window * titan_cf_options_.blob_file_discardable_ratio; +} + +Status BlobGCJob::DoRunGC() { + Status s; + + std::unique_ptr gc_iter; + s = BuildIterator(&gc_iter); + if (!s.ok()) return s; + if (!gc_iter) return Status::Aborted("Build iterator for gc failed"); + + // Similar to OptimisticTransaction, we obtain latest_seq from + // base DB, which is guaranteed to be no smaller than the sequence of + // current key. We use a WriteCallback on write to check the key sequence + // on write. If the key sequence is larger than latest_seq, we know + // a new versions is inserted and the old blob can be discard. + // + // We cannot use OptimisticTransaction because we need to pass + // is_blob_index flag to GetImpl. 
+ std::unique_ptr blob_file_handle; + std::unique_ptr blob_file_builder; + auto* cfd = reinterpret_cast(this->cfh_)->cfd(); + for (gc_iter->SeekToFirst(); gc_iter->status().ok() && gc_iter->Valid(); + gc_iter->Next()) { + // This API is very lightweight + SequenceNumber latest_seq = base_db_->GetLatestSequenceNumber(); + + BlobIndex blob_index = gc_iter->GetBlobIndex(); + if (DiscardEntry(gc_iter->key(), blob_index)) { + continue; + } + + // Rewrite entry to new blob file + if (!blob_file_handle && !blob_file_builder) { + s = blob_file_manager_->NewFile(&blob_file_handle); + if (!s.ok()) { + break; + } + blob_file_builder = unique_ptr( + new BlobFileBuilder(titan_cf_options_, blob_file_handle->GetFile())); + } + assert(blob_file_handle); + assert(blob_file_builder); + + BlobRecord blob_record; + blob_record.key = gc_iter->key(); + blob_record.value = gc_iter->value(); + blob_index.file_number = blob_file_handle->GetNumber(); + blob_file_builder->Add(blob_record, &blob_index.blob_handle); + std::string index_entry; + blob_index.EncodeTo(&index_entry); + + // Store WriteBatch for rewriting new Key-Index pairs to LSM + rewrite_batches_.emplace_back(std::make_pair( + WriteBatch(), + GarbageCollectionWriteCallback{ + cfd, std::string(blob_record.key.data(), blob_record.key.size()), + latest_seq})); + auto& wb = rewrite_batches_.back().first; + s = WriteBatchInternal::PutBlobIndex(&wb, cfh_->GetID(), blob_record.key, + index_entry); + if (!s.ok()) { + break; + } + } + + if (gc_iter->status().ok() && s.ok()) { + if (blob_file_builder && blob_file_handle) { + assert(blob_file_builder->status().ok()); + blob_file_builders_.emplace_back(std::make_pair( + std::move(blob_file_handle), std::move(blob_file_builder))); + } else { + assert(!blob_file_builder); + assert(!blob_file_handle); + } + } else if (!gc_iter->status().ok()) { + return gc_iter->status(); + } + + return s; +} + +Status BlobGCJob::BuildIterator(unique_ptr* result) { + Status s; + const auto& inputs = 
blob_gc_->selected_files(); + assert(!inputs.empty()); + std::vector> list; + for (std::size_t i = 0; i < inputs.size(); ++i) { + std::unique_ptr file; + s = NewBlobFileReader(inputs[i]->file_number, 0, titan_db_options_, + env_options_, env_, &file); + if (!s.ok()) { + break; + } + list.emplace_back(std::unique_ptr( + new BlobFileIterator(std::move(file), inputs[i]->file_number, + inputs[i]->file_size, titan_cf_options_))); + } + + if (s.ok()) result->reset(new BlobFileMergeIterator(std::move(list))); + + return s; +} + +bool BlobGCJob::DiscardEntry(const Slice& key, const BlobIndex& blob_index) { + PinnableSlice index_entry; + bool is_blob_index; + auto s = base_db_impl_->GetImpl(ReadOptions(), cfh_, key, &index_entry, + nullptr /*value_found*/, + nullptr /*read_callback*/, &is_blob_index); + if (!s.ok() && !s.IsNotFound()) { + return true; + } + if (s.IsNotFound() || !is_blob_index) { + // Either the key is deleted or updated with a newer version which is + // inlined in LSM. + return true; + } + + BlobIndex other_blob_index; + s = other_blob_index.DecodeFrom(&index_entry); + if (!s.ok()) { + return true; + } + + return !(blob_index == other_blob_index); +} + +// We have to make sure crash consistency, but LSM db MANIFEST and BLOB db +// MANIFEST are separate, so we need to make sure all new blob file have +// added to db before we rewrite any key to LSM +Status BlobGCJob::Finish() { + Status s; + { + tdb_mutex_->Unlock(); + + s = InstallOutputBlobFiles(); + + if (s.ok()) s = RewriteValidKeyToLSM(); + + if (!s.ok()) blob_gc_->ClearSelectedFiles(); + + tdb_mutex_->Lock(); + } + + // TODO(@DorianZheng) cal discardable size for new blob file + + if (s.ok()) { + s = DeleteInputBlobFiles(); + } + + // We need to unset being_gc mark for unselected candidate blob files + for (auto& file : blob_gc_->candidate_files()) { + bool selected = false; + for (std::size_t i = 0; i < blob_gc_->selected_files().size(); i++) { + if (*blob_gc_->selected_files()[i] == *file) { + 
selected = true; + break; + } + } + if (!selected) file->being_gc.store(false, std::memory_order_release); + } + + return s; +} + +Status BlobGCJob::InstallOutputBlobFiles() { + Status s; + for (auto& builder : this->blob_file_builders_) { + s = builder.second->Finish(); + if (!s.ok()) { + break; + } + } + if (s.ok()) { + std::vector, + std::unique_ptr>> + files; + for (auto& builder : this->blob_file_builders_) { + auto file = std::make_shared(); + file->file_number = builder.first->GetNumber(); + file->file_size = builder.first->GetFile()->GetFileSize(); + files.emplace_back(make_pair(move(file), std::move(builder.first))); + } + this->blob_file_manager_->BatchFinishFiles(this->cfh_->GetID(), files); + } else { + std::vector> handles; + for (auto& builder : this->blob_file_builders_) + handles.emplace_back(std::move(builder.first)); + this->blob_file_manager_->BatchDeleteFiles(handles); + } + return s; +} + +Status BlobGCJob::RewriteValidKeyToLSM() { + Status s; + auto* db_impl = reinterpret_cast(this->base_db_); + for (auto& write_batch : this->rewrite_batches_) { + s = db_impl->WriteWithCallback(WriteOptions(), &write_batch.first, + &write_batch.second); + if (s.ok()) { + // Key is successfully written to LSM + } else if (s.IsBusy()) { + // The key is overwritten in the meanwhile. Drop the blob record. + } else { + // We hit an error. 
+ break; + } + } + return s; +} + +Status BlobGCJob::DeleteInputBlobFiles() const { + Status s; + VersionEdit edit; + edit.SetColumnFamilyID(cfh_->GetID()); + for (const auto& file : blob_gc_->selected_files()) { + edit.DeleteBlobFile(file->file_number); + } + s = version_set_->LogAndApply(&edit, this->tdb_mutex_); + // TODO(@DorianZheng) Purge pending outputs + // base_db_->pending_outputs_.erase(handle->GetNumber()); + return s; +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/blob_gc_job.h b/utilities/titandb/blob_gc_job.h new file mode 100644 index 00000000000..f3e5d394da1 --- /dev/null +++ b/utilities/titandb/blob_gc_job.h @@ -0,0 +1,68 @@ +#ifndef ROCKSDB_BLOB_GC_JOB_H +#define ROCKSDB_BLOB_GC_JOB_H + +#include "db/db_impl.h" +#include "rocksdb/status.h" +#include "utilities/titandb/blob_file_builder.h" +#include "utilities/titandb/blob_file_iterator.h" +#include "utilities/titandb/blob_file_manager.h" +#include "utilities/titandb/blob_gc.h" +#include "utilities/titandb/options.h" +#include "utilities/titandb/version_set.h" + +namespace rocksdb { +namespace titandb { + +class BlobGCJob { + public: + BlobGCJob(BlobGC* blob_gc, DB* db, ColumnFamilyHandle* cfh, + port::Mutex* mutex, const TitanDBOptions& titan_db_options, + const TitanCFOptions& titan_cf_options, Env* env, + const EnvOptions& env_options, BlobFileManager* blob_file_manager, + VersionSet* version_set); + ~BlobGCJob(); + + // REQUIRE: mutex held + Status Prepare(); + // REQUIRE: mutex not held + Status Run(); + // REQUIRE: mutex held + Status Finish(); + + private: + class GarbageCollectionWriteCallback; + friend class BlobGCJobTest; + + BlobGC* blob_gc_; + DB* base_db_; + DBImpl* base_db_impl_; + ColumnFamilyHandle* cfh_; + port::Mutex* tdb_mutex_; + TitanDBOptions titan_db_options_; + TitanCFOptions titan_cf_options_; + Env* env_; + EnvOptions env_options_; + BlobFileManager* blob_file_manager_; + titandb::VersionSet* version_set_; + + std::vector, + 
std::unique_ptr>> + blob_file_builders_; + std::vector> + rewrite_batches_; + InternalKeyComparator* cmp_{nullptr}; + + Status SampleCandidateFiles(); + bool DoSample(const BlobFileMeta* file); + Status DoRunGC(); + Status BuildIterator(std::unique_ptr* result); + bool DiscardEntry(const Slice& key, const BlobIndex& blob_index); + Status InstallOutputBlobFiles(); + Status RewriteValidKeyToLSM(); + Status DeleteInputBlobFiles() const; +}; + +} // namespace titandb +} // namespace rocksdb + +#endif // ROCKSDB_BLOB_GC_JOB_H diff --git a/utilities/titandb/blob_gc_job_test.cc b/utilities/titandb/blob_gc_job_test.cc new file mode 100644 index 00000000000..eb82039378b --- /dev/null +++ b/utilities/titandb/blob_gc_job_test.cc @@ -0,0 +1,217 @@ +#include "utilities/titandb/blob_gc_job.h" + +#include "util/testharness.h" +#include "utilities/titandb/blob_gc_picker.h" +#include "utilities/titandb/db_impl.h" + +namespace rocksdb { +namespace titandb { + +const static int MAX_KEY_NUM = 1000; + +class BlobGCJobTest : public testing::Test { + public: + std::string dbname_; + TitanDB* db_; + DBImpl* base_db_; + TitanDBImpl* tdb_; + VersionSet* version_set_; + TitanOptions options_; + port::Mutex* mutex_; + + BlobGCJobTest() : dbname_(test::TmpDir()) { + options_.dirname = dbname_ + "/titandb"; + options_.create_if_missing = true; + options_.disable_background_gc = true; + } + ~BlobGCJobTest() {} + + void ClearDir() { + std::vector filenames; + options_.env->GetChildren(options_.dirname, &filenames); + for (auto& fname : filenames) { + if (fname != "." && fname != "..") + ASSERT_OK(options_.env->DeleteFile(options_.dirname + "/" + fname)); + } + ASSERT_OK(options_.env->DeleteDir(options_.dirname)); + filenames.clear(); + options_.env->GetChildren(dbname_, &filenames); + for (auto& fname : filenames) { + if (fname != "." 
&& fname != "..") + ASSERT_OK(options_.env->DeleteFile(dbname_ + "/" + fname)); + } + } + + void NewDB() { + ClearDir(); + ASSERT_OK(TitanDB::Open(options_, dbname_, &db_)); + tdb_ = reinterpret_cast(db_); + version_set_ = tdb_->vset_.get(); + mutex_ = &tdb_->mutex_; + base_db_ = reinterpret_cast(tdb_->GetRootDB()); + } + + void DestoyDB() { + db_->Close(); + } + + void RunGC() { + MutexLock l(mutex_); + Status s; + auto* cfh = base_db_->DefaultColumnFamily(); + + // Build BlobGC + TitanCFOptions cf_options; + cf_options.min_gc_batch_size = 0; + + std::unique_ptr blob_gc; + { + std::shared_ptr blob_gc_picker = + std::make_shared(cf_options); + blob_gc = blob_gc_picker->PickBlobGC( + version_set_->current()->GetBlobStorage(cfh->GetID()).lock().get()); + } + ASSERT_TRUE(blob_gc); + + BlobGCJob blob_gc_job( + blob_gc.get(), base_db_, cfh, mutex_, tdb_->db_options_, + cf_options, tdb_->env_, EnvOptions(), + tdb_->blob_manager_.get(), version_set_); + + s = blob_gc_job.Prepare(); + ASSERT_OK(s); + + { + mutex_->Unlock(); + s = blob_gc_job.Run(); + mutex_->Lock(); + } + ASSERT_OK(s); + + s = blob_gc_job.Finish(); + ASSERT_OK(s); + } + + Status NewIterator(uint64_t file_number, uint64_t file_size, + std::unique_ptr* iter) { + std::unique_ptr file; + Status s = NewBlobFileReader(file_number, 0, tdb_->db_options_, + tdb_->env_options_, tdb_->env_, &file); + // TODO memory leak here + if (!s.ok()) { + return s; + } + iter->reset(new BlobFileIterator(std::move(file), file_number, file_size, + TitanCFOptions())); + return Status::OK(); + } + + void TestDiscardEntry() { + NewDB(); + auto* cfh = base_db_->DefaultColumnFamily(); + BlobIndex blob_index; + blob_index.file_number = 0x81; + blob_index.blob_handle.offset = 0x98; + blob_index.blob_handle.size = 0x17; + std::string res; + blob_index.EncodeTo(&res); + std::string key = "test_discard_entry"; + WriteBatch wb; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&wb, cfh->GetID(), key, res)); + auto rewrite_status = 
base_db_->Write(WriteOptions(), &wb); + + BlobGCJob blob_gc_job(nullptr, base_db_, cfh, mutex_, TitanDBOptions(), + TitanCFOptions(), Env::Default(), EnvOptions(), + nullptr, version_set_); + ASSERT_FALSE(blob_gc_job.DiscardEntry(key, blob_index)); + DestoyDB(); + } + + void TestRunGC() { + NewDB(); + for (int i = 0; i < MAX_KEY_NUM; i++) { + std::string key = std::to_string(i); + std::string value(key.data(), 10240); + db_->Put(WriteOptions(), key, value); + } + FlushOptions flush_options; + flush_options.wait = true; + db_->Flush(flush_options); + std::string result; + ASSERT_OK(db_->Get(ReadOptions(), std::to_string(0), &result)); + ASSERT_OK(db_->Get(ReadOptions(), std::to_string(2), &result)); + for (int i = 0; i < MAX_KEY_NUM; i++) { + if (i % 2 != 0) continue; + std::string key = std::to_string(i); + db_->Delete(WriteOptions(), key); + } + db_->Flush(flush_options); + ASSERT_NOK(db_->Get(ReadOptions(), std::to_string(0), &result)); + ASSERT_NOK(db_->Get(ReadOptions(), std::to_string(2), &result)); + Version* v = nullptr; + { + MutexLock l(mutex_); + v = version_set_->current(); + } + ASSERT_TRUE(v != nullptr); + auto b = v->GetBlobStorage(base_db_->DefaultColumnFamily()->GetID()).lock(); + ASSERT_EQ(b->files_.size(), 1); + auto old = b->files_.begin()->first; + for (auto& f : b->files_) { + f.second->marked_for_sample = false; + } + std::unique_ptr iter; + ASSERT_OK(NewIterator(b->files_.begin()->second->file_number, + b->files_.begin()->second->file_size, &iter)); + iter->SeekToFirst(); + for (int i = 0; i < MAX_KEY_NUM; i++, iter->Next()) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + // std::string key = std::to_string(i); + // ASSERT_TRUE(iter->key().size() == key.size()); + // ASSERT_TRUE(iter->key().compare(Slice(key)) == 0); + // fprintf(stderr, "%s, ", iter->key().data()); + } + // fprintf(stderr, "\n\n"); + RunGC(); + { + MutexLock l(mutex_); + v = version_set_->current(); + } + b = 
v->GetBlobStorage(base_db_->DefaultColumnFamily()->GetID()).lock(); + ASSERT_EQ(b->files_.size(), 1); + auto new1 = b->files_.begin()->first; + ASSERT_TRUE(old != new1); + ASSERT_OK(NewIterator(b->files_.begin()->second->file_number, + b->files_.begin()->second->file_size, &iter)); + iter->SeekToFirst(); + for (int i = 0; i < MAX_KEY_NUM; i++) { + if (i % 2 == 0) continue; + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + // std::string key = std::to_string(i); + // ASSERT_TRUE(iter->key().size() == key.size()); + // ASSERT_TRUE(iter->key().compare(Slice(key)) == 0); + // fprintf(stderr, "%s, ", iter->key().data()); + ASSERT_OK(db_->Get(ReadOptions(), iter->key(), &result)); + ASSERT_TRUE(iter->value().size() == result.size()); + ASSERT_TRUE(iter->value().compare(result) == 0); + iter->Next(); + } + DestoyDB(); + } +}; + +TEST_F(BlobGCJobTest, DiscardEntry) { TestDiscardEntry(); } + +TEST_F(BlobGCJobTest, RunGC) { + TestRunGC(); +} + +} // namespace titandb +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/utilities/titandb/blob_gc_picker.cc b/utilities/titandb/blob_gc_picker.cc new file mode 100644 index 00000000000..e8b4632edf0 --- /dev/null +++ b/utilities/titandb/blob_gc_picker.cc @@ -0,0 +1,50 @@ +#include "utilities/titandb/blob_gc_picker.h" + +namespace rocksdb { +namespace titandb { + +BasicBlobGCPicker::BasicBlobGCPicker(TitanCFOptions titan_cf_options) + : titan_cf_options_(titan_cf_options) {} + +BasicBlobGCPicker::~BasicBlobGCPicker() {} + +std::unique_ptr BasicBlobGCPicker::PickBlobGC( + BlobStorage* blob_storage) { + Status s; + std::vector blob_files; + + uint64_t batch_size = 0; + for (auto& gc_score : blob_storage->gc_score()) { + auto blob_file = blob_storage->FindFile(gc_score.file_number); + assert(!blob_file.expired()); + + if (!CheckForPick(blob_file.lock().get(), gc_score)) continue; + MarkedForPick(blob_file.lock().get()); + 
blob_files.push_back(blob_file.lock().get()); + + batch_size += blob_file.lock()->file_size; + if (batch_size >= titan_cf_options_.max_gc_batch_size) break; + } + + if (blob_files.empty() || batch_size < titan_cf_options_.min_gc_batch_size) + return nullptr; + std::unique_ptr blob_gc(new BlobGC(std::move(blob_files))); + + return blob_gc; +} + +bool BasicBlobGCPicker::CheckForPick(BlobFileMeta* blob_file, + const GCScore& gc_score) const { + if (blob_file->being_gc.load(std::memory_order_acquire)) + return false; + if (gc_score.score >= titan_cf_options_.blob_file_discardable_ratio) + blob_file->marked_for_sample = false; + return true; +} + +void BasicBlobGCPicker::MarkedForPick(BlobFileMeta* blob_file) { + blob_file->being_gc = true; +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/blob_gc_picker.h b/utilities/titandb/blob_gc_picker.h new file mode 100644 index 00000000000..ced0c726cd2 --- /dev/null +++ b/utilities/titandb/blob_gc_picker.h @@ -0,0 +1,52 @@ +#ifndef ROCKSDB_BLOB_GC_PICKER_H +#define ROCKSDB_BLOB_GC_PICKER_H + +#include + +#include "db/column_family.h" +#include "db/write_callback.h" +#include "rocksdb/status.h" +#include "util/filename.h" +#include "utilities/titandb/blob_file_manager.h" +#include "utilities/titandb/blob_format.h" +#include "utilities/titandb/blob_gc.h" +#include "utilities/titandb/version.h" + +namespace rocksdb { +namespace titandb { + +class BlobGCPicker { + public: + BlobGCPicker(){}; + virtual ~BlobGCPicker(){}; + + // Pick candidate blob files for a new gc. + // Returns nullptr if there is no gc to be done. + // Otherwise returns a pointer to a heap-allocated object that + // describes the gc. Caller should delete the result. 
+ virtual std::unique_ptr PickBlobGC(BlobStorage* blob_storage) = 0; +}; + +class BasicBlobGCPicker final : public BlobGCPicker { + public: + BasicBlobGCPicker(TitanCFOptions); + ~BasicBlobGCPicker(); + + std::unique_ptr PickBlobGC(BlobStorage* blob_storage) override; + + private: + TitanCFOptions titan_cf_options_; + + // Check if blob_file needs to gc, return true means we need pick this + // file for gc + bool CheckForPick(BlobFileMeta* blob_file, + const GCScore& gc_score) const; + // Mark that blob_file is picked for gc + // REQUIRE: call CheckForPick first + void MarkedForPick(BlobFileMeta* blob_file); +}; + +} // namespace titandb +} // namespace rocksdb + +#endif // ROCKSDB_BLOB_GC_PICKER_H diff --git a/utilities/titandb/blob_gc_picker_test.cc b/utilities/titandb/blob_gc_picker_test.cc new file mode 100644 index 00000000000..c7de98ebe17 --- /dev/null +++ b/utilities/titandb/blob_gc_picker_test.cc @@ -0,0 +1,76 @@ +#include "utilities/titandb/blob_gc_picker.h" + +#include "util/filename.h" +#include "util/testharness.h" +#include "utilities/titandb/blob_file_builder.h" +#include "utilities/titandb/blob_file_cache.h" +#include "utilities/titandb/blob_file_iterator.h" +#include "utilities/titandb/blob_file_reader.h" +#include "utilities/titandb/version.h" + +namespace rocksdb { +namespace titandb { + +class BlobGCPickerTest : public testing::Test { + public: + std::unique_ptr blob_storage_; + std::unique_ptr basic_blob_gc_picker_; + + BlobGCPickerTest() {} + ~BlobGCPickerTest() {} + + void NewBlobStorageAndPicker(const TitanDBOptions& titan_db_options, + const TitanCFOptions& titan_cf_options) { + auto blob_file_cache = std::make_shared( + titan_db_options, titan_cf_options, NewLRUCache(128)); + blob_storage_.reset(new BlobStorage(titan_cf_options, blob_file_cache)); + basic_blob_gc_picker_.reset(new BasicBlobGCPicker(titan_cf_options)); + } + + void AddBlobFile(uint64_t file_number, uint64_t file_size, + uint64_t discardable_size, bool being_gc = false) { + 
blob_storage_->files_[file_number] = std::make_shared( + file_number, file_size, discardable_size, being_gc); + } + + void UpdateBlobStorage() { blob_storage_->ComputeGCScore(); } +}; + +TEST_F(BlobGCPickerTest, Basic) { + TitanDBOptions titan_db_options; + TitanCFOptions titan_cf_options; + titan_cf_options.min_gc_batch_size = 0; + NewBlobStorageAndPicker(titan_db_options, titan_cf_options); + AddBlobFile(1U, 1U, 0U); + UpdateBlobStorage(); + auto blob_gc = basic_blob_gc_picker_->PickBlobGC(blob_storage_.get()); + ASSERT_TRUE(blob_gc != nullptr); + ASSERT_EQ(blob_gc->candidate_files().size(), 1); + ASSERT_EQ(blob_gc->candidate_files()[0]->file_number, 1U); +} + +TEST_F(BlobGCPickerTest, BeingGC) { + TitanDBOptions titan_db_options; + TitanCFOptions titan_cf_options; + titan_cf_options.min_gc_batch_size = 0; + NewBlobStorageAndPicker(titan_db_options, titan_cf_options); + AddBlobFile(1U, 1U, 0U, true); + UpdateBlobStorage(); + auto blob_gc = basic_blob_gc_picker_->PickBlobGC(blob_storage_.get()); + ASSERT_EQ(nullptr, blob_gc); + NewBlobStorageAndPicker(titan_db_options, titan_cf_options); + AddBlobFile(1U, 1U, 0U, true); + AddBlobFile(2U, 1U, 0U); + UpdateBlobStorage(); + blob_gc = basic_blob_gc_picker_->PickBlobGC(blob_storage_.get()); + ASSERT_EQ(blob_gc->candidate_files().size(), 1); + ASSERT_EQ(blob_gc->candidate_files()[0]->file_number, 2U); +} + +} // namespace titandb +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/utilities/titandb/db.cc b/utilities/titandb/db.cc new file mode 100644 index 00000000000..b860490dd64 --- /dev/null +++ b/utilities/titandb/db.cc @@ -0,0 +1,40 @@ +#include "utilities/titandb/db.h" + +#include "utilities/titandb/db_impl.h" + +namespace rocksdb { +namespace titandb { + +Status TitanDB::Open(const TitanOptions& options, const std::string& dbname, + TitanDB** db) { + TitanDBOptions db_options(options); + TitanCFOptions 
cf_options(options); + std::vector descs; + descs.emplace_back(kDefaultColumnFamilyName, cf_options); + std::vector handles; + Status s = TitanDB::Open(db_options, dbname, descs, &handles, db); + if (s.ok()) { + assert(handles.size() == 1); + // DBImpl is always holding the default handle. + delete handles[0]; + } + return s; +} + +Status TitanDB::Open(const TitanDBOptions& db_options, + const std::string& dbname, + const std::vector& descs, + std::vector* handles, TitanDB** db) { + auto impl = new TitanDBImpl(db_options, dbname); + auto s = impl->Open(descs, handles); + if (s.ok()) { + *db = impl; + } else { + *db = nullptr; + delete impl; + } + return s; +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/db.h b/utilities/titandb/db.h new file mode 100644 index 00000000000..0942b5b9fcc --- /dev/null +++ b/utilities/titandb/db.h @@ -0,0 +1,61 @@ +#pragma once + +#include "rocksdb/utilities/stackable_db.h" +#include "utilities/titandb/options.h" + +namespace rocksdb { +namespace titandb { + +struct TitanCFDescriptor { + std::string name; + TitanCFOptions options; + TitanCFDescriptor() + : name(kDefaultColumnFamilyName), options(TitanCFOptions()) {} + TitanCFDescriptor(const std::string& _name, const TitanCFOptions& _options) + : name(_name), options(_options) {} +}; + +class TitanDB : public StackableDB { + public: + static Status Open(const TitanOptions& options, const std::string& dbname, + TitanDB** db); + + static Status Open(const TitanDBOptions& db_options, + const std::string& dbname, + const std::vector& descs, + std::vector* handles, TitanDB** db); + + TitanDB() : StackableDB(nullptr) {} + + using StackableDB::CreateColumnFamily; + Status CreateColumnFamily( + const TitanCFDescriptor& desc, ColumnFamilyHandle** handle) { + std::vector handles; + Status s = CreateColumnFamilies({desc}, &handles); + if (s.ok()) { + *handle = handles[0]; + } + return s; + } + + using StackableDB::CreateColumnFamilies; + virtual Status 
CreateColumnFamilies( + const std::vector& descs, + std::vector* handles) = 0; + + Status DropColumnFamily(ColumnFamilyHandle* handle) override { + return DropColumnFamilies({handle}); + } + + virtual Status DropColumnFamilies( + const std::vector& handles) = 0; + + using StackableDB::Merge; + Status Merge(const WriteOptions&, ColumnFamilyHandle*, const Slice& /*key*/, + const Slice& /*value*/) override { + return Status::NotSupported("TitanDB doesn't support this operation"); + } +}; + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/db_impl.cc b/utilities/titandb/db_impl.cc new file mode 100644 index 00000000000..643ee4201b1 --- /dev/null +++ b/utilities/titandb/db_impl.cc @@ -0,0 +1,372 @@ +#include "utilities/titandb/db_impl.h" + +#include "utilities/titandb/blob_file_builder.h" +#include "utilities/titandb/blob_file_iterator.h" +#include "utilities/titandb/blob_file_size_collector.h" +#include "utilities/titandb/blob_gc.h" +#include "utilities/titandb/db_iter.h" +#include "utilities/titandb/table_factory.h" + +namespace rocksdb { +namespace titandb { + +class TitanDBImpl::FileManager : public BlobFileManager { + public: + FileManager(TitanDBImpl* db) : db_(db) {} + + Status NewFile(std::unique_ptr* handle) override { + auto number = db_->vset_->NewFileNumber(); + auto name = BlobFileName(db_->dirname_, number); + + Status s; + std::unique_ptr file; + { + std::unique_ptr f; + s = db_->env_->NewWritableFile(name, &f, db_->env_options_); + if (!s.ok()) return s; + file.reset(new WritableFileWriter(std::move(f), db_->env_options_)); + } + + handle->reset(new FileHandle(number, name, std::move(file))); + { + MutexLock l(&db_->mutex_); + db_->pending_outputs_.insert(number); + } + return s; + } + + Status BatchFinishFiles( + uint32_t cf_id, + const std::vector, + std::unique_ptr>>& files) + override { + Status s; + VersionEdit edit; + edit.SetColumnFamilyID(cf_id); + for (auto& file : files) { + s = file.second->GetFile()->Sync(false); 
+ if (s.ok()) { + s = file.second->GetFile()->Close(); + } + if (!s.ok()) return s; + + edit.AddBlobFile(file.first); + } + + { + MutexLock l(&db_->mutex_); + s = db_->vset_->LogAndApply(&edit, &db_->mutex_); + for (const auto& file : files) + db_->pending_outputs_.erase(file.second->GetNumber()); + } + return s; + } + + Status BatchDeleteFiles( + const std::vector>& handles) override { + Status s; + for (auto& handle : handles) s = db_->env_->DeleteFile(handle->GetName()); + { + MutexLock l(&db_->mutex_); + for (const auto& handle : handles) + db_->pending_outputs_.erase(handle->GetNumber()); + } + return s; + } + + private: + class FileHandle : public BlobFileHandle { + public: + FileHandle(uint64_t number, const std::string& name, + std::unique_ptr file) + : number_(number), name_(name), file_(std::move(file)) {} + + uint64_t GetNumber() const override { return number_; } + + const std::string& GetName() const override { return name_; } + + WritableFileWriter* GetFile() const override { return file_.get(); } + + private: + uint64_t number_; + std::string name_; + std::unique_ptr file_; + }; + + TitanDBImpl* db_; +}; + +TitanDBImpl::TitanDBImpl(const TitanDBOptions& options, + const std::string& dbname) + : TitanDB(), + mutex_(), + bg_cv_(&mutex_), + dbname_(dbname), + env_(options.env), + env_options_(options), + db_options_(options) { + if (db_options_.dirname.empty()) { + db_options_.dirname = dbname_ + "/titandb"; + } + dirname_ = db_options_.dirname; + vset_.reset(new VersionSet(db_options_)); + blob_manager_.reset(new FileManager(this)); +} + +TitanDBImpl::~TitanDBImpl() { Close(); } + +Status TitanDBImpl::Open(const std::vector& descs, + std::vector* handles) { + // Sets up directories for base DB and TitanDB. 
+ Status s = env_->CreateDirIfMissing(dbname_); + if (!s.ok()) return s; + if (!db_options_.info_log) { + s = CreateLoggerFromOptions(dbname_, db_options_, &db_options_.info_log); + if (!s.ok()) return s; + } + s = env_->CreateDirIfMissing(dirname_); + if (!s.ok()) return s; + s = env_->LockFile(LockFileName(dirname_), &lock_); + if (!s.ok()) return s; + + std::vector base_descs; + for (auto& desc : descs) { + base_descs.emplace_back(desc.name, desc.options); + } + std::map column_families; + + // Opens the base DB first to collect the column families information. + // Avoid flush here because we haven't replaced the table factory yet. + db_options_.avoid_flush_during_recovery = true; + s = DB::Open(db_options_, dbname_, base_descs, handles, &db_); + if (s.ok()) { + for (size_t i = 0; i < descs.size(); i++) { + auto handle = (*handles)[i]; + column_families.emplace(handle->GetID(), descs[i].options); + db_->DestroyColumnFamilyHandle(handle); + // Replaces the provided table factory with TitanTableFactory. 
+ base_descs[i].options.table_factory.reset( + new TitanTableFactory(descs[i].options, blob_manager_)); + + // Add TableProperties for collecting statistics GC + base_descs[i].options.table_properties_collector_factories.emplace_back( + std::make_shared()); + } + handles->clear(); + s = db_->Close(); + delete db_; + } + if (!s.ok()) return s; + + s = vset_->Open(column_families); + if (!s.ok()) return s; + + // Add EventListener to collect statistics for GC + db_options_.listeners.emplace_back( + std::make_shared(this, &this->mutex_, + this->vset_.get())); + + s = DB::Open(db_options_, dbname_, base_descs, handles, &db_); + if (s.ok()) { + db_impl_ = reinterpret_cast(db_->GetRootDB()); + } + return s; +} + +Status TitanDBImpl::Close() { + Status s; + CloseImpl(); + if (db_) { + s = db_->Close(); + delete db_; + db_ = nullptr; + db_impl_ = nullptr; + } + if (lock_) { + env_->UnlockFile(lock_); + lock_ = nullptr; + } + return s; +} + +Status TitanDBImpl::CloseImpl() { + int gc_unscheduled = env_->UnSchedule(this, Env::Priority::LOW); + { + MutexLock l(&mutex_); + bg_gc_scheduled_ -= gc_unscheduled; + while (bg_gc_scheduled_ > 0) { + bg_cv_.Wait(); + } + } + + return Status::OK(); +} + +Status TitanDBImpl::CreateColumnFamilies( + const std::vector& descs, + std::vector* handles) { + std::vector base_descs; + for (auto& desc : descs) { + ColumnFamilyOptions options = desc.options; + // Replaces the provided table factory with TitanTableFactory. 
+ options.table_factory.reset( + new TitanTableFactory(desc.options, blob_manager_)); + base_descs.emplace_back(desc.name, options); + } + Status s = db_impl_->CreateColumnFamilies(base_descs, handles); + if (s.ok()) { + std::map column_families; + for (size_t i = 0; i < descs.size(); i++) { + column_families.emplace((*handles)[i]->GetID(), descs[i].options); + } + MutexLock l(&mutex_); + vset_->AddColumnFamilies(column_families); + } + return s; +} + +Status TitanDBImpl::DropColumnFamilies( + const std::vector& handles) { + std::vector column_families; + for (auto& handle : handles) { + column_families.push_back(handle->GetID()); + } + Status s = db_impl_->DropColumnFamilies(handles); + if (s.ok()) { + MutexLock l(&mutex_); + vset_->DropColumnFamilies(column_families); + } + return s; +} + +Status TitanDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle* handle, + const Slice& key, PinnableSlice* value) { + if (options.snapshot) { + return GetImpl(options, handle, key, value); + } + ReadOptions ro(options); + ManagedSnapshot snapshot(this); + ro.snapshot = snapshot.snapshot(); + return GetImpl(ro, handle, key, value); +} + +Status TitanDBImpl::GetImpl(const ReadOptions& options, + ColumnFamilyHandle* handle, const Slice& key, + PinnableSlice* value) { + auto snap = reinterpret_cast(options.snapshot); + auto storage = snap->current()->GetBlobStorage(handle->GetID()).lock(); + + Status s; + bool is_blob_index = false; + s = db_impl_->GetImpl(options, handle, key, value, nullptr /*value_found*/, + nullptr /*read_callback*/, &is_blob_index); + if (!s.ok() || !is_blob_index) return s; + + BlobIndex index; + s = index.DecodeFrom(value); + if (!s.ok()) return s; + + BlobRecord record; + PinnableSlice buffer; + s = storage->Get(options, index, &record, &buffer); + if (s.ok()) { + value->Reset(); + value->PinSelf(record.value); + } + return s; +} + +std::vector TitanDBImpl::MultiGet( + const ReadOptions& options, const std::vector& handles, + const std::vector& 
keys, std::vector* values) { + if (options.snapshot) { + return MultiGetImpl(options, handles, keys, values); + } + ReadOptions ro(options); + ManagedSnapshot snapshot(this); + ro.snapshot = snapshot.snapshot(); + return MultiGetImpl(ro, handles, keys, values); +} + +std::vector TitanDBImpl::MultiGetImpl( + const ReadOptions& options, const std::vector& handles, + const std::vector& keys, std::vector* values) { + std::vector res; + res.resize(keys.size()); + values->resize(keys.size()); + for (size_t i = 0; i < keys.size(); i++) { + auto value = &(*values)[i]; + PinnableSlice pinnable_value(value); + res[i] = GetImpl(options, handles[i], keys[i], &pinnable_value); + if (res[i].ok() && pinnable_value.IsPinned()) { + value->assign(pinnable_value.data(), pinnable_value.size()); + } + } + return res; +} + +Iterator* TitanDBImpl::NewIterator(const ReadOptions& options, + ColumnFamilyHandle* handle) { + std::shared_ptr snapshot; + if (options.snapshot) { + return NewIteratorImpl(options, handle, snapshot); + } + ReadOptions ro(options); + snapshot.reset(new ManagedSnapshot(this)); + ro.snapshot = snapshot->snapshot(); + return NewIteratorImpl(ro, handle, snapshot); +} + +Iterator* TitanDBImpl::NewIteratorImpl( + const ReadOptions& options, ColumnFamilyHandle* handle, + std::shared_ptr snapshot) { + auto cfd = reinterpret_cast(handle)->cfd(); + auto snap = reinterpret_cast(options.snapshot); + auto storage = snap->current()->GetBlobStorage(handle->GetID()); + std::unique_ptr iter(db_impl_->NewIteratorImpl( + options, cfd, snap->GetSequenceNumber(), nullptr /*read_callback*/, + true /*allow_blob*/)); + return new TitanDBIterator(options, storage.lock().get(), snapshot, std::move(iter)); +} + +Status TitanDBImpl::NewIterators( + const ReadOptions& options, const std::vector& handles, + std::vector* iterators) { + ReadOptions ro(options); + std::shared_ptr snapshot; + if (!ro.snapshot) { + snapshot.reset(new ManagedSnapshot(this)); + ro.snapshot = snapshot->snapshot(); + } + 
iterators->clear(); + iterators->reserve(handles.size()); + for (auto& handle : handles) { + iterators->emplace_back(NewIteratorImpl(ro, handle, snapshot)); + } + return Status::OK(); +} + +const Snapshot* TitanDBImpl::GetSnapshot() { + Version* current; + const Snapshot* snapshot; + { + MutexLock l(&mutex_); + current = vset_->current(); + current->Ref(); + snapshot = db_->GetSnapshot(); + } + return new TitanSnapshot(current, snapshot); +} + +void TitanDBImpl::ReleaseSnapshot(const Snapshot* snapshot) { + auto s = reinterpret_cast(snapshot); + { + MutexLock l(&mutex_); + s->current()->Unref(); + db_->ReleaseSnapshot(s->snapshot()); + } + delete s; +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/db_impl.h b/utilities/titandb/db_impl.h new file mode 100644 index 00000000000..be678ec874d --- /dev/null +++ b/utilities/titandb/db_impl.h @@ -0,0 +1,127 @@ +#pragma once + +#include "db/db_impl.h" +#include "utilities/titandb/blob_file_manager.h" +#include "utilities/titandb/db.h" +#include "utilities/titandb/version_set.h" + +namespace rocksdb { +namespace titandb { + +class TitanDBImpl : public TitanDB { + public: + TitanDBImpl(const TitanDBOptions& options, const std::string& dbname); + + ~TitanDBImpl(); + + Status Open(const std::vector& descs, + std::vector* handles); + + Status Close() override; + + using TitanDB::CreateColumnFamilies; + Status CreateColumnFamilies( + const std::vector& descs, + std::vector* handles) override; + + Status DropColumnFamilies( + const std::vector& handles) override; + + Status CloseImpl(); + + using TitanDB::Get; + Status Get(const ReadOptions& options, ColumnFamilyHandle* handle, + const Slice& key, PinnableSlice* value) override; + + using TitanDB::MultiGet; + std::vector MultiGet(const ReadOptions& options, + const std::vector& handles, + const std::vector& keys, + std::vector* values) override; + + using TitanDB::NewIterator; + Iterator* NewIterator(const ReadOptions& options, + 
ColumnFamilyHandle* handle) override; + + Status NewIterators(const ReadOptions& options, + const std::vector& handles, + std::vector* iterators) override; + + const Snapshot* GetSnapshot() override; + + void ReleaseSnapshot(const Snapshot* snapshot) override; + + private: + class FileManager; + friend class FileManager; + friend class BlobGCJobTest; + friend class BlobDiscardableSizeListener; + + Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* handle, + const Slice& key, PinnableSlice* value); + + std::vector MultiGetImpl( + const ReadOptions& options, + const std::vector& handles, + const std::vector& keys, std::vector* values); + + Iterator* NewIteratorImpl(const ReadOptions& options, + ColumnFamilyHandle* handle, + std::shared_ptr snapshot); + + // REQUIRE: mutex_ held + void AddToGCQueue(uint32_t column_family_id) { + if (pending_gc_.find(column_family_id) != pending_gc_.end()) return; + gc_queue_.push_back(column_family_id); + pending_gc_.insert(column_family_id); + } + + // REQUIRE: gc_queue_ not empty + // REQUIRE: mutex_ held + uint32_t PopFirstFromGCQueue() { + assert(!gc_queue_.empty()); + assert(!pending_gc_.empty()); + auto column_family_id = *gc_queue_.begin(); + gc_queue_.pop_front(); + assert(pending_gc_.count(column_family_id) != 0); + pending_gc_.erase(column_family_id); + return column_family_id; + } + + // REQUIRE: mutex_ held + void MaybeScheduleGC(); + + static void BGWorkGC(void* db); + void BackgroundCallGC(); + Status BackgroundGC(); + + // REQUIRES: mutex_ held; + void PurgeObsoleteFiles(); + + FileLock* lock_{nullptr}; + port::Mutex mutex_; + // This condition variable is signaled on these conditions: + // * whenever bg_gc_scheduled_ goes down to 0 + port::CondVar bg_cv_; + + std::string dbname_; + std::string dirname_; + Env* env_; + EnvOptions env_options_; + DBImpl* db_impl_; + TitanDBOptions db_options_; + + std::unique_ptr vset_; + std::set pending_outputs_; + std::shared_ptr blob_manager_; + + // gc_queue_ hold column 
families that we need to gc. + // pending_gc_ hold column families that already on gc_queue_. + std::deque gc_queue_; + std::set pending_gc_; + + int bg_gc_scheduled_{0}; +}; + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/db_impl_files.cc b/utilities/titandb/db_impl_files.cc new file mode 100644 index 00000000000..1a2ed14cab3 --- /dev/null +++ b/utilities/titandb/db_impl_files.cc @@ -0,0 +1,38 @@ +#include "utilities/titandb/db_impl.h" + +namespace rocksdb { +namespace titandb { + +void TitanDBImpl::PurgeObsoleteFiles() { + Status s; + ObsoleteFiles obsolete_files; + vset_->GetObsoleteFiles(&obsolete_files); + + { + mutex_.Unlock(); + std::vector candidate_files; + for (auto& blob_file : obsolete_files.blob_files) { + candidate_files.emplace_back( + BlobFileName(db_options_.dirname, blob_file)); + } + for (auto& manifest : obsolete_files.manifests) { + candidate_files.emplace_back(std::move(manifest)); + } + + // dedup state.candidate_files so we don't try to delete the same + // file twice + std::sort(candidate_files.begin(), candidate_files.end()); + candidate_files.erase( + std::unique(candidate_files.begin(), candidate_files.end()), + candidate_files.end()); + + for (const auto& candidate_file : candidate_files) { + s = env_->DeleteFile(candidate_file); + assert(s.ok()); + } + mutex_.Lock(); + } +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/db_impl_gc.cc b/utilities/titandb/db_impl_gc.cc new file mode 100644 index 00000000000..df68c7d196b --- /dev/null +++ b/utilities/titandb/db_impl_gc.cc @@ -0,0 +1,84 @@ +#include "utilities/titandb/db_impl.h" + +#include "utilities/titandb/blob_file_builder.h" +#include "utilities/titandb/blob_file_iterator.h" +#include "utilities/titandb/blob_file_size_collector.h" +#include "utilities/titandb/blob_gc.h" +#include "utilities/titandb/blob_gc_job.h" +#include "utilities/titandb/blob_gc_picker.h" +#include "utilities/titandb/db_iter.h" +#include 
"utilities/titandb/table_factory.h" + +namespace rocksdb { +namespace titandb { + +void TitanDBImpl::MaybeScheduleGC() { + if (db_options_.disable_background_gc) return; + bg_gc_scheduled_++; + env_->Schedule(&TitanDBImpl::BGWorkGC, this, Env::Priority::LOW, this); +} + +void TitanDBImpl::BGWorkGC(void* db) { + reinterpret_cast(db)->BackgroundCallGC(); +} + +void TitanDBImpl::BackgroundCallGC() { + // Is this legal? call bg_cv_.SignalAll() maybe + MutexLock l(&mutex_); + assert(bg_gc_scheduled_ > 0); + BackgroundGC(); + + PurgeObsoleteFiles(); + + bg_gc_scheduled_--; + if (bg_gc_scheduled_ == 0) { + // signal if + // * bg_gc_scheduled_ == 0 -- need to wakeup ~TitanDBImpl + // If none of this is true, there is no need to signal since nobody is + // waiting for it + bg_cv_.SignalAll(); + } + // IMPORTANT: there should be no code after calling SignalAll. This call may + // signal the DB destructor that it's OK to proceed with destruction. In + // that case, all DB variables will be dealloacated and referencing them + // will cause trouble. 
+} + +Status TitanDBImpl::BackgroundGC() { + Status s; + if (!gc_queue_.empty()) { + uint32_t column_family_id = PopFirstFromGCQueue(); + auto* cfh = db_impl_->GetColumnFamilyHandleUnlocked(column_family_id); + assert(cfh != nullptr); + + std::unique_ptr blob_gc; + auto current = vset_->current(); + auto bs = current->GetBlobStorage(column_family_id).lock().get(); + const auto& titan_cf_options = bs->titan_cf_options(); + std::shared_ptr blob_gc_picker = + std::make_shared(titan_cf_options); + blob_gc = blob_gc_picker->PickBlobGC(bs); + if (!blob_gc) return Status::Corruption("Build BlobGC failed"); + + BlobGCJob blob_gc_job(blob_gc.get(), db_, cfh, &mutex_, db_options_, + titan_cf_options, env_, + env_options_, blob_manager_.get(), vset_.get()); + s = blob_gc_job.Prepare(); + if (!s.ok()) return s; + + { + mutex_.Unlock(); + s = blob_gc_job.Run(); + mutex_.Lock(); + } + if (!s.ok()) return s; + + s = blob_gc_job.Finish(); + if (!s.ok()) return s; + } + + return Status::OK(); +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/db_iter.h b/utilities/titandb/db_iter.h new file mode 100644 index 00000000000..de6a3e4aeb5 --- /dev/null +++ b/utilities/titandb/db_iter.h @@ -0,0 +1,119 @@ +#pragma once + +#include "db/db_iter.h" +#include "utilities/titandb/version.h" + +namespace rocksdb { +namespace titandb { + +// Wraps the current version together with the snapshot from base DB +// so that we can safely recycle a steal version when it is dropped. +// This also implies a guarantee that the current version must contain +// all the data accessible from base DB. 
+class TitanSnapshot : public Snapshot { + public: + TitanSnapshot(Version* _current, const Snapshot* _snapshot) + : current_(_current), snapshot_(_snapshot) {} + + Version* current() const { return current_; } + + const Snapshot* snapshot() const { return snapshot_; } + + SequenceNumber GetSequenceNumber() const override { + return snapshot_->GetSequenceNumber(); + } + + private: + Version* current_; + const Snapshot* snapshot_; +}; + +class TitanDBIterator : public Iterator { + public: + TitanDBIterator(const ReadOptions& options, + BlobStorage* storage, + std::shared_ptr snap, + std::unique_ptr iter) + : options_(options), + storage_(storage), + snap_(snap), + iter_(std::move(iter)) {} + + bool Valid() const override { return iter_->Valid() && status_.ok(); } + + Status status() const override { + Status s = iter_->status(); + if (s.ok()) s = status_; + return s; + } + + void SeekToFirst() override { + iter_->SeekToFirst(); + GetBlobValue(); + } + + void SeekToLast() override { + iter_->SeekToLast(); + GetBlobValue(); + } + + void Seek(const Slice& target) override { + iter_->Seek(target); + GetBlobValue(); + } + + void SeekForPrev(const Slice& target) override { + iter_->SeekForPrev(target); + GetBlobValue(); + } + + void Next() override { + iter_->Next(); + GetBlobValue(); + } + + void Prev() override { + iter_->Prev(); + GetBlobValue(); + } + + Slice key() const override { return iter_->key(); } + + Slice value() const override { + if (!iter_->IsBlob()) return iter_->value(); + return record_.value; + } + + private: + void GetBlobValue() { + if (!iter_->Valid() || !iter_->IsBlob()) return; + + BlobIndex index; + status_ = DecodeInto(iter_->value(), &index); + if (!status_.ok()) return; + + auto it = files_.find(index.file_number); + if (it == files_.end()) { + std::unique_ptr prefetcher; + status_ = storage_->NewPrefetcher(index.file_number, &prefetcher); + if (!status_.ok()) return; + it = files_.emplace(index.file_number, std::move(prefetcher)).first; + } + 
+ buffer_.Reset(); + status_ = it->second->Get(options_, index.blob_handle, &record_, &buffer_); + } + + Status status_; + BlobRecord record_; + PinnableSlice buffer_; + + ReadOptions options_; + BlobStorage* storage_; + std::shared_ptr snap_; + std::unique_ptr iter_; + std::map> files_; +}; + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/options.cc b/utilities/titandb/options.cc new file mode 100644 index 00000000000..001bb0dff8f --- /dev/null +++ b/utilities/titandb/options.cc @@ -0,0 +1,26 @@ +#include "utilities/titandb/options.h" + +#include + +#include "rocksdb/convenience.h" + +namespace rocksdb { +namespace titandb { + +std::string TitanCFOptions::ToString() const { + char buf[256]; + std::string str; + std::string res = "[titandb]\n"; + snprintf(buf, sizeof(buf), "min_blob_size = %" PRIu64 "\n", min_blob_size); + res += buf; + GetStringFromCompressionType(&str, blob_file_compression); + snprintf(buf, sizeof(buf), "blob_file_compression = %s\n", str.c_str()); + res += buf; + snprintf(buf, sizeof(buf), "blob_file_target_size = %" PRIu64 "\n", + blob_file_target_size); + res += buf; + return res; +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/options.h b/utilities/titandb/options.h new file mode 100644 index 00000000000..9e56631b178 --- /dev/null +++ b/utilities/titandb/options.h @@ -0,0 +1,81 @@ +#pragma once + +#include "rocksdb/options.h" + +namespace rocksdb { +namespace titandb { + +struct TitanDBOptions : public DBOptions { + // The directory to store data specific to TitanDB alongside with + // the base DB. + // + // Default: {dbname}/titandb + std::string dirname; + + // Disable background GC + // + // Default: true + bool disable_background_gc{false}; + + TitanDBOptions() = default; + explicit TitanDBOptions(const DBOptions& options) : DBOptions(options) {} +}; + +struct TitanCFOptions : public ColumnFamilyOptions { + // The smallest value to store in blob files. 
Value smaller than + // this threshold will be inlined in base DB. + // + // Default: 4096 + uint64_t min_blob_size{4096}; + + // The compression algorithm used to compress data in blob files. + // + // Default: kNoCompression + CompressionType blob_file_compression{kNoCompression}; + + // The desirable blob file size. This is not a hard limit but a wish. + // + // Default: 256MB + uint64_t blob_file_target_size{256 << 20}; + + // If non-NULL use the specified cache for blob records. + // + // Default: nullptr + std::shared_ptr blob_cache; + + // Max batch size for gc + // + // Default: 1GB + uint64_t max_gc_batch_size{1 << 30}; + + // Min batch size for gc + // + // Default: 512MB + uint64_t min_gc_batch_size{512 << 20}; + + // The ratio of how much discardable size of a blob file can be GC + // + // Default: 0.5 + float blob_file_discardable_ratio{0.5}; + + // The ratio of how much size of a blob file need to be sample before GC + // + // Default: 0.1 + float sample_flie_size_ratio{0.1}; + + // The blob file size less than this option will be mark gc + // + // Default: 8MB + uint64_t merge_small_file_threashold{8 << 20}; + + TitanCFOptions() = default; + explicit TitanCFOptions(const ColumnFamilyOptions& options) + : ColumnFamilyOptions(options) {} + + std::string ToString() const; +}; + +struct TitanOptions : public TitanDBOptions, public TitanCFOptions {}; + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/table_builder.cc b/utilities/titandb/table_builder.cc new file mode 100644 index 00000000000..a00b5d2d867 --- /dev/null +++ b/utilities/titandb/table_builder.cc @@ -0,0 +1,104 @@ +#include "utilities/titandb/table_builder.h" + +namespace rocksdb { +namespace titandb { + +void TitanTableBuilder::Add(const Slice& key, const Slice& value) { + if (!ok()) return; + + ParsedInternalKey ikey; + if (!ParseInternalKey(key, &ikey)) { + status_ = Status::Corruption(Slice()); + return; + } + + if (ikey.type != kTypeValue || value.size() < 
options_.min_blob_size) { + base_builder_->Add(key, value); + return; + } + + std::string index_value; + AddBlob(ikey.user_key, value, &index_value); + if (!ok()) return; + + ikey.type = kTypeBlobIndex; + std::string index_key; + AppendInternalKey(&index_key, ikey); + base_builder_->Add(index_key, index_value); +} + +void TitanTableBuilder::AddBlob(const Slice& key, const Slice& value, + std::string* index_value) { + if (!ok()) return; + + if (!blob_builder_) { + status_ = blob_manager_->NewFile(&blob_handle_); + if (!ok()) return; + blob_builder_.reset(new BlobFileBuilder(options_, blob_handle_->GetFile())); + } + + BlobIndex index; + BlobRecord record; + record.key = key; + record.value = value; + index.file_number = blob_handle_->GetNumber(); + blob_builder_->Add(record, &index.blob_handle); + if (ok()) { + index.EncodeTo(index_value); + } +} + +Status TitanTableBuilder::status() const { + Status s = status_; + if (s.ok()) { + s = base_builder_->status(); + } + if (s.ok() && blob_builder_) { + s = blob_builder_->status(); + } + return s; +} + +Status TitanTableBuilder::Finish() { + base_builder_->Finish(); + if (blob_builder_) { + blob_builder_->Finish(); + if (ok()) { + std::shared_ptr file = std::make_shared(); + file->file_number = blob_handle_->GetNumber(); + file->file_size = blob_handle_->GetFile()->GetFileSize(); + status_ = + blob_manager_->FinishFile(cf_id_, file, std::move(blob_handle_)); + } else { + status_ = blob_manager_->DeleteFile(std::move(blob_handle_)); + } + } + return status(); +} + +void TitanTableBuilder::Abandon() { + base_builder_->Abandon(); + if (blob_builder_) { + blob_builder_->Abandon(); + status_ = blob_manager_->DeleteFile(std::move(blob_handle_)); + } +} + +uint64_t TitanTableBuilder::NumEntries() const { + return base_builder_->NumEntries(); +} + +uint64_t TitanTableBuilder::FileSize() const { + return base_builder_->FileSize(); +} + +bool TitanTableBuilder::NeedCompact() const { + return base_builder_->NeedCompact(); +} + 
+TableProperties TitanTableBuilder::GetTableProperties() const { + return base_builder_->GetTableProperties(); +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/table_builder.h b/utilities/titandb/table_builder.h new file mode 100644 index 00000000000..296648765fa --- /dev/null +++ b/utilities/titandb/table_builder.h @@ -0,0 +1,52 @@ +#pragma once + +#include "table/table_builder.h" +#include "utilities/titandb/blob_file_builder.h" +#include "utilities/titandb/blob_file_manager.h" +#include "utilities/titandb/options.h" + +namespace rocksdb { +namespace titandb { + +class TitanTableBuilder : public TableBuilder { + public: + TitanTableBuilder(uint32_t cf_id, const TitanCFOptions& options, + std::unique_ptr base_builder, + std::shared_ptr blob_manager) + : cf_id_(cf_id), + options_(options), + base_builder_(std::move(base_builder)), + blob_manager_(blob_manager) {} + + void Add(const Slice& key, const Slice& value) override; + + Status status() const override; + + Status Finish() override; + + void Abandon() override; + + uint64_t NumEntries() const override; + + uint64_t FileSize() const override; + + bool NeedCompact() const override; + + TableProperties GetTableProperties() const override; + + private: + bool ok() const { return status().ok(); } + + void AddBlob(const Slice& key, const Slice& value, std::string* index_value); + + Status status_; + uint32_t cf_id_; + TitanCFOptions options_; + std::unique_ptr base_builder_; + std::unique_ptr blob_handle_; + std::shared_ptr blob_manager_; + std::unique_ptr blob_builder_; +}; + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/table_builder_test.cc b/utilities/titandb/table_builder_test.cc new file mode 100644 index 00000000000..14172988185 --- /dev/null +++ b/utilities/titandb/table_builder_test.cc @@ -0,0 +1,290 @@ +#include "table/table_builder.h" +#include "table/table_reader.h" +#include "util/filename.h" +#include "util/testharness.h" +#include 
"utilities/titandb/blob_file_manager.h" +#include "utilities/titandb/blob_file_reader.h" +#include "utilities/titandb/table_factory.h" + +namespace rocksdb { +namespace titandb { + +const uint64_t kMinBlobSize = 128; +const uint64_t kTestFileNumber = 123; + +class FileManager : public BlobFileManager { + public: + FileManager(const TitanDBOptions& db_options) : db_options_(db_options) {} + + Status NewFile(std::unique_ptr* handle) override { + auto number = kTestFileNumber; + auto name = BlobFileName(db_options_.dirname, number); + std::unique_ptr file; + { + std::unique_ptr f; + Status s = env_->NewWritableFile(name, &f, env_options_); + if (!s.ok()) return s; + file.reset(new WritableFileWriter(std::move(f), env_options_)); + } + handle->reset(new FileHandle(number, name, std::move(file))); + return Status::OK(); + } + + Status FinishFile(uint32_t /*cf_id*/, std::shared_ptr /*file*/, + std::unique_ptr&& handle) override { + Status s = handle->GetFile()->Sync(true); + if (s.ok()) { + s = handle->GetFile()->Close(); + } + return s; + } + + Status DeleteFile(std::unique_ptr&& handle) override { + return env_->DeleteFile(handle->GetName()); + } + + private: + class FileHandle : public BlobFileHandle { + public: + FileHandle(uint64_t number, const std::string& name, + std::unique_ptr file) + : number_(number), name_(name), file_(std::move(file)) {} + + uint64_t GetNumber() const override { return number_; } + + const std::string& GetName() const override { return name_; } + + WritableFileWriter* GetFile() const override { return file_.get(); } + + private: + friend class FileManager; + + uint64_t number_; + std::string name_; + std::unique_ptr file_; + }; + + Env* env_{Env::Default()}; + EnvOptions env_options_; + TitanDBOptions db_options_; +}; + +class TableBuilderTest : public testing::Test { + public: + TableBuilderTest() + : cf_moptions_(cf_options_), + cf_ioptions_(options_), + tmpdir_(test::TmpDir(env_)), + base_name_(tmpdir_ + "/base"), + 
blob_name_(BlobFileName(tmpdir_, kTestFileNumber)) { + db_options_.dirname = tmpdir_; + cf_options_.min_blob_size = kMinBlobSize; + blob_manager_.reset(new FileManager(db_options_)); + table_factory_.reset(new TitanTableFactory(cf_options_, blob_manager_)); + } + + ~TableBuilderTest() { + env_->DeleteFile(base_name_); + env_->DeleteFile(blob_name_); + env_->DeleteDir(tmpdir_); + } + + void BlobFileExists(bool exists) { + Status s = env_->FileExists(blob_name_); + if (exists) { + ASSERT_TRUE(s.ok()); + } else { + ASSERT_TRUE(s.IsNotFound()); + } + } + + void NewFileWriter(const std::string& fname, + std::unique_ptr* result) { + std::unique_ptr file; + ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_)); + result->reset(new WritableFileWriter(std::move(file), env_options_)); + } + + void NewFileReader(const std::string& fname, + std::unique_ptr* result) { + std::unique_ptr file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, env_options_)); + result->reset(new RandomAccessFileReader(std::move(file), fname, env_)); + } + + void NewBaseFileWriter(std::unique_ptr* result) { + NewFileWriter(base_name_, result); + } + + void NewBaseFileReader(std::unique_ptr* result) { + NewFileReader(base_name_, result); + } + + void NewBlobFileReader(std::unique_ptr* result) { + std::unique_ptr file; + NewFileReader(blob_name_, &file); + uint64_t file_size = 0; + ASSERT_OK(env_->GetFileSize(blob_name_, &file_size)); + ASSERT_OK( + BlobFileReader::Open(cf_options_, std::move(file), file_size, result)); + } + + void NewTableReader(std::unique_ptr* result) { + std::unique_ptr file; + NewBaseFileReader(&file); + uint64_t file_size = 0; + ASSERT_OK(env_->GetFileSize(file->file_name(), &file_size)); + TableReaderOptions options(cf_ioptions_, env_options_, + cf_ioptions_.internal_comparator); + ASSERT_OK(table_factory_->NewTableReader(options, std::move(file), + file_size, result)); + } + + void NewTableBuilder(WritableFileWriter* file, + std::unique_ptr* result) { + 
TableBuilderOptions options(cf_ioptions_, cf_ioptions_.internal_comparator, + &collectors_, kNoCompression, + CompressionOptions(), nullptr, false, + kDefaultColumnFamilyName, 0); + result->reset(table_factory_->NewTableBuilder(options, 0, file)); + } + + Env* env_{Env::Default()}; + EnvOptions env_options_; + Options options_; + TitanDBOptions db_options_; + TitanCFOptions cf_options_; + MutableCFOptions cf_moptions_; + ImmutableCFOptions cf_ioptions_; + std::vector> collectors_; + + std::string tmpdir_; + std::string base_name_; + std::string blob_name_; + std::unique_ptr table_factory_; + std::shared_ptr blob_manager_; +}; + +TEST_F(TableBuilderTest, Basic) { + std::unique_ptr base_file; + NewBaseFileWriter(&base_file); + std::unique_ptr table_builder; + NewTableBuilder(base_file.get(), &table_builder); + + // Build a base table and a blob file. + const int n = 100; + for (char i = 0; i < n; i++) { + std::string key(1, i); + InternalKey ikey(key, 1, kTypeValue); + std::string value; + if (i % 2 == 0) { + value = std::string(1, i); + } else { + value = std::string(kMinBlobSize, i); + } + table_builder->Add(ikey.Encode(), value); + } + ASSERT_OK(table_builder->Finish()); + ASSERT_OK(base_file->Sync(true)); + ASSERT_OK(base_file->Close()); + + std::unique_ptr base_reader; + NewTableReader(&base_reader); + std::unique_ptr blob_reader; + NewBlobFileReader(&blob_reader); + + ReadOptions ro; + std::unique_ptr iter; + iter.reset(base_reader->NewIterator(ro, nullptr)); + iter->SeekToFirst(); + for (char i = 0; i < n; i++) { + ASSERT_TRUE(iter->Valid()); + std::string key(1, i); + ParsedInternalKey ikey; + ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); + ASSERT_EQ(ikey.user_key, key); + if (i % 2 == 0) { + ASSERT_EQ(ikey.type, kTypeValue); + ASSERT_EQ(iter->value(), std::string(1, i)); + } else { + ASSERT_EQ(ikey.type, kTypeBlobIndex); + BlobIndex index; + ASSERT_OK(DecodeInto(iter->value(), &index)); + ASSERT_EQ(index.file_number, kTestFileNumber); + BlobRecord 
record; + PinnableSlice buffer; + ASSERT_OK(blob_reader->Get(ro, index.blob_handle, &record, &buffer)); + ASSERT_EQ(record.key, key); + ASSERT_EQ(record.value, std::string(kMinBlobSize, i)); + } + iter->Next(); + } +} + +TEST_F(TableBuilderTest, NoBlob) { + std::unique_ptr base_file; + NewBaseFileWriter(&base_file); + std::unique_ptr table_builder; + NewTableBuilder(base_file.get(), &table_builder); + + const int n = 100; + for (char i = 0; i < n; i++) { + std::string key(1, i); + InternalKey ikey(key, 1, kTypeValue); + std::string value(1, i); + table_builder->Add(ikey.Encode(), value); + } + ASSERT_OK(table_builder->Finish()); + ASSERT_OK(base_file->Sync(true)); + ASSERT_OK(base_file->Close()); + BlobFileExists(false); + + std::unique_ptr base_reader; + NewTableReader(&base_reader); + + ReadOptions ro; + std::unique_ptr iter; + iter.reset(base_reader->NewIterator(ro, nullptr)); + iter->SeekToFirst(); + for (char i = 0; i < n; i++) { + ASSERT_TRUE(iter->Valid()); + std::string key(1, i); + ParsedInternalKey ikey; + ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); + ASSERT_EQ(ikey.user_key, key); + ASSERT_EQ(ikey.type, kTypeValue); + ASSERT_EQ(iter->value(), std::string(1, i)); + iter->Next(); + } +} + +TEST_F(TableBuilderTest, Abandon) { + std::unique_ptr base_file; + NewBaseFileWriter(&base_file); + std::unique_ptr table_builder; + NewTableBuilder(base_file.get(), &table_builder); + + const int n = 100; + for (char i = 0; i < n; i++) { + std::string key(1, i); + InternalKey ikey(key, 1, kTypeValue); + std::string value; + if (i % 2 == 0) { + value = std::string(1, i); + } else { + value = std::string(kMinBlobSize, i); + } + table_builder->Add(ikey.Encode(), value); + } + BlobFileExists(true); + table_builder->Abandon(); + BlobFileExists(false); +} + +} // namespace titandb +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/utilities/titandb/table_factory.cc 
b/utilities/titandb/table_factory.cc new file mode 100644 index 00000000000..5297041088b --- /dev/null +++ b/utilities/titandb/table_factory.cc @@ -0,0 +1,32 @@ +#include "utilities/titandb/table_factory.h" + +#include "utilities/titandb/table_builder.h" + +namespace rocksdb { +namespace titandb { + +Status TitanTableFactory::NewTableReader( + const TableReaderOptions& options, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* result, + bool prefetch_index_and_filter_in_cache) const { + return base_factory_->NewTableReader(options, std::move(file), file_size, + result, + prefetch_index_and_filter_in_cache); +} + +TableBuilder* TitanTableFactory::NewTableBuilder( + const TableBuilderOptions& options, uint32_t column_family_id, + WritableFileWriter* file) const { + std::unique_ptr base_builder( + base_factory_->NewTableBuilder(options, column_family_id, file)); + return new TitanTableBuilder(column_family_id, options_, + std::move(base_builder), blob_manager_); +} + +std::string TitanTableFactory::GetPrintableTableOptions() const { + return base_factory_->GetPrintableTableOptions() + options_.ToString(); +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/table_factory.h b/utilities/titandb/table_factory.h new file mode 100644 index 00000000000..b9b7226360c --- /dev/null +++ b/utilities/titandb/table_factory.h @@ -0,0 +1,57 @@ +#pragma once + +#include "rocksdb/table.h" +#include "utilities/titandb/blob_file_manager.h" +#include "utilities/titandb/options.h" + +namespace rocksdb { +namespace titandb { + +class TitanTableFactory : public TableFactory { + public: + TitanTableFactory(const TitanCFOptions& options, + std::shared_ptr blob_manager) + : options_(options), + base_factory_(options.table_factory), + blob_manager_(blob_manager) {} + + const char* Name() const override { return "TitanTable"; } + + Status NewTableReader( + const TableReaderOptions& options, + std::unique_ptr&& file, uint64_t file_size, + 
std::unique_ptr* result, + bool prefetch_index_and_filter_in_cache = true) const override; + + TableBuilder* NewTableBuilder(const TableBuilderOptions& options, + uint32_t column_family_id, + WritableFileWriter* file) const override; + + std::string GetPrintableTableOptions() const override; + + Status SanitizeOptions(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options) const override { + // Override this when we need to validate our options. + return base_factory_->SanitizeOptions(db_options, cf_options); + } + + Status GetOptionString(std::string* opt_string, + const std::string& delimiter) const override { + // Override this when we need to persist our options. + return base_factory_->GetOptionString(opt_string, delimiter); + } + + void* GetOptions() override { return base_factory_->GetOptions(); } + + bool IsDeleteRangeSupported() const override { + return base_factory_->IsDeleteRangeSupported(); + } + + private: + TitanCFOptions options_; + std::shared_ptr base_factory_; + std::shared_ptr blob_manager_; +}; + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/titan_db_test.cc b/utilities/titandb/titan_db_test.cc new file mode 100644 index 00000000000..3ebf10dd1e4 --- /dev/null +++ b/utilities/titandb/titan_db_test.cc @@ -0,0 +1,199 @@ +#include + +#include "util/filename.h" +#include "util/random.h" +#include "util/testharness.h" +#include "utilities/titandb/db.h" + +namespace rocksdb { +namespace titandb { + +void DeleteDir(Env* env, const std::string& dirname) { + std::vector filenames; + env->GetChildren(dirname, &filenames); + for (auto& fname : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(fname, &number, &type)) { + ASSERT_OK(env->DeleteFile(dirname + "/" + fname)); + } + } + env->DeleteDir(dirname); +} + +class TitanDBTest : public testing::Test { + public: + TitanDBTest() : dbname_(test::TmpDir()) { + options_.dirname = dbname_ + "/titandb"; + options_.create_if_missing = true; + 
options_.min_blob_size = 32; + options_.min_gc_batch_size = 1; + DeleteDir(env_, options_.dirname); + DeleteDir(env_, dbname_); + } + + ~TitanDBTest() { + Close(); + } + + void Open() { + if (cf_names_.empty()) { + ASSERT_OK(TitanDB::Open(options_, dbname_, &db_)); + } else { + TitanDBOptions db_options(options_); + TitanCFOptions cf_options(options_); + cf_names_.clear(); + ASSERT_OK(DB::ListColumnFamilies(db_options, dbname_, &cf_names_)); + std::vector descs; + for (auto& name : cf_names_) { + descs.emplace_back(name, cf_options); + } + cf_handles_.clear(); + ASSERT_OK(TitanDB::Open(db_options, dbname_, descs, &cf_handles_, &db_)); + } + } + + void Close() { + if (!db_) return; + for (auto& handle : cf_handles_) { + db_->DestroyColumnFamilyHandle(handle); + } + ASSERT_OK(db_->Close()); + delete db_; + db_ = nullptr; + } + + void Reopen() { + Close(); + Open(); + } + + void AddCF(const std::string& name) { + TitanCFDescriptor desc(name, options_); + ColumnFamilyHandle* handle = nullptr; + ASSERT_OK(db_->CreateColumnFamily(desc, &handle)); + cf_names_.emplace_back(name); + cf_handles_.emplace_back(handle); + } + + void DropCF(const std::string& name) { + for (size_t i = 0; i < cf_names_.size(); i++) { + if (cf_names_[i] != name) continue; + auto handle = cf_handles_[i]; + ASSERT_OK(db_->DropColumnFamily(handle)); + db_->DestroyColumnFamilyHandle(handle); + cf_names_.erase(cf_names_.begin() + i); + cf_handles_.erase(cf_handles_.begin() + i); + break; + } + } + + void Put(uint64_t k, std::map* data = nullptr) { + WriteOptions wopts; + std::string key = GenKey(k); + std::string value = GenValue(k); + ASSERT_OK(db_->Put(wopts, key, value)); + for (auto& handle : cf_handles_) { + ASSERT_OK(db_->Put(wopts, handle, key, value)); + } + if (data != nullptr) { + data->emplace(key, value); + } + } + + void Flush() { + FlushOptions fopts; + ASSERT_OK(db_->Flush(fopts)); + for (auto& handle : cf_handles_) { + ASSERT_OK(db_->Flush(fopts, handle)); + } + } + + void 
VerifyDB(const std::map& data) { + ReadOptions ropts; + + for (auto& kv : data) { + std::string value; + ASSERT_OK(db_->Get(ropts, kv.first, &value)); + ASSERT_EQ(value, kv.second); + for (auto& handle : cf_handles_) { + ASSERT_OK(db_->Get(ropts, handle, kv.first, &value)); + ASSERT_EQ(value, kv.second); + } + std::vector keys(cf_handles_.size(), kv.first); + std::vector values; + auto res = db_->MultiGet(ropts, cf_handles_, keys, &values); + for (auto& s : res) ASSERT_OK(s); + for (auto& v : values) ASSERT_EQ(v, kv.second); + } + + std::vector iterators; + db_->NewIterators(ropts, cf_handles_, &iterators); + iterators.emplace_back(db_->NewIterator(ropts)); + for (auto& handle : cf_handles_) { + iterators.emplace_back(db_->NewIterator(ropts, handle)); + } + for (auto& iter : iterators) { + iter->SeekToFirst(); + for (auto& kv : data) { + ASSERT_EQ(iter->Valid(), true); + ASSERT_EQ(iter->key(), kv.first); + ASSERT_EQ(iter->value(), kv.second); + iter->Next(); + } + delete iter; + } + } + + std::string GenKey(uint64_t i) { + char buf[64]; + snprintf(buf, sizeof(buf), "k-%08" PRIu64, i); + return buf; + } + + std::string GenValue(uint64_t k) { + if (k % 2 == 0) { + return std::string(options_.min_blob_size - 1, 'v'); + } else { + return std::string(options_.min_blob_size + 1, 'v'); + } + } + + Env* env_ {Env::Default()}; + std::string dbname_; + TitanOptions options_; + TitanDB* db_{nullptr}; + std::vector cf_names_; + std::vector cf_handles_; +}; + +TEST_F(TitanDBTest, Basic) { + const uint64_t kNumKeys = 100; + std::map data; + for (auto i = 0; i < 6; i++) { + if (i == 0) { + Open(); + } else { + Reopen(); + VerifyDB(data); + AddCF(std::to_string(i)); + if (i % 3 == 0) { + DropCF(std::to_string(i-1)); + DropCF(std::to_string(i-2)); + } + } + for (uint64_t k = 1; k <= kNumKeys; k++) { + Put(k, &data); + } + Flush(); + VerifyDB(data); + } +} + +} // namespace titandb +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); 
+ return RUN_ALL_TESTS(); +} diff --git a/utilities/titandb/util.cc b/utilities/titandb/util.cc new file mode 100644 index 00000000000..6d51e9a858d --- /dev/null +++ b/utilities/titandb/util.cc @@ -0,0 +1,161 @@ +#include "utilities/titandb/util.h" + +namespace rocksdb { +namespace titandb { + +// See util/compression.h. +const uint32_t kCompressionFormat = 2; + +bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) { + // Check to see if compressed less than 12.5% + return compressed_size < raw_size - (raw_size / 8u); +} + +Slice Compress(CompressionType* type, const Slice& input, std::string* output) { + if (*type == kNoCompression) { + return input; + } + + // TODO: use a configurable options. + CompressionOptions opts; + + // Returns compressed block contents if: + // (1) the compression method is supported in this platform and + // (2) the compression rate is "good enough". + switch (*type) { + case kSnappyCompression: + if (Snappy_Compress(opts, input.data(), input.size(), output) && + GoodCompressionRatio(output->size(), input.size())) { + return *output; + } + break; + case kZlibCompression: + if (Zlib_Compress(opts, kCompressionFormat, input.data(), input.size(), + output) && + GoodCompressionRatio(output->size(), input.size())) { + return *output; + } + break; + case kBZip2Compression: + if (BZip2_Compress(opts, kCompressionFormat, input.data(), input.size(), + output) && + GoodCompressionRatio(output->size(), input.size())) { + return *output; + } + break; + case kLZ4Compression: + if (LZ4_Compress(opts, kCompressionFormat, input.data(), input.size(), + output) && + GoodCompressionRatio(output->size(), input.size())) { + return *output; + } + break; + case kLZ4HCCompression: + if (LZ4HC_Compress(opts, kCompressionFormat, input.data(), input.size(), + output) && + GoodCompressionRatio(output->size(), input.size())) { + return *output; + } + break; + case kXpressCompression: + if (XPRESS_Compress(input.data(), input.size(), output) && + 
GoodCompressionRatio(output->size(), input.size())) { + return *output; + } + break; + case kZSTD: + case kZSTDNotFinalCompression: + if (ZSTD_Compress(opts, input.data(), input.size(), output) && + GoodCompressionRatio(output->size(), input.size())) { + return *output; + } + break; + default: {} // Do not recognize this compression type + } + + // Compression method is not supported, or not good compression + // ratio, so just fall back to uncompressed form. + *type = kNoCompression; + return input; +} + +Status Uncompress(CompressionType type, const Slice& input, Slice* output, + std::unique_ptr* buffer) { + int size = 0; + assert(type != kNoCompression); + + switch (type) { + case kSnappyCompression: { + size_t usize = 0; + if (!Snappy_GetUncompressedLength(input.data(), input.size(), &usize)) { + return Status::Corruption("Corrupted compressed blob", "Snappy"); + } + buffer->reset(new char[usize]); + if (!Snappy_Uncompress(input.data(), input.size(), buffer->get())) { + return Status::Corruption("Corrupted compressed blob", "Snappy"); + } + *output = Slice(buffer->get(), usize); + break; + } + case kZlibCompression: + buffer->reset(Zlib_Uncompress(input.data(), input.size(), &size, + kCompressionFormat)); + if (!buffer->get()) { + return Status::Corruption("Corrupted compressed blob", "Zlib"); + } + *output = Slice(buffer->get(), size); + break; + case kBZip2Compression: + buffer->reset(BZip2_Uncompress( + input.data(), input.size(), &size, kCompressionFormat)); + if (!buffer->get()) { + return Status::Corruption("Corrupted compressed blob", "Bzip2"); + } + *output = Slice(buffer->get(), size); + break; + case kLZ4Compression: + buffer->reset(LZ4_Uncompress(input.data(), input.size(), &size, + kCompressionFormat)); + if (!buffer->get()) { + return Status::Corruption("Corrupted compressed blob", "LZ4"); + } + *output = Slice(buffer->get(), size); + break; + case kLZ4HCCompression: + buffer->reset(LZ4_Uncompress(input.data(), input.size(), &size, + 
kCompressionFormat)); + if (!buffer->get()) { + return Status::Corruption("Corrupted compressed blob", "LZ4HC"); + } + *output = Slice(buffer->get(), size); + break; + case kXpressCompression: + buffer->reset(XPRESS_Uncompress(input.data(), input.size(), &size)); + if (!buffer->get()) { + return Status::Corruption("Corrupted compressed blob", "Xpress"); + } + *output = Slice(buffer->get(), size); + break; + case kZSTD: + case kZSTDNotFinalCompression: + buffer->reset(ZSTD_Uncompress(input.data(), input.size(), &size)); + if (!buffer->get()) { + return Status::Corruption("Corrupted compressed blob", "ZSTD"); + } + *output = Slice(buffer->get(), size); + break; + default: + return Status::Corruption("bad compression type"); + } + + return Status::OK(); +} + +void UnrefCacheHandle(void* arg1, void* arg2) { + Cache* cache = reinterpret_cast(arg1); + Cache::Handle* h = reinterpret_cast(arg2); + cache->Release(h); +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/util.h b/utilities/titandb/util.h new file mode 100644 index 00000000000..99373ecdf07 --- /dev/null +++ b/utilities/titandb/util.h @@ -0,0 +1,44 @@ +#pragma once + +#include "rocksdb/cache.h" +#include "util/compression.h" +#include "util/testharness.h" +#include "utilities/titandb/blob_format.h" + +namespace rocksdb { +namespace titandb { + +template +void CheckCodec(const T& input) { + std::string buffer; + input.EncodeTo(&buffer); + T output; + ASSERT_OK(DecodeInto(buffer, &output)); + ASSERT_EQ(output, input); +} + +// Compresses the input data according to the compression type. +// Returns a slice with the output data and sets "*type" to the output +// compression type. +// +// If compression is actually performed, fills "*output" with the +// compressed data. However, if the compression ratio is not good, it +// returns the input slice directly and sets "*type" to +// kNoCompression. 
+Slice Compress(CompressionType* type, const Slice& input, std::string* output); + +// Uncompresses the input data according to the uncompression type. +// If successful, fills "*buffer" with the uncompressed data and +// points "*output" to it. +Status Uncompress(CompressionType type, const Slice& input, Slice* output, + std::unique_ptr* buffer); + +void UnrefCacheHandle(void* cache, void* handle); + +template +void DeleteCacheValue(const Slice&, void* value) { + delete reinterpret_cast(value); +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/util_test.cc b/utilities/titandb/util_test.cc new file mode 100644 index 00000000000..b25c113dd6a --- /dev/null +++ b/utilities/titandb/util_test.cc @@ -0,0 +1,29 @@ +#include "util/testharness.h" +#include "utilities/titandb/util.h" + +namespace rocksdb { +namespace titandb { + +class UtilTest : public testing::Test {}; + +TEST(UtilTest, Compression) { + Slice input("aaaaaaaaaaaaaaaaaaaaaaaaaa"); + for (auto compression : { + kSnappyCompression, kZlibCompression, kLZ4Compression, kZSTD}) { + std::string buffer; + auto compressed = Compress(&compression, input, &buffer); + ASSERT_TRUE(compressed.size() <= input.size()); + Slice output; + std::unique_ptr uncompressed; + ASSERT_OK(Uncompress(compression, compressed, &output, &uncompressed)); + ASSERT_EQ(output, input); + } +} + +} // namespace titandb +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/utilities/titandb/version.cc b/utilities/titandb/version.cc new file mode 100644 index 00000000000..839990bcb6d --- /dev/null +++ b/utilities/titandb/version.cc @@ -0,0 +1,120 @@ +#include "utilities/titandb/version.h" +#include "utilities/titandb/version_set.h" + +namespace rocksdb { +namespace titandb { + +Status BlobStorage::Get(const ReadOptions& options, const BlobIndex& index, + BlobRecord* record, PinnableSlice* buffer) { + auto sfile = 
FindFile(index.file_number).lock(); + if (!sfile) + return Status::Corruption("Missing blob file: " + + std::to_string(index.file_number)); + return file_cache_->Get(options, sfile->file_number, sfile->file_size, + index.blob_handle, record, buffer); +} + +Status BlobStorage::NewPrefetcher(uint64_t file_number, + std::unique_ptr* result) { + auto sfile = FindFile(file_number).lock(); + if (!sfile) + return Status::Corruption("Missing blob file: " + + std::to_string(file_number)); + return file_cache_->NewPrefetcher(sfile->file_number, sfile->file_size, + result); +} + +std::weak_ptr BlobStorage::FindFile(uint64_t file_number) { + auto it = files_.find(file_number); + if (it != files_.end()) { + return it->second; + } + return std::weak_ptr(); +} + +void BlobStorage::ComputeGCScore() { + gc_score_.clear(); + for (auto& file : files_) { + gc_score_.push_back({}); + auto& gcs = gc_score_.back(); + gcs.file_number = file.first; + if (file.second->marked_for_gc) { + gcs.score = 1; + file.second->marked_for_gc = false; + } else if (file.second->file_size < titan_cf_options_.merge_small_file_threashold) { + gcs.score = 1; + } else { + gcs.score = static_cast<double>(file.second->discardable_size) / file.second->file_size; // cast first so the ratio is not truncated by integer division + } + } + + std::sort(gc_score_.begin(), gc_score_.end(), + [](const GCScore& first, const GCScore& second) { + return first.score > second.score; + }); +} + +Version::~Version() { + assert(refs_ == 0); + + // Remove linked list + prev_->next_ = next_; + next_->prev_ = prev_; + + // Drop references to files + // Close DB will also destruct this class and add live file to here. + // But don't worry, ~Version will call after all our code executed. 
+ std::vector obsolete_blob_files; + for (auto& b : this->column_families_) { + if (b.second.use_count() > 1) continue; + for (auto& f : b.second->files_) { + if (f.second.use_count() > 1) continue; + obsolete_blob_files.emplace_back(f.second->file_number); + } + } + if (vset_ != nullptr) vset_->AddObsoleteBlobFiles(obsolete_blob_files); +} + +void Version::Ref() { refs_++; } + +void Version::Unref() { + refs_--; + if (refs_ == 0) { + delete this; + } +} + +std::weak_ptr Version::GetBlobStorage(uint32_t cf_id) { + auto it = column_families_.find(cf_id); + if (it != column_families_.end()) { + return it->second; + } + return std::weak_ptr(); +} + +VersionList::VersionList() { Append(new Version(nullptr)); } + +VersionList::~VersionList() { + current_->Unref(); + assert(list_.prev_ == &list_); + assert(list_.next_ == &list_); +} + +void VersionList::Append(Version* v) { + assert(v->refs_ == 0); + assert(v != current_); + + if (current_) { + current_->Unref(); + } + current_ = v; + current_->Ref(); + + v->prev_ = list_.prev_; + v->next_ = &list_; + v->prev_->next_ = v; + v->next_->prev_ = v; +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/version.h b/utilities/titandb/version.h new file mode 100644 index 00000000000..c9f2550ee84 --- /dev/null +++ b/utilities/titandb/version.h @@ -0,0 +1,127 @@ +#pragma once + +#include "rocksdb/options.h" +#include "utilities/titandb/blob_file_cache.h" +#include "utilities/titandb/blob_format.h" +#include "utilities/titandb/blob_gc.h" + +namespace rocksdb { +namespace titandb { + +class VersionSet; + +// Provides methods to access the blob storage for a specific +// version. The version must be valid when this storage is used. 
+class BlobStorage { + public: + BlobStorage(const BlobStorage& bs) { + this->files_ = bs.files_; + this->file_cache_ = bs.file_cache_; + this->titan_cf_options_ = bs.titan_cf_options_; + } + + BlobStorage(const TitanCFOptions& _options, + std::shared_ptr _file_cache) + : titan_cf_options_(_options), + file_cache_(_file_cache) {} + + // Gets the blob record pointed by the blob index. The provided + // buffer is used to store the record data, so the buffer must be + // valid when the record is used. + Status Get(const ReadOptions& options, const BlobIndex& index, + BlobRecord* record, PinnableSlice* buffer); + + // Creates a prefetcher for the specified file number. + // REQUIRES: mutex is held + Status NewPrefetcher(uint64_t file_number, + std::unique_ptr* result); + + // Finds the blob file meta for the specified file number. It is a + // corruption if the file doesn't exist in the specific version. + std::weak_ptr FindFile(uint64_t file_number); + + void MarkAllFilesForGC() { + for (auto& file : files_) + file.second->marked_for_gc = true; + } + + const std::vector gc_score() { return gc_score_; } + + void ComputeGCScore(); + + const TitanCFOptions& titan_cf_options() { + return titan_cf_options_; + } + + private: + friend class Version; + friend class VersionSet; + friend class VersionBuilder; + friend class VersionTest; + friend class BlobGCPickerTest; + friend class BlobGCJobTest; + friend class BlobFileSizeCollectorTest; + + TitanCFOptions titan_cf_options_; + // Only BlobStorage OWNS BlobFileMeta + std::map> files_; + std::shared_ptr file_cache_; + + std::vector gc_score_; +}; + +class Version { + public: + Version(VersionSet* vset) : vset_(vset), prev_(this), next_(this) {} + + // Reference count management. + // REQUIRES: mutex is held + void Ref(); + void Unref(); + + // Returns the blob storage for the specific column family. + // The version must be valid when the blob storage is used. 
+ // Except Version, Nobody else can extend the life time of + // BlobStorage. Otherwise, It's a wrong design. Because + // BlobStorage only belongs to Version, Others only have + // the right to USE it. + std::weak_ptr GetBlobStorage(uint32_t cf_id); + + void MarkAllFilesForGC() { + for(auto& cf : column_families_) + cf.second->MarkAllFilesForGC(); + } + + private: + friend class VersionList; + friend class VersionBuilder; + friend class VersionSet; + friend class VersionTest; + friend class BlobFileSizeCollectorTest; + + ~Version(); + + VersionSet* vset_; + int refs_{0}; + Version* prev_; + Version* next_; + std::map> column_families_; +}; + +class VersionList { + public: + VersionList(); + + ~VersionList(); + + Version* current() { return current_; } + + void Append(Version* v); + + private: + Version list_{nullptr}; + Version* current_{nullptr}; +}; + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/version_builder.cc b/utilities/titandb/version_builder.cc new file mode 100644 index 00000000000..60b16153ab0 --- /dev/null +++ b/utilities/titandb/version_builder.cc @@ -0,0 +1,87 @@ +#include "utilities/titandb/version_builder.h" + +#include + +namespace rocksdb { +namespace titandb { + +void VersionBuilder::Builder::AddFile( + const std::shared_ptr& file) { + auto number = file->file_number; + auto sb = base_.lock(); + if (sb->files_.find(number) != sb->files_.end() || + added_files_.find(number) != added_files_.end()) { + fprintf(stderr, "blob file %" PRIu64 " has been added before\n", number); + abort(); + } + if (deleted_files_.find(number) != deleted_files_.end()) { + fprintf(stderr, "blob file %" PRIu64 " has been deleted before\n", number); + abort(); + } + added_files_.emplace(number, file); +} + +void VersionBuilder::Builder::DeleteFile(uint64_t number) { + auto sb = base_.lock(); + if (sb->files_.find(number) == sb->files_.end() && + added_files_.find(number) == added_files_.end()) { + fprintf(stderr, "blob file %" PRIu64 " 
doesn't exist before\n", number); + abort(); + } + if (deleted_files_.find(number) != deleted_files_.end()) { + fprintf(stderr, "blob file %" PRIu64 " has been deleted before\n", number); + abort(); + } + deleted_files_.emplace(number); +} + +std::shared_ptr VersionBuilder::Builder::Build() { + // If nothing is changed, we can reuse the base; + if (added_files_.empty() && deleted_files_.empty()) { + return base_.lock(); + } + + auto vs = std::make_shared(*base_.lock()); + vs->files_.insert(added_files_.begin(), added_files_.end()); + for (auto& file : deleted_files_) { + vs->files_.erase(file); + } + vs->ComputeGCScore(); + return vs; +} + +VersionBuilder::VersionBuilder(Version* base) : base_(base) { + base_->Ref(); + for (auto& it : base_->column_families_) { + column_families_.emplace(it.first, Builder(it.second)); + } +} + +VersionBuilder::~VersionBuilder() { base_->Unref(); } + +void VersionBuilder::Apply(VersionEdit* edit) { + auto cf_id = edit->column_family_id_; + auto it = column_families_.find(cf_id); + if (it == column_families_.end()) { + // Ignore unknown column families. + return; + } + auto& builder = it->second; + + for (auto& file : edit->deleted_files_) { + builder.DeleteFile(file); + } + for (auto& file : edit->added_files_) { + builder.AddFile(file); + } +} + +void VersionBuilder::SaveTo(Version* v) { + v->column_families_.clear(); + for (auto& it : column_families_) { + v->column_families_.emplace(it.first, it.second.Build()); + } +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/version_builder.h b/utilities/titandb/version_builder.h new file mode 100644 index 00000000000..74b1e08c544 --- /dev/null +++ b/utilities/titandb/version_builder.h @@ -0,0 +1,47 @@ +#pragma once + +#include "utilities/titandb/version.h" +#include "utilities/titandb/version_edit.h" + +namespace rocksdb { +namespace titandb { + +class VersionBuilder { + public: + // Constructs a builder to build on the base version. 
The + // intermediate result is kept in the builder and the base version + // is left unchanged. + VersionBuilder(Version* base); + + ~VersionBuilder(); + + // Applies "*edit" on the current state. + void Apply(VersionEdit* edit); + + // Saves the current state to the version "*v". + void SaveTo(Version* v); + + private: + friend class VersionTest; + + class Builder { + public: + Builder(std::shared_ptr base) : base_(base) {} + + void AddFile(const std::shared_ptr& file); + void DeleteFile(uint64_t number); + + std::shared_ptr Build(); + + private: + std::weak_ptr base_; + std::map> added_files_; + std::set deleted_files_; + }; + + Version* base_; + std::map column_families_; +}; + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/version_edit.cc b/utilities/titandb/version_edit.cc new file mode 100644 index 00000000000..533b555ea42 --- /dev/null +++ b/utilities/titandb/version_edit.cc @@ -0,0 +1,105 @@ +#include "utilities/titandb/version_edit.h" + +#include "util/coding.h" + +namespace rocksdb { +namespace titandb { + +enum Tag { + kNextFileNumber = 1, + kColumnFamilyID = 10, + kAddedBlobFile = 11, + kDeletedBlobFile = 12, +}; + +void VersionEdit::EncodeTo(std::string* dst) const { + if (has_next_file_number_) { + PutVarint32Varint64(dst, kNextFileNumber, next_file_number_); + } + + PutVarint32Varint32(dst, kColumnFamilyID, column_family_id_); + + for (auto& file : added_files_) { + PutVarint32(dst, kAddedBlobFile); + file->EncodeTo(dst); + } + for (auto& file : deleted_files_) { + PutVarint32Varint64(dst, kDeletedBlobFile, file); + } +} + +Status VersionEdit::DecodeFrom(Slice* src) { + uint32_t tag; + uint64_t file_number; + std::shared_ptr blob_file; + + const char* error = nullptr; + while (!error && !src->empty()) { + if (!GetVarint32(src, &tag)) { + error = "invalid tag"; + break; + } + switch (tag) { + case kNextFileNumber: + if (GetVarint64(src, &next_file_number_)) { + has_next_file_number_ = true; + } else { + error = "next 
file number"; + } + break; + case kColumnFamilyID: + if (GetVarint32(src, &column_family_id_)) { + } else { + error = "column family id"; + } + break; + case kAddedBlobFile: + blob_file = std::make_shared(); + if (blob_file->DecodeFrom(src).ok()) { + AddBlobFile(blob_file); + } else { + error = "added blob file"; + } + break; + case kDeletedBlobFile: + if (GetVarint64(src, &file_number)) { + DeleteBlobFile(file_number); + } else { + error = "deleted blob file"; + } + break; + default: + error = "unknown tag"; + break; + } + } + + if (error) { + return Status::Corruption("VersionEdit", error); + } + return Status::OK(); +} + +bool operator==(const VersionEdit& lhs, const VersionEdit& rhs) { + if (lhs.added_files_.size() != rhs.added_files_.size()) { + return false; + } + std::map> blob_files; + for (std::size_t idx = 0; idx < lhs.added_files_.size(); idx++) { + blob_files.insert( + {lhs.added_files_[idx]->file_number, lhs.added_files_[idx]}); + } + for (std::size_t idx = 0; idx < rhs.added_files_.size(); idx++) { + auto iter = blob_files.find(rhs.added_files_[idx]->file_number); + if (iter == blob_files.end() || !(*iter->second == *rhs.added_files_[idx])) + return false; + } + + return (lhs.has_next_file_number_ == rhs.has_next_file_number_ && + lhs.next_file_number_ == rhs.next_file_number_ && + lhs.column_family_id_ == rhs.column_family_id_ && + lhs.deleted_files_ == rhs.deleted_files_); +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/version_edit.h b/utilities/titandb/version_edit.h new file mode 100644 index 00000000000..d1fd628f9d4 --- /dev/null +++ b/utilities/titandb/version_edit.h @@ -0,0 +1,46 @@ +#pragma once + +#include + +#include "rocksdb/slice.h" +#include "utilities/titandb/blob_format.h" + +namespace rocksdb { +namespace titandb { + +class VersionEdit { + public: + void SetNextFileNumber(uint64_t v) { + has_next_file_number_ = true; + next_file_number_ = v; + } + + void SetColumnFamilyID(uint32_t v) { 
column_family_id_ = v; } + + void AddBlobFile(std::shared_ptr file) { + added_files_.push_back(file); + } + + void DeleteBlobFile(uint64_t file_number) { + deleted_files_.push_back(file_number); + } + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(Slice* src); + + friend bool operator==(const VersionEdit& lhs, const VersionEdit& rhs); + + private: + friend class VersionSet; + friend class VersionBuilder; + + bool has_next_file_number_{false}; + uint64_t next_file_number_{0}; + uint32_t column_family_id_{0}; + + std::vector> added_files_; + std::vector deleted_files_; +}; + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/version_set.cc b/utilities/titandb/version_set.cc new file mode 100644 index 00000000000..72305b219e2 --- /dev/null +++ b/utilities/titandb/version_set.cc @@ -0,0 +1,246 @@ +#include "utilities/titandb/version_set.h" + +#include + +#include "util/filename.h" +#include "utilities/titandb/version_builder.h" + +namespace rocksdb { +namespace titandb { + +const size_t kMaxFileCacheSize = 1024 * 1024; + +VersionSet::VersionSet(const TitanDBOptions& options) + : dirname_(options.dirname), + env_(options.env), + env_options_(options), + db_options_(options) { + auto file_cache_size = db_options_.max_open_files; + if (file_cache_size < 0) { + file_cache_size = kMaxFileCacheSize; + } + file_cache_ = NewLRUCache(file_cache_size); +} + +Status VersionSet::Open( + const std::map& column_families) { + // Sets up initial column families. 
+ AddColumnFamilies( column_families) ; + + Status s = env_->FileExists(CurrentFileName(dirname_)); + if (s.ok()) { + return Recover(); + } + if (!s.IsNotFound()) { + return s; + } + if (!db_options_.create_if_missing) { + return Status::InvalidArgument(dirname_, + "doesn't exist (create_if_missing is false)"); + } + return OpenManifest(NewFileNumber()); +} + +Status VersionSet::Recover() { + struct LogReporter : public log::Reader::Reporter { + Status* status; + void Corruption(size_t, const Status& s) override { + if (status->ok()) *status = s; + } + }; + + // Reads "CURRENT" file, which contains the name of the current manifest file. + std::string manifest; + Status s = ReadFileToString(env_, CurrentFileName(dirname_), &manifest); + if (!s.ok()) return s; + if (manifest.empty() || manifest.back() != '\n') { + return Status::Corruption("CURRENT file does not end with newline"); + } + manifest.resize(manifest.size() - 1); + + // Opens the current manifest file. + auto file_name = dirname_ + "/" + manifest; + std::unique_ptr file; + { + std::unique_ptr f; + s = env_->NewSequentialFile(file_name, &f, + env_->OptimizeForManifestRead(env_options_)); + if (!s.ok()) return s; + file.reset(new SequentialFileReader(std::move(f))); + } + + bool has_next_file_number = false; + uint64_t next_file_number = 0; + + // Reads edits from the manifest and applies them one by one. 
+ VersionBuilder builder(current()); + { + LogReporter reporter; + reporter.status = &s; + log::Reader reader(nullptr, std::move(file), &reporter, true /*checksum*/, + 0 /*initial_offset*/, 0); + Slice record; + std::string scratch; + while (reader.ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = DecodeInto(record, &edit); + if (!s.ok()) return s; + builder.Apply(&edit); + if (edit.has_next_file_number_) { + next_file_number = edit.next_file_number_; + has_next_file_number = true; + } + } + } + if (!s.ok()) return s; // the reporter may have stored a corruption status that ended the loop + if (!has_next_file_number) { + return Status::Corruption("no next file number in manifest file"); + } + next_file_number_.store(next_file_number); + + auto v = new Version(this); + { + builder.SaveTo(v); + versions_.Append(v); + } + + auto new_manifest_file_number = NewFileNumber(); + s = OpenManifest(new_manifest_file_number); + if (!s.ok()) return s; + + v = versions_.current(); + + // Make sure perform gc on all files at the beginning + v->MarkAllFilesForGC(); + + // Purge inactive files at start + std::set alive_files; + alive_files.insert(new_manifest_file_number); + for (const auto& bs : v->column_families_) { + for (const auto& f : bs.second->files_) { + alive_files.insert(f.second->file_number); + } + } + std::vector files; + env_->GetChildren(dirname_, &files); + for (const auto& f : files) { + uint64_t file_number; + FileType file_type; + if (!ParseFileName(f, &file_number, &file_type)) continue; + if (alive_files.find(file_number) != alive_files.end()) continue; + if (file_type != FileType::kBlobFile && file_type != FileType::kDescriptorFile) + continue; + + env_->DeleteFile(dirname_ + "/" + f); + } + + return Status::OK(); +} + +Status VersionSet::OpenManifest(uint64_t file_number) { + Status s; + + auto file_name = DescriptorFileName(dirname_, file_number); + std::unique_ptr file; + { + std::unique_ptr f; + s = env_->NewWritableFile(file_name, &f, env_options_); + if (!s.ok()) return s; + file.reset(new WritableFileWriter(std::move(f), 
env_options_)); + } + + manifest_.reset(new log::Writer(std::move(file), 0, false)); + + // Saves current snapshot + s = WriteSnapshot(manifest_.get()); + if (s.ok()) { + ImmutableDBOptions ioptions(db_options_); + s = SyncManifest(env_, &ioptions, manifest_->file()); + } + if (s.ok()) { + // Makes "CURRENT" file that points to the new manifest file. + s = SetCurrentFile(env_, dirname_, file_number, nullptr); + } + + if (!s.ok()) { + manifest_.reset(); + obsolete_files_.manifests.emplace_back(file_name); + } + return s; +} + +Status VersionSet::WriteSnapshot(log::Writer* log) { + Status s; + // Saves global information + { + VersionEdit edit; + edit.SetNextFileNumber(next_file_number_.load()); + std::string record; + edit.EncodeTo(&record); + s = log->AddRecord(record); + if (!s.ok()) return s; + } + // Saves column families information + for (auto& it : current()->column_families_) { + VersionEdit edit; + edit.SetColumnFamilyID(it.first); + for (auto& file : it.second->files_) { + edit.AddBlobFile(file.second); + } + std::string record; + edit.EncodeTo(&record); + s = log->AddRecord(record); + if (!s.ok()) return s; + } + return s; +} + +Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mutex) { + mutex->AssertHeld(); + + // TODO(@huachao): write manifest file unlocked + std::string record; + edit->SetNextFileNumber(next_file_number_.load()); + edit->EncodeTo(&record); + Status s = manifest_->AddRecord(record); + if (s.ok()) { + ImmutableDBOptions ioptions(db_options_); + s = SyncManifest(env_, &ioptions, manifest_->file()); + } + if (!s.ok()) return s; + + auto v = new Version(this); + { + VersionBuilder builder(current()); + builder.Apply(edit); + builder.SaveTo(v); + versions_.Append(v); + } + return s; +} + +void VersionSet::AddColumnFamilies( + const std::map& column_families) { + auto v = new Version(this); + v->column_families_ = current()->column_families_; + for (auto& cf : column_families) { + auto file_cache = std::make_shared( + 
db_options_, cf.second, file_cache_); + auto blob_storage = std::make_shared(cf.second, file_cache); + v->column_families_.emplace(cf.first, blob_storage); + } + versions_.Append(v); +} + +void VersionSet::DropColumnFamilies( + const std::vector& column_families) { + auto v = new Version(this); + v->column_families_ = current()->column_families_; + for (auto& cf : column_families) { + v->column_families_.erase(cf); + } + versions_.Append(v); +} + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/version_set.h b/utilities/titandb/version_set.h new file mode 100644 index 00000000000..7c4c85f8cfa --- /dev/null +++ b/utilities/titandb/version_set.h @@ -0,0 +1,104 @@ +#pragma once + +#include +#include + +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "port/port_posix.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" +#include "util/mutexlock.h" +#include "utilities/titandb/blob_file_cache.h" +#include "utilities/titandb/options.h" +#include "utilities/titandb/version.h" +#include "utilities/titandb/version_builder.h" +#include "utilities/titandb/version_edit.h" + +namespace rocksdb { +namespace titandb { + +struct ObsoleteFiles { + ObsoleteFiles() = default; + + ObsoleteFiles(const ObsoleteFiles&) = delete; + ObsoleteFiles& operator=(const ObsoleteFiles&) = delete; + ObsoleteFiles(ObsoleteFiles&&) = delete; + ObsoleteFiles& operator=(ObsoleteFiles&&) = delete; + + void Swap(ObsoleteFiles* obsolete_file) { + blob_files.swap(obsolete_file->blob_files); + manifests.swap(obsolete_file->manifests); + } + + std::vector blob_files; + std::vector manifests; +}; + +class VersionSet { + public: + explicit VersionSet(const TitanDBOptions& options); + + // Sets up the storage specified in "options.dirname". + // If the manifest doesn't exist, it will create one. + // If the manifest exists, it will recover from the latest one. 
+ // It is a corruption if the persistent storage contains data + // outside of the provided column families. + Status Open(const std::map& column_families); + + // Applies *edit on the current version to form a new version that is + // both saved to the manifest and installed as the new current version. + // REQUIRES: *mutex is held + Status LogAndApply(VersionEdit* edit, port::Mutex* mutex); + + // Adds some column families with the specified options. + // REQUIRES: mutex is held + void AddColumnFamilies( + const std::map& column_families); + // Drops some column families. The obsolete files will be deleted in + // background when they will not be accessed anymore. + // REQUIRES: mutex is held + void DropColumnFamilies(const std::vector& column_families); + + // Returns the current version. + Version* current() { return versions_.current(); } + + // Allocates a new file number. + uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); } + + // REQUIRES: mutex is held + void GetObsoleteFiles(ObsoleteFiles* obsolete_files) { + obsolete_files->Swap(&obsolete_files_); + } + + void AddObsoleteBlobFiles(const std::vector& blob_files) { + obsolete_files_.blob_files.insert(obsolete_files_.blob_files.end(), + blob_files.begin(), blob_files.end()); + } + + private: + friend class BlobFileSizeCollectorTest; + friend class VersionTest; + + Status Recover(); + + Status OpenManifest(uint64_t number); + + Status WriteSnapshot(log::Writer* log); + + std::string dirname_; + Env* env_; + EnvOptions env_options_; + TitanDBOptions db_options_; + std::shared_ptr file_cache_; + // This field will be call when Version is destructed, so we have to make + // sure this field is destructed after Version does. 
+ ObsoleteFiles obsolete_files_; + + VersionList versions_; + std::unique_ptr manifest_; + std::atomic next_file_number_{1}; +}; + +} // namespace titandb +} // namespace rocksdb diff --git a/utilities/titandb/version_test.cc b/utilities/titandb/version_test.cc new file mode 100644 index 00000000000..9244793cc66 --- /dev/null +++ b/utilities/titandb/version_test.cc @@ -0,0 +1,202 @@ +#include "utilities/titandb/version.h" +#include "util/filename.h" +#include "util/testharness.h" +#include "utilities/titandb/util.h" +#include "utilities/titandb/version_builder.h" +#include "utilities/titandb/version_edit.h" +#include "utilities/titandb/version_set.h" + +namespace rocksdb { +namespace titandb { + +void DeleteDir(Env* env, const std::string& dirname) { + std::vector filenames; + env->GetChildren(dirname, &filenames); + for (auto& fname : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(fname, &number, &type)) { + ASSERT_OK(env->DeleteFile(dirname + "/" + fname)); + } + } + env->DeleteDir(dirname); +} + +class VersionTest : public testing::Test { + public: + TitanDBOptions db_options_; + TitanCFOptions cf_options_; + std::unique_ptr versions_; + std::shared_ptr file_cache_; + std::map> column_families_; + std::unique_ptr vset_; + port::Mutex mutex_; + std::string dbname_; + Env* env_; + + VersionTest() : dbname_(test::TmpDir()), env_(Env::Default()) { + db_options_.dirname = dbname_ + "/titandb"; + db_options_.create_if_missing = true; + env_->CreateDirIfMissing(dbname_); + env_->CreateDirIfMissing(db_options_.dirname); + auto cache = NewLRUCache(db_options_.max_open_files); + file_cache_.reset(new BlobFileCache(db_options_, cf_options_, cache)); + Reset(); + } + + void Reset() { + DeleteDir(env_, dbname_); + vset_.reset(new VersionSet(db_options_)); + ASSERT_OK(vset_->Open({})); + versions_.reset(new VersionList); + column_families_.clear(); + // Sets up some column families. 
+ auto v = new Version(nullptr); + for (uint32_t id = 0; id < 10; id++) { + std::shared_ptr storage; + storage.reset(new BlobStorage(cf_options_, file_cache_)); + column_families_.emplace(id, storage); + storage.reset(new BlobStorage(cf_options_, file_cache_)); + v->column_families_.emplace(id, storage); + } + versions_->Append(v); + } + + void AddBlobFiles(uint32_t cf_id, uint64_t start, uint64_t end) { + auto storage = column_families_[cf_id]; + for (auto i = start; i < end; i++) { + auto file = std::make_shared(); + file->file_number = i; + file->file_size = i; + storage->files_.emplace(i, file); + } + } + + void DeleteBlobFiles(uint32_t cf_id, uint64_t start, uint64_t end) { + auto& storage = column_families_[cf_id]; + for (auto i = start; i < end; i++) { + storage->files_.erase(i); + } + } + + void BuildAndCheck(std::vector edits) { + VersionBuilder builder(versions_->current()); + for (auto& edit : edits) { + builder.Apply(&edit); + } + Version* v = new Version(vset_.get()); + builder.SaveTo(v); + versions_->Append(v); + for (auto& it : v->column_families_) { + auto& storage = column_families_[it.first]; + ASSERT_EQ(storage->files_.size(), it.second->files_.size()); + for (auto& f : storage->files_) { + auto iter = it.second->files_.find(f.first); + ASSERT_TRUE(iter != it.second->files_.end()); + ASSERT_EQ(*f.second, *(iter->second)); + } + } + } +}; + +TEST_F(VersionTest, VersionEdit) { + VersionEdit input; + CheckCodec(input); + input.SetNextFileNumber(1); + input.SetColumnFamilyID(2); + CheckCodec(input); + auto file1 = std::make_shared(); + file1->file_number = 3; + file1->file_size = 4; + auto file2 = std::make_shared(); + file2->file_number = 5; + file2->file_size = 6; + input.AddBlobFile(file1); + input.AddBlobFile(file2); + input.DeleteBlobFile(7); + input.DeleteBlobFile(8); + CheckCodec(input); +} + +VersionEdit AddBlobFilesEdit(uint32_t cf_id, uint64_t start, uint64_t end) { + VersionEdit edit; + edit.SetColumnFamilyID(cf_id); + for (auto i = start; 
i < end; i++) { + auto file = std::make_shared(); + file->file_number = i; + file->file_size = i; + edit.AddBlobFile(file); + } + return edit; +} + +VersionEdit DeleteBlobFilesEdit(uint32_t cf_id, uint64_t start, uint64_t end) { + VersionEdit edit; + edit.SetColumnFamilyID(cf_id); + for (auto i = start; i < end; i++) { + edit.DeleteBlobFile(i); + } + return edit; +} + +TEST_F(VersionTest, VersionBuilder) { + // {(0, 4)}, {} + auto add1_0_4 = AddBlobFilesEdit(1, 0, 4); + AddBlobFiles(1, 0, 4); + BuildAndCheck({add1_0_4}); + + // {(0, 8)}, {(4, 8)} + auto add1_4_8 = AddBlobFilesEdit(1, 4, 8); + auto add2_4_8 = AddBlobFilesEdit(2, 4, 8); + AddBlobFiles(1, 4, 8); + AddBlobFiles(2, 4, 8); + BuildAndCheck({add1_4_8, add2_4_8}); + + // {(0, 4), (6, 8)}, {(4, 8)} + auto del1_4_6 = DeleteBlobFilesEdit(1, 4, 6); + DeleteBlobFiles(1, 4, 6); + BuildAndCheck({del1_4_6}); + + // {(0, 4)}, {(4, 6)} + auto del1_6_8 = DeleteBlobFilesEdit(1, 6, 8); + auto del2_6_8 = DeleteBlobFilesEdit(2, 6, 8); + DeleteBlobFiles(1, 6, 8); + DeleteBlobFiles(2, 6, 8); + BuildAndCheck({del1_6_8, del2_6_8}); + BuildAndCheck({add1_4_8, del1_4_6, del1_6_8}); + + // {(0, 4)}, {(4, 6)} + Reset(); + AddBlobFiles(1, 0, 4); + AddBlobFiles(2, 4, 6); + BuildAndCheck({add1_0_4, add1_4_8, del1_4_6, del1_6_8, add2_4_8, del2_6_8}); +} + +TEST_F(VersionTest, ObsoleteFiles) { + std::map m; + m.insert({1, TitanCFOptions()}); + vset_->AddColumnFamilies(m); + { + auto add1_0_4 = AddBlobFilesEdit(1, 0, 4); + MutexLock l(&mutex_); + vset_->LogAndApply(&add1_0_4, &mutex_); + } + ObsoleteFiles of; + vset_->GetObsoleteFiles(&of); + ASSERT_EQ(of.blob_files.size(), 0); + { + auto del1_3_4 = DeleteBlobFilesEdit(1, 3, 4); + MutexLock l(&mutex_); + vset_->LogAndApply(&del1_3_4, &mutex_); + } + vset_->GetObsoleteFiles(&of); + ASSERT_EQ(of.blob_files.size(), 1); +} + +} // namespace titandb +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +}