diff --git a/cpp/src/parquet/api/io.h b/cpp/src/parquet/api/io.h
index 683dae27d6fdc..96d3bc062af66 100644
--- a/cpp/src/parquet/api/io.h
+++ b/cpp/src/parquet/api/io.h
@@ -19,9 +19,6 @@
 #define PARQUET_API_IO_H
 
 #include "parquet/exception.h"
-#include "parquet/util/buffer.h"
-#include "parquet/util/input.h"
-#include "parquet/util/mem-allocator.h"
-#include "parquet/util/output.h"
+#include "parquet/util/memory.h"
 
 #endif  // PARQUET_API_IO_H
diff --git a/cpp/src/parquet/arrow/CMakeLists.txt b/cpp/src/parquet/arrow/CMakeLists.txt
index 37e48944665fb..20f6670b5fa59 100644
--- a/cpp/src/parquet/arrow/CMakeLists.txt
+++ b/cpp/src/parquet/arrow/CMakeLists.txt
@@ -19,7 +19,6 @@
 # parquet_arrow : Arrow <-> Parquet adapter
 
 set(PARQUET_ARROW_SRCS
-  io.cc
   reader.cc
   schema.cc
   writer.cc
@@ -76,16 +75,13 @@ if (PARQUET_BUILD_STATIC)
 endif()
 
 ADD_PARQUET_TEST(arrow-schema-test)
-ADD_PARQUET_TEST(arrow-io-test)
 ADD_PARQUET_TEST(arrow-reader-writer-test)
 
 if (PARQUET_BUILD_STATIC)
   ADD_PARQUET_LINK_LIBRARIES(arrow-schema-test parquet_arrow_static)
-  ADD_PARQUET_LINK_LIBRARIES(arrow-io-test parquet_arrow_static)
   ADD_PARQUET_LINK_LIBRARIES(arrow-reader-writer-test parquet_arrow_static)
 else()
   ADD_PARQUET_LINK_LIBRARIES(arrow-schema-test parquet_arrow_shared)
-  ADD_PARQUET_LINK_LIBRARIES(arrow-io-test parquet_arrow_shared)
   ADD_PARQUET_LINK_LIBRARIES(arrow-reader-writer-test parquet_arrow_shared)
 endif()
 
@@ -100,7 +96,6 @@ endif()
 
 # Headers: top level
 install(FILES
-  io.h
   reader.h
   schema.h
   utils.h
diff --git a/cpp/src/parquet/arrow/arrow-io-test.cc b/cpp/src/parquet/arrow/arrow-io-test.cc
deleted file mode 100644
index 6d76887d2626b..0000000000000
--- a/cpp/src/parquet/arrow/arrow-io-test.cc
+++ /dev/null
@@ -1,140 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include
-#include
-#include
-#include
-
-#include "gtest/gtest.h"
-
-#include "arrow/api.h"
-#include "arrow/io/memory.h"
-#include "arrow/test-util.h"
-
-#include "parquet/api/io.h"
-#include "parquet/arrow/io.h"
-
-using arrow::default_memory_pool;
-using arrow::MemoryPool;
-using arrow::Status;
-
-using ArrowBufferReader = arrow::io::BufferReader;
-
-namespace parquet {
-namespace arrow {
-
-// Allocator tests
-
-TEST(TestParquetAllocator, DefaultCtor) {
-  ParquetAllocator allocator;
-
-  const int buffer_size = 10;
-
-  uint8_t* buffer = nullptr;
-  ASSERT_NO_THROW(buffer = allocator.Malloc(buffer_size););
-
-  // valgrind will complain if we write into nullptr
-  memset(buffer, 0, buffer_size);
-
-  allocator.Free(buffer, buffer_size);
-}
-
-// Pass through to the default memory pool
-class TrackingPool : public MemoryPool {
- public:
-  TrackingPool() : pool_(default_memory_pool()), bytes_allocated_(0) {}
-
-  Status Allocate(int64_t size, uint8_t** out) override {
-    RETURN_NOT_OK(pool_->Allocate(size, out));
-    bytes_allocated_ += size;
-    return Status::OK();
-  }
-
-  void Free(uint8_t* buffer, int64_t size) override {
-    pool_->Free(buffer, size);
-    bytes_allocated_ -= size;
-  }
-
-  int64_t bytes_allocated() const override { return bytes_allocated_; }
-
- private:
-  MemoryPool* pool_;
-  int64_t bytes_allocated_;
-};
-
-TEST(TestParquetAllocator, CustomPool) {
-  TrackingPool pool;
-
-  ParquetAllocator allocator(&pool);
-
-  ASSERT_EQ(&pool, allocator.pool());
-
-  const int buffer_size = 10;
-
-  uint8_t* buffer = nullptr;
-  ASSERT_NO_THROW(buffer = allocator.Malloc(buffer_size););
-
-  ASSERT_EQ(buffer_size, pool.bytes_allocated());
-
-  // valgrind will complain if we write into nullptr
-  memset(buffer, 0, buffer_size);
-
-  allocator.Free(buffer, buffer_size);
-
-  ASSERT_EQ(0, pool.bytes_allocated());
-}
-
-// ----------------------------------------------------------------------
-// Read source tests
-
-TEST(TestParquetReadSource, Basics) {
-  std::string data = "this is the data";
-  auto data_buffer = reinterpret_cast<const uint8_t*>(data.c_str());
-
-  ParquetAllocator allocator(default_memory_pool());
-
-  auto file = std::make_shared<ArrowBufferReader>(data_buffer, data.size());
-  auto source = std::make_shared<ParquetReadSource>(&allocator);
-
-  ASSERT_OK(source->Open(file));
-
-  ASSERT_EQ(0, source->Tell());
-  ASSERT_NO_THROW(source->Seek(5));
-  ASSERT_EQ(5, source->Tell());
-  ASSERT_NO_THROW(source->Seek(0));
-
-  // Seek out of bounds
-  ASSERT_THROW(source->Seek(100), ParquetException);
-
-  uint8_t buffer[50];
-
-  ASSERT_NO_THROW(source->Read(4, buffer));
-  ASSERT_EQ(0, std::memcmp(buffer, "this", 4));
-  ASSERT_EQ(4, source->Tell());
-
-  std::shared_ptr pq_buffer;
-
-  ASSERT_NO_THROW(pq_buffer = source->Read(7));
-
-  auto expected_buffer = std::make_shared(data_buffer + 4, 7);
-
-  ASSERT_TRUE(expected_buffer->Equals(*pq_buffer.get()));
-}
-
-}  // namespace arrow
-}  // namespace parquet
diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-benchmark.cc b/cpp/src/parquet/arrow/arrow-reader-writer-benchmark.cc
index 89cb48649b465..cf90ebc2bdab5 100644
--- a/cpp/src/parquet/arrow/arrow-reader-writer-benchmark.cc
+++ b/cpp/src/parquet/arrow/arrow-reader-writer-benchmark.cc
@@ -23,7 +23,7 @@
 #include "parquet/column/writer.h"
 #include "parquet/file/reader-internal.h"
 #include "parquet/file/writer-internal.h"
-#include "parquet/util/input.h"
+#include "parquet/util/memory.h"
 
 #include "arrow/api.h"
 
@@ -132,8 +132,8 @@ static void BM_ReadColumn(::benchmark::State& state) {
   std::shared_ptr<Buffer> buffer = output->GetBuffer();
 
   while (state.KeepRunning()) {
-    auto reader = ParquetFileReader::Open(
-        std::unique_ptr<RandomAccessSource>(new BufferReader(buffer)));
+    auto reader =
+        ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
     FileReader filereader(::arrow::default_memory_pool(), std::move(reader));
     std::shared_ptr<::arrow::Table> table;
     filereader.ReadFlatTable(&table);
diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
index 6d2b0d5f85502..07ddd915a7507 100644
--- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -29,14 +29,15 @@
 #include "arrow/test-util.h"
 
 using arrow::Array;
+using arrow::Buffer;
 using arrow::ChunkedArray;
 using arrow::default_memory_pool;
+using arrow::io::BufferReader;
 using arrow::PoolBuffer;
 using arrow::PrimitiveArray;
 using arrow::Status;
 using arrow::Table;
 
-using ParquetBuffer = parquet::Buffer;
 using ParquetType = parquet::Type;
 using parquet::schema::GroupNode;
 using parquet::schema::NodePtr;
@@ -203,9 +204,8 @@ class TestParquetIO : public ::testing::Test {
   }
 
   std::unique_ptr<ParquetFileReader> ReaderFromSink() {
-    std::shared_ptr<ParquetBuffer> buffer = sink_->GetBuffer();
-    std::unique_ptr<RandomAccessSource> source(new BufferReader(buffer));
-    return ParquetFileReader::Open(std::move(source));
+    std::shared_ptr<Buffer> buffer = sink_->GetBuffer();
+    return ParquetFileReader::Open(std::make_shared<BufferReader>(buffer));
   }
 
   void ReadSingleColumnFile(
@@ -357,9 +357,9 @@ TYPED_TEST(TestParquetIO, SingleColumnTableRequiredChunkedWriteArrowIO) {
   ASSERT_OK_NO_THROW(WriteFlatTable(
       table.get(), default_memory_pool(), arrow_sink_, 512, default_writer_properties()));
 
-  std::shared_ptr<ParquetBuffer> pbuffer =
-      std::make_shared<ParquetBuffer>(buffer->data(), buffer->size());
-  std::unique_ptr<RandomAccessSource> source(new BufferReader(pbuffer));
+  auto pbuffer = std::make_shared<Buffer>(buffer->data(), buffer->size());
+
+  auto source = std::make_shared<BufferReader>(pbuffer);
   std::shared_ptr<::arrow::Table> out;
   this->ReadTableFromFile(ParquetFileReader::Open(std::move(source)), &out);
   ASSERT_EQ(1, out->num_columns());
diff --git a/cpp/src/parquet/arrow/io.cc b/cpp/src/parquet/arrow/io.cc
deleted file mode 100644
index 2b1f99de55a5f..0000000000000
--- a/cpp/src/parquet/arrow/io.cc
+++ /dev/null
@@ -1,127 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
- -#include "parquet/arrow/io.h" - -#include -#include - -#include "parquet/api/io.h" -#include "parquet/arrow/utils.h" - -#include "arrow/status.h" - -using arrow::Status; -using arrow::MemoryPool; - -// To assist with readability -using ArrowROFile = arrow::io::ReadableFileInterface; - -namespace parquet { -namespace arrow { - -// ---------------------------------------------------------------------- -// ParquetAllocator - -ParquetAllocator::ParquetAllocator() : pool_(::arrow::default_memory_pool()) {} - -ParquetAllocator::ParquetAllocator(MemoryPool* pool) : pool_(pool) {} - -ParquetAllocator::~ParquetAllocator() {} - -uint8_t* ParquetAllocator::Malloc(int64_t size) { - uint8_t* result; - PARQUET_THROW_NOT_OK(pool_->Allocate(size, &result)); - return result; -} - -void ParquetAllocator::Free(uint8_t* buffer, int64_t size) { - // Does not report Status - pool_->Free(buffer, size); -} - -// ---------------------------------------------------------------------- -// ParquetReadSource - -ParquetReadSource::ParquetReadSource(ParquetAllocator* allocator) - : file_(nullptr), allocator_(allocator) {} - -Status ParquetReadSource::Open(const std::shared_ptr& file) { - int64_t file_size; - RETURN_NOT_OK(file->GetSize(&file_size)); - - file_ = file; - size_ = file_size; - return Status::OK(); -} - -void ParquetReadSource::Close() { - // TODO(wesm): Make this a no-op for now. This leaves Python wrappers for - // these classes in a borked state. Probably better to explicitly close. - - // PARQUET_THROW_NOT_OK(file_->Close()); -} - -int64_t ParquetReadSource::Tell() const { - int64_t position; - PARQUET_THROW_NOT_OK(file_->Tell(&position)); - return position; -} - -void ParquetReadSource::Seek(int64_t position) { - PARQUET_THROW_NOT_OK(file_->Seek(position)); -} - -int64_t ParquetReadSource::Read(int64_t nbytes, uint8_t* out) { - int64_t bytes_read; - PARQUET_THROW_NOT_OK(file_->Read(nbytes, &bytes_read, out)); - return bytes_read; -} - -std::shared_ptr ParquetReadSource::Read(int64_t nbytes) { - // TODO(wesm): This code is duplicated from parquet/util/input.cc; suggests - // that there should be more code sharing amongst file-like sources - auto result = std::make_shared(0, allocator_); - result->Resize(nbytes); - - int64_t bytes_read = Read(nbytes, result->mutable_data()); - if (bytes_read < nbytes) { result->Resize(bytes_read); } - return result; -} - -ParquetWriteSink::ParquetWriteSink( - const std::shared_ptr<::arrow::io::OutputStream>& stream) - : stream_(stream) {} - -ParquetWriteSink::~ParquetWriteSink() {} - -void ParquetWriteSink::Close() { - PARQUET_THROW_NOT_OK(stream_->Close()); -} - -int64_t ParquetWriteSink::Tell() { - int64_t position; - PARQUET_THROW_NOT_OK(stream_->Tell(&position)); - return position; -} - -void ParquetWriteSink::Write(const uint8_t* data, int64_t length) { - PARQUET_THROW_NOT_OK(stream_->Write(data, length)); -} - -} // namespace arrow -} // namespace parquet diff --git a/cpp/src/parquet/arrow/io.h b/cpp/src/parquet/arrow/io.h deleted file mode 100644 index a1de9366c1203..0000000000000 --- a/cpp/src/parquet/arrow/io.h +++ /dev/null @@ -1,101 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Bridges Arrow's IO interfaces and Parquet-cpp's IO interfaces - -#ifndef PARQUET_ARROW_IO_H -#define PARQUET_ARROW_IO_H - -#include -#include - -#include "parquet/api/io.h" - -#include "arrow/io/interfaces.h" -#include "arrow/memory_pool.h" - -namespace parquet { - -namespace arrow { - -// An implementation of the Parquet MemoryAllocator API that plugs into an -// existing Arrow memory pool. This way we can direct all allocations to a -// single place rather than tracking allocations in different locations (for -// example: without utilizing parquet-cpp's default allocator) -class PARQUET_EXPORT ParquetAllocator : public MemoryAllocator { - public: - // Uses the default memory pool - ParquetAllocator(); - - explicit ParquetAllocator(::arrow::MemoryPool* pool); - virtual ~ParquetAllocator(); - - uint8_t* Malloc(int64_t size) override; - void Free(uint8_t* buffer, int64_t size) override; - - void set_pool(::arrow::MemoryPool* pool) { pool_ = pool; } - - ::arrow::MemoryPool* pool() const { return pool_; } - - private: - ::arrow::MemoryPool* pool_; -}; - -class PARQUET_EXPORT ParquetReadSource : public RandomAccessSource { - public: - explicit ParquetReadSource(ParquetAllocator* allocator); - - // We need to ask for the file size on opening the file, and this can fail - ::arrow::Status Open(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file); - - void Close() override; - int64_t Tell() const override; - void Seek(int64_t pos) override; - int64_t Read(int64_t nbytes, uint8_t* out) override; - std::shared_ptr Read(int64_t nbytes) override; - - private: - // An Arrow readable file of some kind - std::shared_ptr<::arrow::io::ReadableFileInterface> file_; - - // The allocator is required for creating managed buffers - ParquetAllocator* allocator_; -}; - -class PARQUET_EXPORT ParquetWriteSink : public OutputStream { - public: - explicit ParquetWriteSink(const std::shared_ptr<::arrow::io::OutputStream>& stream); - - virtual ~ParquetWriteSink(); - - // Close the output stream - void Close() override; - - // Return the current position in the output stream relative to the start - int64_t Tell() override; - - // Copy bytes into the output stream - void Write(const uint8_t* data, int64_t length) override; - - private: - std::shared_ptr<::arrow::io::OutputStream> stream_; -}; - -} // namespace arrow -} // namespace parquet - -#endif // PARQUET_ARROW_IO_H diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 135867ccb0207..d1eec05fc3614 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -23,9 +23,7 @@ #include #include -#include "parquet/arrow/io.h" #include "parquet/arrow/schema.h" -#include "parquet/arrow/utils.h" #include "arrow/api.h" #include "arrow/type_traits.h" @@ -40,7 +38,6 @@ using arrow::Status; using arrow::Table; // Help reduce verbosity -using ParquetRAS = parquet::RandomAccessSource; using ParquetReader = parquet::ParquetFileReader; namespace parquet { @@ -193,16 +190,11 @@ FileReader::~FileReader() {} // Static ctor Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& 
file, - ParquetAllocator* allocator, std::unique_ptr* reader) { - std::unique_ptr source(new ParquetReadSource(allocator)); - RETURN_NOT_OK(source->Open(file)); - + MemoryPool* allocator, std::unique_ptr* reader) { // TODO(wesm): reader properties std::unique_ptr pq_reader; - PARQUET_CATCH_NOT_OK(pq_reader = ParquetReader::Open(std::move(source))); - - // Use the same memory pool as the ParquetAllocator - reader->reset(new FileReader(allocator->pool(), std::move(pq_reader))); + PARQUET_CATCH_NOT_OK(pq_reader = ParquetReader::Open(file)); + reader->reset(new FileReader(allocator, std::move(pq_reader))); return Status::OK(); } @@ -352,18 +344,18 @@ Status FlatColumnReader::Impl::TypedReadBatch( RETURN_NOT_OK(InitDataBuffer(batch_size)); valid_bits_idx_ = 0; if (descr_->max_definition_level() > 0) { - valid_bits_buffer_ = std::make_shared(pool_); int valid_bits_size = ::arrow::BitUtil::CeilByte(batch_size) / 8; - valid_bits_buffer_->Resize(valid_bits_size); + valid_bits_buffer_ = std::make_shared(pool_); + RETURN_NOT_OK(valid_bits_buffer_->Resize(valid_bits_size)); valid_bits_ptr_ = valid_bits_buffer_->mutable_data(); memset(valid_bits_ptr_, 0, valid_bits_size); null_count_ = 0; } while ((values_to_read > 0) && column_reader_) { - values_buffer_.Resize(values_to_read * sizeof(ParquetCType)); + RETURN_NOT_OK(values_buffer_.Resize(values_to_read * sizeof(ParquetCType))); if (descr_->max_definition_level() > 0) { - def_levels_buffer_.Resize(values_to_read * sizeof(int16_t)); + RETURN_NOT_OK(def_levels_buffer_.Resize(values_to_read * sizeof(int16_t))); } auto reader = dynamic_cast*>(column_reader_.get()); int64_t values_read; @@ -427,16 +419,16 @@ Status FlatColumnReader::Impl::TypedReadBatch<::arrow::BooleanType, BooleanType> if (descr_->max_definition_level() > 0) { valid_bits_buffer_ = std::make_shared(pool_); int valid_bits_size = ::arrow::BitUtil::CeilByte(batch_size) / 8; - valid_bits_buffer_->Resize(valid_bits_size); + RETURN_NOT_OK(valid_bits_buffer_->Resize(valid_bits_size)); valid_bits_ptr_ = valid_bits_buffer_->mutable_data(); memset(valid_bits_ptr_, 0, valid_bits_size); null_count_ = 0; } while ((values_to_read > 0) && column_reader_) { - values_buffer_.Resize(values_to_read * sizeof(bool)); + RETURN_NOT_OK(values_buffer_.Resize(values_to_read * sizeof(bool))); if (descr_->max_definition_level() > 0) { - def_levels_buffer_.Resize(values_to_read * sizeof(int16_t)); + RETURN_NOT_OK(def_levels_buffer_.Resize(values_to_read * sizeof(int16_t))); } auto reader = dynamic_cast*>(column_reader_.get()); int64_t values_read; @@ -499,9 +491,9 @@ Status FlatColumnReader::Impl::ReadByteArrayBatch( int values_to_read = batch_size; BuilderType builder(pool_, field_->type); while ((values_to_read > 0) && column_reader_) { - values_buffer_.Resize(values_to_read * sizeof(ByteArray)); + RETURN_NOT_OK(values_buffer_.Resize(values_to_read * sizeof(ByteArray))); if (descr_->max_definition_level() > 0) { - def_levels_buffer_.Resize(values_to_read * sizeof(int16_t)); + RETURN_NOT_OK(def_levels_buffer_.Resize(values_to_read * sizeof(int16_t))); } auto reader = dynamic_cast*>(column_reader_.get()); int64_t values_read; diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index c6fc47d87b26f..2602824f08a36 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -22,7 +22,6 @@ #include "parquet/api/reader.h" #include "parquet/api/schema.h" -#include "parquet/arrow/io.h" #include "arrow/io/interfaces.h" @@ -142,7 +141,7 @@ class PARQUET_EXPORT FlatColumnReader { 
 // readable file
 PARQUET_EXPORT
 ::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file,
-    ParquetAllocator* allocator, std::unique_ptr<FileReader>* reader);
+    ::arrow::MemoryPool* allocator, std::unique_ptr<FileReader>* reader);
 
 }  // namespace arrow
 }  // namespace parquet
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index 3e5e7d960891a..8b2a2abc87596 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -21,7 +21,6 @@
 #include
 
 #include "parquet/api/schema.h"
-#include "parquet/arrow/utils.h"
 
 #include "arrow/api.h"
 
diff --git a/cpp/src/parquet/arrow/utils.h b/cpp/src/parquet/arrow/utils.h
deleted file mode 100644
index 9c2abfae879a1..0000000000000
--- a/cpp/src/parquet/arrow/utils.h
+++ /dev/null
@@ -1,54 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef PARQUET_ARROW_UTILS_H
-#define PARQUET_ARROW_UTILS_H
-
-#include
-
-#include "arrow/status.h"
-#include "parquet/exception.h"
-
-namespace parquet {
-namespace arrow {
-
-#define PARQUET_CATCH_NOT_OK(s)                 \
-  try {                                         \
-    (s);                                        \
-  } catch (const ::parquet::ParquetException& e) { \
-    return ::arrow::Status::IOError(e.what());  \
-  }
-
-#define PARQUET_IGNORE_NOT_OK(s) \
-  try {                          \
-    (s);                         \
-  } catch (const ::parquet::ParquetException& e) {}
-
-#define PARQUET_THROW_NOT_OK(s)               \
-  do {                                        \
-    ::arrow::Status _s = (s);                 \
-    if (!_s.ok()) {                           \
-      std::stringstream ss;                   \
-      ss << "Arrow error: " << _s.ToString(); \
-      ParquetException::Throw(ss.str());      \
-    }                                         \
-  } while (0);
-
-}  // namespace arrow
-}  // namespace parquet
-
-#endif  // PARQUET_ARROW_UTILS_H
diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc
index b7663a3a716c8..f9087ffdb9529 100644
--- a/cpp/src/parquet/arrow/writer.cc
+++ b/cpp/src/parquet/arrow/writer.cc
@@ -22,9 +22,7 @@
 #include "parquet/util/logging.h"
 
-#include "parquet/arrow/io.h"
 #include "parquet/arrow/schema.h"
-#include "parquet/arrow/utils.h"
 
 #include "arrow/api.h"
 
@@ -371,8 +369,8 @@ Status WriteFlatTable(const Table* table, MemoryPool* pool,
 Status WriteFlatTable(const Table* table, MemoryPool* pool,
     const std::shared_ptr<::arrow::io::OutputStream>& sink, int64_t chunk_size,
    const std::shared_ptr<WriterProperties>& properties) {
-  auto parquet_sink = std::make_shared<ParquetWriteSink>(sink);
-  return WriteFlatTable(table, pool, parquet_sink, chunk_size, properties);
+  auto wrapper = std::make_shared(sink);
+  return WriteFlatTable(table, pool, wrapper, chunk_size, properties);
 }
 
 }  // namespace arrow
diff --git a/cpp/src/parquet/column/column-io-benchmark.cc b/cpp/src/parquet/column/column-io-benchmark.cc
index 3ff9c323c0608..fb491b9d033d4 100644
--- a/cpp/src/parquet/column/column-io-benchmark.cc
+++ b/cpp/src/parquet/column/column-io-benchmark.cc
@@ -21,7 +21,7 @@
 #include "parquet/column/writer.h"
#include "parquet/file/reader-internal.h" #include "parquet/file/writer-internal.h" -#include "parquet/util/input.h" +#include "parquet/util/memory.h" namespace parquet { @@ -67,9 +67,9 @@ static void BM_WriteInt64Column(::benchmark::State& state) { properties, schema.get(), reinterpret_cast(&thrift_metadata)); while (state.KeepRunning()) { - InMemoryOutputStream dst; + InMemoryOutputStream stream; std::unique_ptr writer = BuildWriter( - state.range_x(), &dst, metadata.get(), schema.get(), properties.get()); + state.range_x(), &stream, metadata.get(), schema.get(), properties.get()); writer->WriteBatch( values.size(), definition_levels.data(), repetition_levels.data(), values.data()); writer->Close(); @@ -102,14 +102,14 @@ static void BM_ReadInt64Column(::benchmark::State& state) { auto metadata = ColumnChunkMetaDataBuilder::Make( properties, schema.get(), reinterpret_cast(&thrift_metadata)); - InMemoryOutputStream dst; - std::unique_ptr writer = - BuildWriter(state.range_x(), &dst, metadata.get(), schema.get(), properties.get()); + InMemoryOutputStream stream; + std::unique_ptr writer = BuildWriter( + state.range_x(), &stream, metadata.get(), schema.get(), properties.get()); writer->WriteBatch( values.size(), definition_levels.data(), repetition_levels.data(), values.data()); writer->Close(); - std::shared_ptr src = dst.GetBuffer(); + std::shared_ptr src = stream.GetBuffer(); std::vector values_out(state.range_y()); std::vector definition_levels_out(state.range_y()); std::vector repetition_levels_out(state.range_y()); diff --git a/cpp/src/parquet/column/column-reader-test.cc b/cpp/src/parquet/column/column-reader-test.cc index df45e008b8b6c..5b27b734630c8 100644 --- a/cpp/src/parquet/column/column-reader-test.cc +++ b/cpp/src/parquet/column/column-reader-test.cc @@ -214,7 +214,7 @@ TEST_F(TestPrimitiveReader, TestDictionaryEncodedPages) { max_rep_level_ = 0; NodePtr type = schema::Int32("a", Repetition::REQUIRED); const ColumnDescriptor descr(type, max_def_level_, max_rep_level_); - shared_ptr dummy = std::make_shared(); + shared_ptr dummy = std::make_shared(); shared_ptr dict_page = std::make_shared(dummy, 0, Encoding::PLAIN); diff --git a/cpp/src/parquet/column/column-writer-test.cc b/cpp/src/parquet/column/column-writer-test.cc index 5d4daeb1ea2f6..54300054512d3 100644 --- a/cpp/src/parquet/column/column-writer-test.cc +++ b/cpp/src/parquet/column/column-writer-test.cc @@ -26,8 +26,7 @@ #include "parquet/file/writer-internal.h" #include "parquet/types.h" #include "parquet/util/comparison.h" -#include "parquet/util/input.h" -#include "parquet/util/output.h" +#include "parquet/util/memory.h" namespace parquet { diff --git a/cpp/src/parquet/column/level-benchmark.cc b/cpp/src/parquet/column/level-benchmark.cc index c511c36830822..8ae2fe18bc4d8 100644 --- a/cpp/src/parquet/column/level-benchmark.cc +++ b/cpp/src/parquet/column/level-benchmark.cc @@ -18,7 +18,7 @@ #include "benchmark/benchmark.h" #include "parquet/column/levels.h" -#include "parquet/util/buffer.h" +#include "parquet/util/memory.h" namespace parquet { @@ -31,7 +31,8 @@ static void BM_RleEncoding(::benchmark::State& state) { [&state, &n] { return (n++ % state.range_y()) == 0; }); int16_t max_level = 1; int64_t rle_size = LevelEncoder::MaxBufferSize(Encoding::RLE, max_level, levels.size()); - auto buffer_rle = std::make_shared(rle_size); + auto buffer_rle = std::make_shared(); + PARQUET_THROW_NOT_OK(buffer_rle->Resize(rle_size)); while (state.KeepRunning()) { LevelEncoder level_encoder; @@ -53,7 +54,8 @@ static void 
BM_RleDecoding(::benchmark::State& state) { [&state, &n] { return (n++ % state.range_y()) == 0; }); int16_t max_level = 1; int64_t rle_size = LevelEncoder::MaxBufferSize(Encoding::RLE, max_level, levels.size()); - auto buffer_rle = std::make_shared(rle_size + sizeof(uint32_t)); + auto buffer_rle = std::make_shared(); + PARQUET_THROW_NOT_OK(buffer_rle->Resize(rle_size + sizeof(uint32_t))); level_encoder.Init(Encoding::RLE, max_level, levels.size(), buffer_rle->mutable_data() + sizeof(uint32_t), rle_size); level_encoder.Encode(levels.size(), levels.data()); diff --git a/cpp/src/parquet/column/page.h b/cpp/src/parquet/column/page.h index d3954803bd63e..6670e7f84246e 100644 --- a/cpp/src/parquet/column/page.h +++ b/cpp/src/parquet/column/page.h @@ -28,7 +28,7 @@ #include "parquet/column/statistics.h" #include "parquet/types.h" -#include "parquet/util/buffer.h" +#include "parquet/util/memory.h" namespace parquet { diff --git a/cpp/src/parquet/column/properties.h b/cpp/src/parquet/column/properties.h index f5f2fd5c9c33d..cf89226881cc9 100644 --- a/cpp/src/parquet/column/properties.h +++ b/cpp/src/parquet/column/properties.h @@ -25,8 +25,7 @@ #include "parquet/exception.h" #include "parquet/schema/types.h" #include "parquet/types.h" -#include "parquet/util/input.h" -#include "parquet/util/mem-allocator.h" +#include "parquet/util/memory.h" #include "parquet/util/visibility.h" namespace parquet { @@ -46,7 +45,7 @@ class PARQUET_EXPORT ReaderProperties { buffer_size_ = DEFAULT_BUFFER_SIZE; } - MemoryAllocator* allocator() { return allocator_; } + MemoryAllocator* allocator() const { return allocator_; } std::unique_ptr GetStream( RandomAccessSource* source, int64_t start, int64_t num_bytes) { diff --git a/cpp/src/parquet/column/reader.h b/cpp/src/parquet/column/reader.h index d759b969d2c80..bf567d918b084 100644 --- a/cpp/src/parquet/column/reader.h +++ b/cpp/src/parquet/column/reader.h @@ -31,7 +31,7 @@ #include "parquet/exception.h" #include "parquet/schema/descriptor.h" #include "parquet/types.h" -#include "parquet/util/mem-allocator.h" +#include "parquet/util/memory.h" #include "parquet/util/visibility.h" namespace parquet { @@ -221,12 +221,15 @@ inline int64_t TypedColumnReader::Skip(int64_t num_rows_to_skip) { // Jump to the right offset in the Page int64_t batch_size = 1024; // ReadBatch with a smaller memory footprint int64_t values_read = 0; - auto vals = std::make_shared( - batch_size * type_traits::value_byte_size, this->allocator_); - auto def_levels = std::make_shared( - batch_size * sizeof(int16_t), this->allocator_); - auto rep_levels = std::make_shared( - batch_size * sizeof(int16_t), this->allocator_); + + std::shared_ptr vals = AllocateBuffer( + this->allocator_, batch_size * type_traits::value_byte_size); + std::shared_ptr def_levels = + AllocateBuffer(this->allocator_, batch_size * sizeof(int16_t)); + + std::shared_ptr rep_levels = + AllocateBuffer(this->allocator_, batch_size * sizeof(int16_t)); + do { batch_size = std::min(batch_size, rows_to_skip); values_read = diff --git a/cpp/src/parquet/column/scanner.cc b/cpp/src/parquet/column/scanner.cc index 8db3d2bb835db..faf99a0017e1b 100644 --- a/cpp/src/parquet/column/scanner.cc +++ b/cpp/src/parquet/column/scanner.cc @@ -21,6 +21,7 @@ #include #include "parquet/column/reader.h" +#include "parquet/util/memory.h" namespace parquet { diff --git a/cpp/src/parquet/column/scanner.h b/cpp/src/parquet/column/scanner.h index 184c74d71b2c2..13fb01b0781d6 100644 --- a/cpp/src/parquet/column/scanner.h +++ b/cpp/src/parquet/column/scanner.h @@ 
-29,7 +29,7 @@ #include "parquet/exception.h" #include "parquet/schema/descriptor.h" #include "parquet/types.h" -#include "parquet/util/mem-allocator.h" +#include "parquet/util/memory.h" #include "parquet/util/visibility.h" namespace parquet { @@ -44,7 +44,7 @@ class PARQUET_EXPORT Scanner { : batch_size_(batch_size), level_offset_(0), levels_buffered_(0), - value_buffer_(0, allocator), + value_buffer_(std::make_shared(allocator)), value_offset_(0), values_buffered_(0), reader_(reader) { @@ -76,7 +76,7 @@ class PARQUET_EXPORT Scanner { int level_offset_; int levels_buffered_; - OwnedMutableBuffer value_buffer_; + std::shared_ptr value_buffer_; int value_offset_; int64_t values_buffered_; @@ -95,8 +95,8 @@ class PARQUET_EXPORT TypedScanner : public Scanner { : Scanner(reader, batch_size, allocator) { typed_reader_ = static_cast*>(reader.get()); int value_byte_size = type_traits::value_byte_size; - value_buffer_.Resize(batch_size_ * value_byte_size); - values_ = reinterpret_cast(&value_buffer_[0]); + PARQUET_THROW_NOT_OK(value_buffer_->Resize(batch_size_ * value_byte_size)); + values_ = reinterpret_cast(value_buffer_->mutable_data()); } virtual ~TypedScanner() {} diff --git a/cpp/src/parquet/column/statistics-test.cc b/cpp/src/parquet/column/statistics-test.cc index 364d9d490bd02..c8641a1558c6d 100644 --- a/cpp/src/parquet/column/statistics-test.cc +++ b/cpp/src/parquet/column/statistics-test.cc @@ -33,9 +33,7 @@ #include "parquet/file/writer.h" #include "parquet/schema/descriptor.h" #include "parquet/types.h" -#include "parquet/util/input.h" -#include "parquet/util/mem-allocator.h" -#include "parquet/util/output.h" +#include "parquet/util/memory.h" namespace parquet { @@ -150,8 +148,8 @@ class TestRowGroupStatistics : public PrimitiveTypedTest { file_writer->Close(); auto buffer = sink->GetBuffer(); - std::unique_ptr source(new BufferReader(buffer)); - auto file_reader = ParquetFileReader::Open(std::move(source)); + auto source = std::make_shared<::arrow::io::BufferReader>(buffer); + auto file_reader = ParquetFileReader::Open(source); auto rg_reader = file_reader->RowGroup(0); auto column_chunk = rg_reader->metadata()->ColumnChunk(0); std::shared_ptr stats = column_chunk->statistics(); @@ -191,7 +189,8 @@ std::vector TestRowGroupStatistics::GetDeepCopy( std::vector copy; MemoryAllocator* allocator = default_allocator(); for (const FLBA& flba : values) { - uint8_t* ptr = allocator->Malloc(FLBA_LENGTH); + uint8_t* ptr; + PARQUET_THROW_NOT_OK(allocator->Allocate(FLBA_LENGTH, &ptr)); memcpy(ptr, flba.ptr, FLBA_LENGTH); copy.emplace_back(ptr); } @@ -204,7 +203,8 @@ std::vector TestRowGroupStatistics::GetDeepCopy( std::vector copy; MemoryAllocator* allocator = default_allocator(); for (const ByteArray& ba : values) { - uint8_t* ptr = allocator->Malloc(ba.len); + uint8_t* ptr; + PARQUET_THROW_NOT_OK(allocator->Allocate(ba.len, &ptr)); memcpy(ptr, ba.ptr, ba.len); copy.emplace_back(ba.len, ptr); } diff --git a/cpp/src/parquet/column/statistics.cc b/cpp/src/parquet/column/statistics.cc index 0330ac13888f3..9b76fab0fb424 100644 --- a/cpp/src/parquet/column/statistics.cc +++ b/cpp/src/parquet/column/statistics.cc @@ -21,16 +21,17 @@ #include "parquet/column/statistics.h" #include "parquet/encodings/plain-encoding.h" #include "parquet/exception.h" -#include "parquet/util/buffer.h" #include "parquet/util/comparison.h" -#include "parquet/util/output.h" +#include "parquet/util/memory.h" namespace parquet { template TypedRowGroupStatistics::TypedRowGroupStatistics( const ColumnDescriptor* schema, 
MemoryAllocator* allocator) - : allocator_(allocator), min_buffer_(0, allocator_), max_buffer_(0, allocator_) { + : allocator_(allocator), + min_buffer_(AllocateBuffer(allocator_, 0)), + max_buffer_(AllocateBuffer(allocator_, 0)) { SetDescr(schema); Reset(); } @@ -40,14 +41,14 @@ TypedRowGroupStatistics::TypedRowGroupStatistics(const typename DType::c_ const typename DType::c_type& max, int64_t num_values, int64_t null_count, int64_t distinct_count) : allocator_(default_allocator()), - min_buffer_(0, allocator_), - max_buffer_(0, allocator_) { + min_buffer_(AllocateBuffer(allocator_, 0)), + max_buffer_(AllocateBuffer(allocator_, 0)) { IncrementNumValues(num_values); IncrementNullCount(null_count); IncrementDistinctCount(distinct_count); - Copy(min, &min_, min_buffer_); - Copy(max, &max_, max_buffer_); + Copy(min, &min_, min_buffer_.get()); + Copy(max, &max_, max_buffer_.get()); has_min_max_ = true; } @@ -56,7 +57,9 @@ TypedRowGroupStatistics::TypedRowGroupStatistics(const ColumnDescriptor* const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, int64_t null_count, int64_t distinct_count, bool has_min_max, MemoryAllocator* allocator) - : allocator_(allocator), min_buffer_(0, allocator_), max_buffer_(0, allocator_) { + : allocator_(allocator), + min_buffer_(AllocateBuffer(allocator_, 0)), + max_buffer_(AllocateBuffer(allocator_, 0)) { IncrementNumValues(num_values); IncrementNullCount(null_count); IncrementDistinctCount(distinct_count); @@ -94,11 +97,11 @@ void TypedRowGroupStatistics::Update( auto batch_minmax = std::minmax_element(values, values + num_not_null, compare); if (!has_min_max_) { has_min_max_ = true; - Copy(*batch_minmax.first, &min_, min_buffer_); - Copy(*batch_minmax.second, &max_, max_buffer_); + Copy(*batch_minmax.first, &min_, min_buffer_.get()); + Copy(*batch_minmax.second, &max_, max_buffer_.get()); } else { - Copy(std::min(min_, *batch_minmax.first, compare), &min_, min_buffer_); - Copy(std::max(max_, *batch_minmax.second, compare), &max_, max_buffer_); + Copy(std::min(min_, *batch_minmax.first, compare), &min_, min_buffer_.get()); + Copy(std::max(max_, *batch_minmax.second, compare), &max_, max_buffer_.get()); } } @@ -119,15 +122,15 @@ void TypedRowGroupStatistics::Merge(const TypedRowGroupStatistics& if (!other.HasMinMax()) return; if (!has_min_max_) { - Copy(other.min_, &this->min_, min_buffer_); - Copy(other.max_, &this->max_, max_buffer_); + Copy(other.min_, &this->min_, min_buffer_.get()); + Copy(other.max_, &this->max_, max_buffer_.get()); has_min_max_ = true; return; } Compare compare(descr_); - Copy(std::min(this->min_, other.min_, compare), &this->min_, min_buffer_); - Copy(std::max(this->max_, other.max_, compare), &this->max_, max_buffer_); + Copy(std::min(this->min_, other.min_, compare), &this->min_, min_buffer_.get()); + Copy(std::max(this->max_, other.max_, compare), &this->max_, max_buffer_.get()); } template diff --git a/cpp/src/parquet/column/statistics.h b/cpp/src/parquet/column/statistics.h index a21a0fa83a822..cf41dc0d12c4f 100644 --- a/cpp/src/parquet/column/statistics.h +++ b/cpp/src/parquet/column/statistics.h @@ -24,8 +24,7 @@ #include "parquet/schema/descriptor.h" #include "parquet/types.h" -#include "parquet/util/buffer.h" -#include "parquet/util/mem-allocator.h" +#include "parquet/util/memory.h" #include "parquet/util/visibility.h" namespace parquet { @@ -166,34 +165,33 @@ class TypedRowGroupStatistics : public RowGroupStatistics { void PlainEncode(const T& src, std::string* dst); void PlainDecode(const 
std::string& src, T* dst); - void Copy(const T& src, T* dst, OwnedMutableBuffer& buffer); + void Copy(const T& src, T* dst, PoolBuffer* buffer); - OwnedMutableBuffer min_buffer_, max_buffer_; + std::shared_ptr min_buffer_, max_buffer_; }; template -inline void TypedRowGroupStatistics::Copy( - const T& src, T* dst, OwnedMutableBuffer&) { +inline void TypedRowGroupStatistics::Copy(const T& src, T* dst, PoolBuffer*) { *dst = src; } template <> inline void TypedRowGroupStatistics::Copy( - const FLBA& src, FLBA* dst, OwnedMutableBuffer& buffer) { + const FLBA& src, FLBA* dst, PoolBuffer* buffer) { if (dst->ptr == src.ptr) return; uint32_t len = descr_->type_length(); - buffer.Resize(len); - std::memcpy(&buffer[0], src.ptr, len); - *dst = FLBA(buffer.data()); + PARQUET_THROW_NOT_OK(buffer->Resize(len)); + std::memcpy(buffer->mutable_data(), src.ptr, len); + *dst = FLBA(buffer->data()); } template <> inline void TypedRowGroupStatistics::Copy( - const ByteArray& src, ByteArray* dst, OwnedMutableBuffer& buffer) { + const ByteArray& src, ByteArray* dst, PoolBuffer* buffer) { if (dst->ptr == src.ptr) return; - buffer.Resize(src.len); - std::memcpy(&buffer[0], src.ptr, src.len); - *dst = ByteArray(src.len, buffer.data()); + PARQUET_THROW_NOT_OK(buffer->Resize(src.len)); + std::memcpy(buffer->mutable_data(), src.ptr, src.len); + *dst = ByteArray(src.len, buffer->data()); } template <> diff --git a/cpp/src/parquet/column/test-util.h b/cpp/src/parquet/column/test-util.h index 10632d293544f..9efa623f2c119 100644 --- a/cpp/src/parquet/column/test-util.h +++ b/cpp/src/parquet/column/test-util.h @@ -28,13 +28,15 @@ #include #include +#include + #include "parquet/column/levels.h" #include "parquet/column/page.h" // Depended on by SerializedPageReader test utilities for now #include "parquet/encodings/dictionary-encoding.h" #include "parquet/encodings/plain-encoding.h" -#include "parquet/util/input.h" +#include "parquet/util/memory.h" #include "parquet/util/test-common.h" using std::vector; @@ -253,8 +255,8 @@ class DictionaryPageBuilder { } shared_ptr WriteDict() { - shared_ptr dict_buffer = - std::make_shared(encoder_->dict_encoded_size()); + std::shared_ptr dict_buffer = + AllocateBuffer(default_allocator(), encoder_->dict_encoded_size()); encoder_->WriteDict(dict_buffer->mutable_data()); return dict_buffer; } @@ -262,7 +264,7 @@ class DictionaryPageBuilder { int32_t num_values() const { return num_dict_values_; } private: - MemPool pool_; + ChunkedAllocator pool_; shared_ptr> encoder_; int32_t num_dict_values_; bool have_values_; diff --git a/cpp/src/parquet/column/writer.cc b/cpp/src/parquet/column/writer.cc index 6112efe217613..7319d4645f6e1 100644 --- a/cpp/src/parquet/column/writer.cc +++ b/cpp/src/parquet/column/writer.cc @@ -21,6 +21,7 @@ #include "parquet/column/statistics.h" #include "parquet/encodings/dictionary-encoding.h" #include "parquet/encodings/plain-encoding.h" +#include "parquet/util/memory.h" namespace parquet { @@ -55,8 +56,8 @@ ColumnWriter::ColumnWriter(ColumnChunkMetaDataBuilder* metadata, } void ColumnWriter::InitSinks() { - definition_levels_sink_.reset(new InMemoryOutputStream()); - repetition_levels_sink_.reset(new InMemoryOutputStream()); + definition_levels_sink_.reset(new InMemoryOutputStream(properties_->allocator())); + repetition_levels_sink_.reset(new InMemoryOutputStream(properties_->allocator())); } void ColumnWriter::WriteDefinitionLevels(int64_t num_levels, const int16_t* levels) { @@ -77,7 +78,8 @@ std::shared_ptr ColumnWriter::RleEncodeLevels( int64_t rle_size = 
LevelEncoder::MaxBufferSize(Encoding::RLE, max_level, num_buffered_values_) + sizeof(uint32_t); - auto buffer_rle = std::make_shared(rle_size, allocator_); + std::shared_ptr buffer_rle = + AllocateBuffer(properties_->allocator(), rle_size); level_encoder_.Init(Encoding::RLE, max_level, num_buffered_values_, buffer_rle->mutable_data() + sizeof(uint32_t), buffer_rle->size() - sizeof(uint32_t)); @@ -87,7 +89,7 @@ std::shared_ptr ColumnWriter::RleEncodeLevels( reinterpret_cast(buffer_rle->mutable_data())[0] = level_encoder_.len(); int64_t encoded_size = level_encoder_.len() + sizeof(uint32_t); DCHECK(rle_size >= encoded_size); - buffer_rle->Resize(encoded_size); + PARQUET_THROW_NOT_OK(buffer_rle->Resize(encoded_size)); return std::static_pointer_cast(buffer_rle); } @@ -110,8 +112,8 @@ void ColumnWriter::AddDataPage() { definition_levels->size() + repetition_levels->size() + values->size(); // Concatenate data into a single buffer - std::shared_ptr uncompressed_data = - std::make_shared(uncompressed_size, allocator_); + std::shared_ptr uncompressed_data = + AllocateBuffer(allocator_, uncompressed_size); uint8_t* uncompressed_ptr = uncompressed_data->mutable_data(); memcpy(uncompressed_ptr, repetition_levels->data(), repetition_levels->size()); uncompressed_ptr += repetition_levels->size(); @@ -223,7 +225,8 @@ void TypedColumnWriter::CheckDictionarySizeLimit() { template void TypedColumnWriter::WriteDictionaryPage() { auto dict_encoder = static_cast*>(current_encoder_.get()); - auto buffer = std::make_shared(dict_encoder->dict_encoded_size()); + std::shared_ptr buffer = + AllocateBuffer(properties_->allocator(), dict_encoder->dict_encoded_size()); dict_encoder->WriteDict(buffer->mutable_data()); // TODO Get rid of this deep call dict_encoder->mem_pool()->FreeAll(); diff --git a/cpp/src/parquet/column/writer.h b/cpp/src/parquet/column/writer.h index 67a29bc34b6d3..39d5934251f10 100644 --- a/cpp/src/parquet/column/writer.h +++ b/cpp/src/parquet/column/writer.h @@ -28,9 +28,7 @@ #include "parquet/file/metadata.h" #include "parquet/schema/descriptor.h" #include "parquet/types.h" -#include "parquet/util/mem-allocator.h" -#include "parquet/util/mem-pool.h" -#include "parquet/util/output.h" +#include "parquet/util/memory.h" #include "parquet/util/visibility.h" namespace parquet { @@ -111,7 +109,7 @@ class PARQUET_EXPORT ColumnWriter { LevelEncoder level_encoder_; MemoryAllocator* allocator_; - MemPool pool_; + ChunkedAllocator pool_; // The total number of values stored in the data page. This is the maximum of // the number of encoded definition levels or encoded values. 
For diff --git a/cpp/src/parquet/encodings/decoder.h b/cpp/src/parquet/encodings/decoder.h index 44425070aed18..1ac9f3550af91 100644 --- a/cpp/src/parquet/encodings/decoder.h +++ b/cpp/src/parquet/encodings/decoder.h @@ -22,7 +22,7 @@ #include "parquet/exception.h" #include "parquet/types.h" -#include "parquet/util/mem-allocator.h" +#include "parquet/util/memory.h" namespace parquet { diff --git a/cpp/src/parquet/encodings/delta-bit-pack-encoding.h b/cpp/src/parquet/encodings/delta-bit-pack-encoding.h index 5353817fb56fe..59774a469454b 100644 --- a/cpp/src/parquet/encodings/delta-bit-pack-encoding.h +++ b/cpp/src/parquet/encodings/delta-bit-pack-encoding.h @@ -24,7 +24,7 @@ #include "parquet/encodings/decoder.h" #include "parquet/util/bit-stream-utils.inline.h" -#include "parquet/util/buffer.h" +#include "parquet/util/memory.h" namespace parquet { @@ -36,7 +36,7 @@ class DeltaBitPackDecoder : public Decoder { explicit DeltaBitPackDecoder( const ColumnDescriptor* descr, MemoryAllocator* allocator = default_allocator()) : Decoder(descr, Encoding::DELTA_BINARY_PACKED), - delta_bit_widths_(0, allocator) { + delta_bit_widths_(new PoolBuffer(allocator)) { if (DType::type_num != Type::INT32 && DType::type_num != Type::INT64) { throw ParquetException("Delta bit pack encoding should only be for integer data."); } @@ -62,28 +62,31 @@ class DeltaBitPackDecoder : public Decoder { if (!decoder_.GetVlqInt(&num_mini_blocks_)) ParquetException::EofException(); if (!decoder_.GetVlqInt(&values_current_block_)) { ParquetException::EofException(); } if (!decoder_.GetZigZagVlqInt(&last_value_)) ParquetException::EofException(); - delta_bit_widths_.Resize(num_mini_blocks_); + PARQUET_THROW_NOT_OK(delta_bit_widths_->Resize(num_mini_blocks_)); + + uint8_t* bit_width_data = delta_bit_widths_->mutable_data(); if (!decoder_.GetZigZagVlqInt(&min_delta_)) ParquetException::EofException(); for (int i = 0; i < num_mini_blocks_; ++i) { - if (!decoder_.GetAligned(1, &delta_bit_widths_[i])) { + if (!decoder_.GetAligned(1, bit_width_data + i)) { ParquetException::EofException(); } } values_per_mini_block_ = block_size / num_mini_blocks_; mini_block_idx_ = 0; - delta_bit_width_ = delta_bit_widths_[0]; + delta_bit_width_ = bit_width_data[0]; values_current_mini_block_ = values_per_mini_block_; } template int GetInternal(T* buffer, int max_values) { max_values = std::min(max_values, num_values_); + const uint8_t* bit_width_data = delta_bit_widths_->data(); for (int i = 0; i < max_values; ++i) { if (UNLIKELY(values_current_mini_block_ == 0)) { ++mini_block_idx_; - if (mini_block_idx_ < static_cast(delta_bit_widths_.size())) { - delta_bit_width_ = delta_bit_widths_[mini_block_idx_]; + if (mini_block_idx_ < static_cast(delta_bit_widths_->size())) { + delta_bit_width_ = bit_width_data[mini_block_idx_]; values_current_mini_block_ = values_per_mini_block_; } else { InitBlock(); @@ -112,7 +115,7 @@ class DeltaBitPackDecoder : public Decoder { int32_t min_delta_; size_t mini_block_idx_; - OwnedMutableBuffer delta_bit_widths_; + std::unique_ptr delta_bit_widths_; int delta_bit_width_; int32_t last_value_; diff --git a/cpp/src/parquet/encodings/dictionary-encoding.h b/cpp/src/parquet/encodings/dictionary-encoding.h index 7823307dc9d0c..b79744af5f56a 100644 --- a/cpp/src/parquet/encodings/dictionary-encoding.h +++ b/cpp/src/parquet/encodings/dictionary-encoding.h @@ -27,11 +27,9 @@ #include "parquet/encodings/decoder.h" #include "parquet/encodings/encoder.h" #include "parquet/encodings/plain-encoding.h" -#include "parquet/util/buffer.h" 
#include "parquet/util/cpu-info.h" #include "parquet/util/hash-util.h" -#include "parquet/util/mem-allocator.h" -#include "parquet/util/mem-pool.h" +#include "parquet/util/memory.h" #include "parquet/util/rle-encoding.h" namespace parquet { @@ -48,7 +46,7 @@ class DictionaryDecoder : public Decoder { const ColumnDescriptor* descr, MemoryAllocator* allocator = default_allocator()) : Decoder(descr, Encoding::RLE_DICTIONARY), dictionary_(0, allocator), - byte_array_data_(0, allocator) {} + byte_array_data_(AllocateBuffer(allocator, 0)) {} // Perform type-specific initiatialization void SetDict(Decoder* dictionary); @@ -78,7 +76,7 @@ class DictionaryDecoder : public Decoder { // Data that contains the byte array data (byte_array_dictionary_ just has the // pointers). - OwnedMutableBuffer byte_array_data_; + std::shared_ptr byte_array_data_; RleDecoder idx_decoder_; }; @@ -106,11 +104,13 @@ inline void DictionaryDecoder::SetDict( for (int i = 0; i < num_dictionary_values; ++i) { total_size += dictionary_[i].len; } - byte_array_data_.Resize(total_size); + PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size)); int offset = 0; + + uint8_t* bytes_data = byte_array_data_->mutable_data(); for (int i = 0; i < num_dictionary_values; ++i) { - memcpy(&byte_array_data_[offset], dictionary_[i].ptr, dictionary_[i].len); - dictionary_[i].ptr = &byte_array_data_[offset]; + memcpy(bytes_data + offset, dictionary_[i].ptr, dictionary_[i].len); + dictionary_[i].ptr = bytes_data + offset; offset += dictionary_[i].len; } } @@ -124,11 +124,12 @@ inline void DictionaryDecoder::SetDict(Decoder* dictionary) int fixed_len = descr_->type_length(); int total_size = num_dictionary_values * fixed_len; - byte_array_data_.Resize(total_size); + PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size)); + uint8_t* bytes_data = byte_array_data_->mutable_data(); int offset = 0; for (int i = 0; i < num_dictionary_values; ++i) { - memcpy(&byte_array_data_[offset], dictionary_[i].ptr, fixed_len); - dictionary_[i].ptr = &byte_array_data_[offset]; + memcpy(bytes_data + offset, dictionary_[i].ptr, fixed_len); + dictionary_[i].ptr = bytes_data + offset; offset += fixed_len; } } @@ -158,7 +159,7 @@ class DictEncoder : public Encoder { public: typedef typename DType::c_type T; - explicit DictEncoder(const ColumnDescriptor* desc, MemPool* pool = nullptr, + explicit DictEncoder(const ColumnDescriptor* desc, ChunkedAllocator* pool = nullptr, MemoryAllocator* allocator = default_allocator()) : Encoder(desc, Encoding::PLAIN_DICTIONARY, allocator), allocator_(allocator), @@ -176,7 +177,7 @@ class DictEncoder : public Encoder { // TODO(wesm): think about how to address the construction semantics in // encodings/dictionary-encoding.h - void set_mem_pool(MemPool* pool) { pool_ = pool; } + void set_mem_pool(ChunkedAllocator* pool) { pool_ = pool; } void set_type_length(int type_length) { type_length_ = type_length; } @@ -215,11 +216,11 @@ class DictEncoder : public Encoder { void Put(const T& value); std::shared_ptr FlushValues() override { - auto buffer = std::make_shared( - EstimatedDataEncodedSize(), this->allocator_); + std::shared_ptr buffer = + AllocateBuffer(this->allocator_, EstimatedDataEncodedSize()); int result_size = WriteIndices(buffer->mutable_data(), EstimatedDataEncodedSize()); ClearIndices(); - buffer->Resize(result_size); + PARQUET_THROW_NOT_OK(buffer->Resize(result_size)); return buffer; }; @@ -233,7 +234,7 @@ class DictEncoder : public Encoder { /// dict_encoded_size() bytes. 
   void WriteDict(uint8_t* buffer);
 
-  MemPool* mem_pool() { return pool_; }
+  ChunkedAllocator* mem_pool() { return pool_; }
 
   /// The number of entries in the dictionary.
   int num_entries() const { return uniques_.size(); }
 
@@ -242,7 +243,7 @@ class DictEncoder : public Encoder {
   MemoryAllocator* allocator_;
 
   // For ByteArray / FixedLenByteArray data. Not owned
-  MemPool* pool_;
+  ChunkedAllocator* pool_;
 
   /// Size of the table. Must be a power of 2.
   int hash_table_size_;
diff --git a/cpp/src/parquet/encodings/encoder.h b/cpp/src/parquet/encodings/encoder.h
index a325ab5773258..c51f8d550933e 100644
--- a/cpp/src/parquet/encodings/encoder.h
+++ b/cpp/src/parquet/encodings/encoder.h
@@ -23,12 +23,11 @@
 #include "parquet/exception.h"
 #include "parquet/types.h"
+#include "parquet/util/memory.h"
 
 namespace parquet {
 
-class Buffer;
 class ColumnDescriptor;
-class OutputStream;
 
 // Base class for value encoders. Since encoders may or not have state (e.g.,
 // dictionary encoding) we use a class instance to maintain any state.
diff --git a/cpp/src/parquet/encodings/encoding-benchmark.cc b/cpp/src/parquet/encodings/encoding-benchmark.cc
index e62d758650bde..516e453d3c2d4 100644
--- a/cpp/src/parquet/encodings/encoding-benchmark.cc
+++ b/cpp/src/parquet/encodings/encoding-benchmark.cc
@@ -19,7 +19,7 @@
 #include "parquet/encodings/dictionary-encoding.h"
 #include "parquet/file/reader-internal.h"
-#include "parquet/util/mem-pool.h"
+#include "parquet/util/memory.h"
 
 namespace parquet {
 
@@ -101,23 +101,25 @@ static void DecodeDict(
   typedef typename Type::c_type T;
 
   int num_values = values.size();
-  MemPool pool;
+  ChunkedAllocator pool;
   MemoryAllocator* allocator = default_allocator();
 
   std::shared_ptr<ColumnDescriptor> descr = Int64Schema(Repetition::REQUIRED);
 
-  std::shared_ptr<OwnedMutableBuffer> dict_buffer =
-      std::make_shared<OwnedMutableBuffer>();
-  auto indices = std::make_shared<OwnedMutableBuffer>();
 
   DictEncoder encoder(descr.get(), &pool, allocator);
   for (int i = 0; i < num_values; ++i) {
     encoder.Put(values[i]);
   }
 
-  dict_buffer->Resize(encoder.dict_encoded_size());
+  std::shared_ptr<PoolBuffer> dict_buffer =
+      AllocateBuffer(allocator, encoder.dict_encoded_size());
+
+  std::shared_ptr<PoolBuffer> indices =
+      AllocateBuffer(allocator, encoder.EstimatedDataEncodedSize());
+
   encoder.WriteDict(dict_buffer->mutable_data());
-  indices->Resize(encoder.EstimatedDataEncodedSize());
   int actual_bytes = encoder.WriteIndices(indices->mutable_data(), indices->size());
-  indices->Resize(actual_bytes);
+
+  PARQUET_THROW_NOT_OK(indices->Resize(actual_bytes));
 
   while (state.KeepRunning()) {
     PlainDecoder dict_decoder(descr.get());
diff --git a/cpp/src/parquet/encodings/encoding-test.cc b/cpp/src/parquet/encodings/encoding-test.cc
index daa25cbc8804b..eccfc5d13ba36 100644
--- a/cpp/src/parquet/encodings/encoding-test.cc
+++ b/cpp/src/parquet/encodings/encoding-test.cc
@@ -28,8 +28,7 @@
 #include "parquet/schema/types.h"
 #include "parquet/types.h"
 #include "parquet/util/bit-util.h"
-#include "parquet/util/buffer.h"
-#include "parquet/util/output.h"
+#include "parquet/util/memory.h"
 #include "parquet/util/test-common.h"
 
 using std::string;
 
@@ -178,7 +177,7 @@ class TestEncodingBase : public ::testing::Test {
   }
 
  protected:
-  MemPool pool_;
+  ChunkedAllocator pool_;
   MemoryAllocator* allocator_;
 
   int num_values_;
@@ -250,10 +249,9 @@ class TestDictionaryEncoding : public TestEncodingBase {
   void CheckRoundtrip() {
     DictEncoder encoder(descr_.get(), &pool_);
-    dict_buffer_ = std::make_shared<OwnedMutableBuffer>();
-
     ASSERT_NO_THROW(encoder.Put(draws_, num_values_));
-    dict_buffer_->Resize(encoder.dict_encoded_size());
+    dict_buffer_ = AllocateBuffer(default_allocator(), encoder.dict_encoded_size());
+
     encoder.WriteDict(dict_buffer_->mutable_data());
     std::shared_ptr<Buffer> indices = encoder.FlushValues();
@@ -277,7 +275,7 @@ class TestDictionaryEncoding : public TestEncodingBase {
  protected:
   USING_BASE_MEMBERS();
-  std::shared_ptr<OwnedMutableBuffer> dict_buffer_;
+  std::shared_ptr<PoolBuffer> dict_buffer_;
 };
 
 TYPED_TEST_CASE(TestDictionaryEncoding, DictEncodedTypes);
diff --git a/cpp/src/parquet/encodings/plain-encoding.h b/cpp/src/parquet/encodings/plain-encoding.h
index a3d7b69b10706..d2127ef7e1ffd 100644
--- a/cpp/src/parquet/encodings/plain-encoding.h
+++ b/cpp/src/parquet/encodings/plain-encoding.h
@@ -25,8 +25,7 @@
 #include "parquet/encodings/encoder.h"
 #include "parquet/schema/descriptor.h"
 #include "parquet/util/bit-stream-utils.inline.h"
-#include "parquet/util/buffer.h"
-#include "parquet/util/output.h"
+#include "parquet/util/memory.h"
 
 namespace parquet {
 
@@ -163,8 +162,9 @@ class PlainEncoder : public Encoder {
   explicit PlainEncoder(
       const ColumnDescriptor* descr, MemoryAllocator* allocator = default_allocator())
-      : Encoder(descr, Encoding::PLAIN, allocator),
-        values_sink_(new InMemoryOutputStream(IN_MEMORY_DEFAULT_CAPACITY, allocator)) {}
+      : Encoder(descr, Encoding::PLAIN, allocator) {
+    values_sink_.reset(new InMemoryOutputStream(allocator));
+  }
 
   int64_t EstimatedDataEncodedSize() override { return values_sink_->Tell(); }
 
@@ -172,7 +172,7 @@ class PlainEncoder : public Encoder {
   void Put(const T* src, int num_values) override;
 
  protected:
-  std::shared_ptr<InMemoryOutputStream> values_sink_;
+  std::unique_ptr<InMemoryOutputStream> values_sink_;
 };
 
 template <>
@@ -181,10 +181,10 @@ class PlainEncoder : public Encoder {
   explicit PlainEncoder(
       const ColumnDescriptor* descr, MemoryAllocator* allocator = default_allocator())
       : Encoder(descr, Encoding::PLAIN, allocator),
-        bits_available_(IN_MEMORY_DEFAULT_CAPACITY * 8),
-        bits_buffer_(IN_MEMORY_DEFAULT_CAPACITY, allocator),
-        values_sink_(new InMemoryOutputStream(IN_MEMORY_DEFAULT_CAPACITY, allocator)) {
-    bit_writer_.reset(new BitWriter(bits_buffer_.mutable_data(), bits_buffer_.size()));
+        bits_available_(kInMemoryDefaultCapacity * 8),
+        bits_buffer_(AllocateBuffer(allocator, kInMemoryDefaultCapacity)),
+        values_sink_(new InMemoryOutputStream(allocator)) {
+    bit_writer_.reset(new BitWriter(bits_buffer_->mutable_data(), bits_buffer_->size()));
   }
 
   int64_t EstimatedDataEncodedSize() override {
@@ -196,12 +196,11 @@ class PlainEncoder : public Encoder {
       bit_writer_->Flush();
       values_sink_->Write(bit_writer_->buffer(), bit_writer_->bytes_written());
       bit_writer_->Clear();
-      bits_available_ = bits_buffer_.size() * 8;
+      bits_available_ = bits_buffer_->size() * 8;
     }
 
     std::shared_ptr<Buffer> buffer = values_sink_->GetBuffer();
-    values_sink_.reset(
-        new InMemoryOutputStream(IN_MEMORY_DEFAULT_CAPACITY, this->allocator_));
+    values_sink_.reset(new InMemoryOutputStream(this->allocator_));
     return buffer;
   }
 
@@ -225,7 +224,7 @@ class PlainEncoder : public Encoder {
                                                                                  \
     int bits_remaining = num_values - bit_offset;                                \
     while (bit_offset < num_values) {                                            \
-      bits_available_ = bits_buffer_.size() * 8;                                 \
+      bits_available_ = bits_buffer_->size() * 8;                                \
                                                                                  \
       int bits_to_write = std::min(bits_available_, bits_remaining);             \
       for (int i = bit_offset; i < bit_offset + bits_to_write; i++) {            \
@@ -249,15 +248,14 @@ class PlainEncoder : public Encoder {
  protected:
   int bits_available_;
   std::unique_ptr<BitWriter> bit_writer_;
-  OwnedMutableBuffer bits_buffer_;
-  std::shared_ptr<InMemoryOutputStream> values_sink_;
+  std::shared_ptr<PoolBuffer> bits_buffer_;
+  std::unique_ptr<InMemoryOutputStream> values_sink_;
 };
 
 template <typename DType>
 inline std::shared_ptr<Buffer> PlainEncoder<DType>::FlushValues() {
   std::shared_ptr<Buffer> buffer = values_sink_->GetBuffer();
-  values_sink_.reset(
-      new InMemoryOutputStream(IN_MEMORY_DEFAULT_CAPACITY, this->allocator_));
+  values_sink_.reset(new InMemoryOutputStream(this->allocator_));
   return buffer;
 }
 
diff --git a/cpp/src/parquet/file/file-deserialize-test.cc b/cpp/src/parquet/file/file-deserialize-test.cc
index 52878858bbca3..fbb511a31ae1c 100644
--- a/cpp/src/parquet/file/file-deserialize-test.cc
+++ b/cpp/src/parquet/file/file-deserialize-test.cc
@@ -33,12 +33,13 @@
 #include "parquet/thrift/parquet_types.h"
 #include "parquet/thrift/util.h"
 #include "parquet/types.h"
-#include "parquet/util/input.h"
-#include "parquet/util/output.h"
+#include "parquet/util/memory.h"
 #include "parquet/util/test-common.h"
 
 namespace parquet {
 
+using ::arrow::io::BufferReader;
+
 // Adds page statistics occupying a certain amount of bytes (for testing very
 // large page headers)
 static inline void AddDummyStats(int stat_size, format::DataPageHeader& data_page) {
@@ -234,11 +235,13 @@ TEST_F(TestPageSerde, LZONotSupported) {
 class TestParquetFileReader : public ::testing::Test {
  public:
   void AssertInvalidFileThrows(const std::shared_ptr<Buffer>& buffer) {
-    std::unique_ptr<RandomAccessSource> reader(new BufferReader(buffer));
     reader_.reset(new ParquetFileReader());
 
+    auto reader = std::make_shared<BufferReader>(buffer);
+    auto wrapper = std::unique_ptr(new ArrowInputFile(reader));
+
     ASSERT_THROW(
-        reader_->Open(SerializedFile::Open(std::move(reader))), ParquetException);
+        reader_->Open(SerializedFile::Open(std::move(wrapper))), ParquetException);
   }
 
 protected:
diff --git a/cpp/src/parquet/file/file-serialize-test.cc b/cpp/src/parquet/file/file-serialize-test.cc
index 3a11cd8294423..7a90eebe1544c 100644
--- a/cpp/src/parquet/file/file-serialize-test.cc
+++ b/cpp/src/parquet/file/file-serialize-test.cc
@@ -24,8 +24,7 @@
 #include "parquet/file/reader.h"
 #include "parquet/file/writer.h"
 #include "parquet/types.h"
-#include "parquet/util/input.h"
-#include "parquet/util/output.h"
+#include "parquet/util/memory.h"
 
 namespace parquet {
 
@@ -75,8 +74,9 @@ class TestSerialize : public PrimitiveTypedTest {
     file_writer->Close();
 
     auto buffer = sink->GetBuffer();
-    std::unique_ptr<RandomAccessSource> source(new BufferReader(buffer));
-    auto file_reader = ParquetFileReader::Open(std::move(source));
+
+    auto source = std::make_shared<::arrow::io::BufferReader>(buffer);
+    auto file_reader = ParquetFileReader::Open(source);
     ASSERT_EQ(num_columns_, file_reader->metadata()->num_columns());
     ASSERT_EQ(1, file_reader->metadata()->num_row_groups());
     ASSERT_EQ(100, file_reader->metadata()->num_rows());
diff --git a/cpp/src/parquet/file/metadata.cc b/cpp/src/parquet/file/metadata.cc
index adfcb699e5a7d..692a0f5eb9959 100644
--- a/cpp/src/parquet/file/metadata.cc
+++ b/cpp/src/parquet/file/metadata.cc
@@ -23,6 +23,7 @@
 #include "parquet/file/metadata.h"
 #include "parquet/schema/converter.h"
 #include "parquet/thrift/util.h"
+#include "parquet/util/memory.h"
 
 #include
 
diff --git a/cpp/src/parquet/file/metadata.h b/cpp/src/parquet/file/metadata.h
index c5dd03a60d100..ef19c98417a60 100644
--- a/cpp/src/parquet/file/metadata.h
+++ b/cpp/src/parquet/file/metadata.h
@@ -27,7 +27,7 @@
 #include "parquet/compression/codec.h"
 #include "parquet/schema/descriptor.h"
 #include "parquet/types.h"
-#include "parquet/util/output.h"
+#include "parquet/util/memory.h"
 #include "parquet/util/visibility.h"
 
 namespace parquet {
 
diff --git a/cpp/src/parquet/file/reader-internal.cc b/cpp/src/parquet/file/reader-internal.cc
index 37c790ce1ab43..2c3ebb32a5542 100644
--- a/cpp/src/parquet/file/reader-internal.cc +++ b/cpp/src/parquet/file/reader-internal.cc @@ -32,8 +32,7 @@ #include "parquet/schema/types.h" #include "parquet/thrift/util.h" #include "parquet/types.h" -#include "parquet/util/buffer.h" -#include "parquet/util/input.h" +#include "parquet/util/memory.h" namespace parquet { @@ -44,7 +43,7 @@ namespace parquet { SerializedPageReader::SerializedPageReader(std::unique_ptr stream, int64_t total_num_rows, Compression::type codec_type, MemoryAllocator* allocator) : stream_(std::move(stream)), - decompression_buffer_(0, allocator), + decompression_buffer_(AllocateBuffer(allocator, 0)), seen_num_rows_(0), total_num_rows_(total_num_rows) { max_page_header_size_ = DEFAULT_MAX_PAGE_HEADER_SIZE; @@ -97,12 +96,12 @@ std::shared_ptr SerializedPageReader::NextPage() { // Uncompress it if we need to if (decompressor_ != NULL) { // Grow the uncompressed buffer if we need to. - if (uncompressed_len > static_cast(decompression_buffer_.size())) { - decompression_buffer_.Resize(uncompressed_len); + if (uncompressed_len > static_cast(decompression_buffer_->size())) { + PARQUET_THROW_NOT_OK(decompression_buffer_->Resize(uncompressed_len)); } - decompressor_->Decompress( - compressed_len, buffer, uncompressed_len, &decompression_buffer_[0]); - buffer = &decompression_buffer_[0]; + decompressor_->Decompress(compressed_len, buffer, uncompressed_len, + decompression_buffer_->mutable_data()); + buffer = decompression_buffer_->data(); } auto page_buffer = std::make_shared(buffer, uncompressed_len); @@ -207,7 +206,7 @@ static constexpr uint32_t FOOTER_SIZE = 8; static constexpr uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'}; std::unique_ptr SerializedFile::Open( - std::unique_ptr source, ReaderProperties props) { + std::unique_ptr source, const ReaderProperties& props) { std::unique_ptr result( new SerializedFile(std::move(source), props)); @@ -239,39 +238,40 @@ const FileMetaData* SerializedFile::metadata() const { } SerializedFile::SerializedFile(std::unique_ptr source, - ReaderProperties props = default_reader_properties()) + const ReaderProperties& props = default_reader_properties()) : source_(std::move(source)), properties_(props) {} void SerializedFile::ParseMetaData() { - int64_t filesize = source_->Size(); + int64_t file_size = source_->Size(); - if (filesize < FOOTER_SIZE) { + if (file_size < FOOTER_SIZE) { throw ParquetException("Corrupted file, smaller than file footer"); } uint8_t footer_buffer[FOOTER_SIZE]; - source_->Seek(filesize - FOOTER_SIZE); + source_->Seek(file_size - FOOTER_SIZE); int64_t bytes_read = source_->Read(FOOTER_SIZE, footer_buffer); if (bytes_read != FOOTER_SIZE || memcmp(footer_buffer + 4, PARQUET_MAGIC, 4) != 0) { throw ParquetException("Invalid parquet file. Corrupt footer."); } uint32_t metadata_len = *reinterpret_cast(footer_buffer); - int64_t metadata_start = filesize - FOOTER_SIZE - metadata_len; - if (FOOTER_SIZE + metadata_len > filesize) { + int64_t metadata_start = file_size - FOOTER_SIZE - metadata_len; + if (FOOTER_SIZE + metadata_len > file_size) { throw ParquetException( "Invalid parquet file. 
File is less than " "file metadata size."); } source_->Seek(metadata_start); - OwnedMutableBuffer metadata_buffer(metadata_len, properties_.allocator()); - bytes_read = source_->Read(metadata_len, &metadata_buffer[0]); + std::shared_ptr metadata_buffer = + AllocateBuffer(properties_.allocator(), metadata_len); + bytes_read = source_->Read(metadata_len, metadata_buffer->mutable_data()); if (bytes_read != metadata_len) { throw ParquetException("Invalid parquet file. Could not read metadata bytes."); } - file_metadata_ = FileMetaData::Make(&metadata_buffer[0], &metadata_len); + file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len); } } // namespace parquet diff --git a/cpp/src/parquet/file/reader-internal.h b/cpp/src/parquet/file/reader-internal.h index 582ab3583a355..aa9b75e737890 100644 --- a/cpp/src/parquet/file/reader-internal.h +++ b/cpp/src/parquet/file/reader-internal.h @@ -29,7 +29,7 @@ #include "parquet/file/reader.h" #include "parquet/thrift/parquet_types.h" #include "parquet/types.h" -#include "parquet/util/input.h" +#include "parquet/util/memory.h" namespace parquet { @@ -62,7 +62,7 @@ class SerializedPageReader : public PageReader { // Compression codec to use. std::unique_ptr decompressor_; - OwnedMutableBuffer decompression_buffer_; + std::shared_ptr decompression_buffer_; // Maximum allowed page size uint32_t max_page_header_size_; @@ -104,7 +104,7 @@ class SerializedFile : public ParquetFileReader::Contents { // lifetime separately static std::unique_ptr Open( std::unique_ptr source, - ReaderProperties props = default_reader_properties()); + const ReaderProperties& props = default_reader_properties()); virtual void Close(); virtual std::shared_ptr GetRowGroup(int i); virtual const FileMetaData* metadata() const; @@ -113,7 +113,7 @@ class SerializedFile : public ParquetFileReader::Contents { private: // This class takes ownership of the provided data source explicit SerializedFile( - std::unique_ptr source, ReaderProperties props); + std::unique_ptr source, const ReaderProperties& props); std::unique_ptr source_; std::unique_ptr file_metadata_; diff --git a/cpp/src/parquet/file/reader.cc b/cpp/src/parquet/file/reader.cc index 06d2d8ea894bf..52fe57a5883a1 100644 --- a/cpp/src/parquet/file/reader.cc +++ b/cpp/src/parquet/file/reader.cc @@ -24,14 +24,16 @@ #include #include +#include "arrow/io/file.h" + #include "parquet/column/page.h" #include "parquet/column/reader.h" #include "parquet/column/scanner.h" #include "parquet/exception.h" #include "parquet/file/reader-internal.h" #include "parquet/types.h" -#include "parquet/util/input.h" #include "parquet/util/logging.h" +#include "parquet/util/memory.h" using std::string; using std::vector; @@ -69,26 +71,36 @@ ParquetFileReader::~ParquetFileReader() { } std::unique_ptr ParquetFileReader::Open( - std::unique_ptr source, ReaderProperties props) { - auto contents = SerializedFile::Open(std::move(source), props); + const std::shared_ptr<::arrow::io::ReadableFileInterface>& source, + const ReaderProperties& props) { + std::unique_ptr io_wrapper(new ArrowInputFile(source)); + return Open(std::move(io_wrapper), props); +} +std::unique_ptr ParquetFileReader::Open( + std::unique_ptr source, const ReaderProperties& props) { + auto contents = SerializedFile::Open(std::move(source), props); std::unique_ptr result(new ParquetFileReader()); result->Open(std::move(contents)); - return result; } std::unique_ptr ParquetFileReader::OpenFile( - const std::string& path, bool memory_map, ReaderProperties props) { - 
std::unique_ptr file; + const std::string& path, bool memory_map, const ReaderProperties& props) { + std::shared_ptr<::arrow::io::ReadableFileInterface> source; if (memory_map) { - file.reset(new MemoryMapSource(props.allocator())); + std::shared_ptr<::arrow::io::ReadableFile> handle; + PARQUET_THROW_NOT_OK( + ::arrow::io::ReadableFile::Open(path, props.allocator(), &handle)); + source = handle; } else { - file.reset(new LocalFileSource(props.allocator())); + std::shared_ptr<::arrow::io::MemoryMappedFile> handle; + PARQUET_THROW_NOT_OK( + ::arrow::io::MemoryMappedFile::Open(path, ::arrow::io::FileMode::READ, &handle)); + source = handle; } - file->Open(path); - return Open(std::move(file), props); + return Open(source, props); } void ParquetFileReader::Open(std::unique_ptr contents) { diff --git a/cpp/src/parquet/file/reader.h b/cpp/src/parquet/file/reader.h index ca28f67db01a4..1c245069cb2ad 100644 --- a/cpp/src/parquet/file/reader.h +++ b/cpp/src/parquet/file/reader.h @@ -30,12 +30,12 @@ #include "parquet/column/statistics.h" #include "parquet/file/metadata.h" #include "parquet/schema/descriptor.h" +#include "parquet/util/memory.h" #include "parquet/util/visibility.h" namespace parquet { class ColumnReader; -class RandomAccessSource; class PARQUET_EXPORT RowGroupReader { public: @@ -79,15 +79,27 @@ class PARQUET_EXPORT ParquetFileReader { ParquetFileReader(); ~ParquetFileReader(); + // Create a reader from some implementation of parquet-cpp's generic file + // input interface + // + // If you cannot provide exclusive access to your file resource, create a + // subclass of RandomAccessSource that wraps the shared resource + static std::unique_ptr Open( + std::unique_ptr source, + const ReaderProperties& props = default_reader_properties()); + + // Create a file reader instance from an Arrow file object. Thread-safety is + // the responsibility of the file implementation + static std::unique_ptr Open( + const std::shared_ptr<::arrow::io::ReadableFileInterface>& source, + const ReaderProperties& props = default_reader_properties()); + // API Convenience to open a serialized Parquet file on disk, using built-in IO // interface implementations that were created for testing, and may not be robust for // all use cases. 
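A minimal sketch of how the two reader entry points introduced here are meant to be used. The file name is illustrative only, and the two-argument ::arrow::io::ReadableFile::Open overload is assumed to be available (OpenFile itself goes through the pool-taking overload shown above):

#include <iostream>
#include <memory>
#include <string>

#include "arrow/io/file.h"
#include "parquet/file/reader.h"

int main() {
  const std::string path = "example.parquet";  // illustrative path, not part of the patch

  // Convenience API: open directly from a path, memory-mapped by default.
  std::unique_ptr<parquet::ParquetFileReader> reader =
      parquet::ParquetFileReader::OpenFile(path, /*memory_map=*/true);
  std::cout << "rows: " << reader->metadata()->num_rows() << std::endl;

  // Arrow API: hand an already-opened ReadableFileInterface to the shared_ptr
  // overload; thread-safety is then the file implementation's responsibility.
  std::shared_ptr<::arrow::io::ReadableFile> handle;
  if (!::arrow::io::ReadableFile::Open(path, &handle).ok()) { return 1; }
  std::unique_ptr<parquet::ParquetFileReader> arrow_backed_reader =
      parquet::ParquetFileReader::Open(handle);
  return 0;
}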
static std::unique_ptr OpenFile(const std::string& path, - bool memory_map = true, ReaderProperties props = default_reader_properties()); - - static std::unique_ptr Open( - std::unique_ptr source, - ReaderProperties props = default_reader_properties()); + bool memory_map = true, + const ReaderProperties& props = default_reader_properties()); void Open(std::unique_ptr contents); void Close(); diff --git a/cpp/src/parquet/file/writer-internal.cc b/cpp/src/parquet/file/writer-internal.cc index c4681bd2afcb5..48884adc1719c 100644 --- a/cpp/src/parquet/file/writer-internal.cc +++ b/cpp/src/parquet/file/writer-internal.cc @@ -20,7 +20,7 @@ #include "parquet/column/writer.h" #include "parquet/schema/converter.h" #include "parquet/thrift/util.h" -#include "parquet/util/output.h" +#include "parquet/util/memory.h" using parquet::schema::GroupNode; using parquet::schema::SchemaFlattener; @@ -37,6 +37,7 @@ SerializedPageWriter::SerializedPageWriter(OutputStream* sink, Compression::type ColumnChunkMetaDataBuilder* metadata, MemoryAllocator* allocator) : sink_(sink), metadata_(metadata), + allocator_(allocator), num_values_(0), dictionary_page_offset_(0), data_page_offset_(0), @@ -71,10 +72,13 @@ std::shared_ptr SerializedPageWriter::Compress( // Compress the data int64_t max_compressed_size = compressor_->MaxCompressedLen(buffer->size(), buffer->data()); - auto compression_buffer = std::make_shared(max_compressed_size); + + std::shared_ptr compression_buffer = + AllocateBuffer(allocator_, max_compressed_size); + int64_t compressed_size = compressor_->Compress(buffer->size(), buffer->data(), max_compressed_size, compression_buffer->mutable_data()); - compression_buffer->Resize(compressed_size); + PARQUET_THROW_NOT_OK(compression_buffer->Resize(compressed_size)); return compression_buffer; } @@ -182,7 +186,7 @@ void RowGroupSerializer::Close() { // FileSerializer std::unique_ptr FileSerializer::Open( - std::shared_ptr sink, const std::shared_ptr& schema, + const std::shared_ptr& sink, const std::shared_ptr& schema, const std::shared_ptr& properties) { std::unique_ptr result( new FileSerializer(sink, schema, properties)); @@ -248,7 +252,7 @@ void FileSerializer::WriteMetaData() { sink_->Write(PARQUET_MAGIC, 4); } -FileSerializer::FileSerializer(std::shared_ptr sink, +FileSerializer::FileSerializer(const std::shared_ptr& sink, const std::shared_ptr& schema, const std::shared_ptr& properties) : sink_(sink), diff --git a/cpp/src/parquet/file/writer-internal.h b/cpp/src/parquet/file/writer-internal.h index f1f76ab2ae08f..81a0837732035 100644 --- a/cpp/src/parquet/file/writer-internal.h +++ b/cpp/src/parquet/file/writer-internal.h @@ -26,6 +26,7 @@ #include "parquet/file/metadata.h" #include "parquet/file/writer.h" #include "parquet/thrift/parquet_types.h" +#include "parquet/util/memory.h" namespace parquet { @@ -54,6 +55,7 @@ class SerializedPageWriter : public PageWriter { private: OutputStream* sink_; ColumnChunkMetaDataBuilder* metadata_; + MemoryAllocator* allocator_; int64_t num_values_; int64_t dictionary_page_offset_; int64_t data_page_offset_; @@ -102,7 +104,7 @@ class RowGroupSerializer : public RowGroupWriter::Contents { class FileSerializer : public ParquetFileWriter::Contents { public: static std::unique_ptr Open( - std::shared_ptr sink, + const std::shared_ptr& sink, const std::shared_ptr& schema, const std::shared_ptr& properties = default_writer_properties()); @@ -119,7 +121,7 @@ class FileSerializer : public ParquetFileWriter::Contents { virtual ~FileSerializer(); private: - explicit 
FileSerializer(std::shared_ptr sink, + explicit FileSerializer(const std::shared_ptr& sink, const std::shared_ptr& schema, const std::shared_ptr& properties); diff --git a/cpp/src/parquet/file/writer.cc b/cpp/src/parquet/file/writer.cc index 8c9f52f0f0c3a..a381c226a48d2 100644 --- a/cpp/src/parquet/file/writer.cc +++ b/cpp/src/parquet/file/writer.cc @@ -18,7 +18,7 @@ #include "parquet/file/writer.h" #include "parquet/file/writer-internal.h" -#include "parquet/util/output.h" +#include "parquet/util/memory.h" using parquet::schema::GroupNode; @@ -51,13 +51,19 @@ ParquetFileWriter::~ParquetFileWriter() { } std::unique_ptr ParquetFileWriter::Open( - std::shared_ptr sink, const std::shared_ptr& schema, + const std::shared_ptr<::arrow::io::OutputStream>& sink, + const std::shared_ptr& schema, const std::shared_ptr& properties) { - auto contents = FileSerializer::Open(sink, schema, properties); + return Open(std::make_shared(sink), schema, properties); +} +std::unique_ptr ParquetFileWriter::Open( + const std::shared_ptr& sink, + const std::shared_ptr& schema, + const std::shared_ptr& properties) { + auto contents = FileSerializer::Open(sink, schema, properties); std::unique_ptr result(new ParquetFileWriter()); result->Open(std::move(contents)); - return result; } diff --git a/cpp/src/parquet/file/writer.h b/cpp/src/parquet/file/writer.h index e82f0166a9f88..6d7161b85628d 100644 --- a/cpp/src/parquet/file/writer.h +++ b/cpp/src/parquet/file/writer.h @@ -24,7 +24,7 @@ #include "parquet/column/properties.h" #include "parquet/schema/descriptor.h" #include "parquet/schema/types.h" -#include "parquet/util/mem-allocator.h" +#include "parquet/util/memory.h" #include "parquet/util/visibility.h" namespace parquet { @@ -97,7 +97,13 @@ class PARQUET_EXPORT ParquetFileWriter { ParquetFileWriter(); ~ParquetFileWriter(); - static std::unique_ptr Open(std::shared_ptr sink, + static std::unique_ptr Open( + const std::shared_ptr<::arrow::io::OutputStream>& sink, + const std::shared_ptr& schema, + const std::shared_ptr& properties = default_writer_properties()); + + static std::unique_ptr Open( + const std::shared_ptr& sink, const std::shared_ptr& schema, const std::shared_ptr& properties = default_writer_properties()); diff --git a/cpp/src/parquet/reader-test.cc b/cpp/src/parquet/reader-test.cc index d21a809530667..e3be9b0ae592b 100644 --- a/cpp/src/parquet/reader-test.cc +++ b/cpp/src/parquet/reader-test.cc @@ -23,17 +23,20 @@ #include #include +#include "arrow/io/file.h" + #include "parquet/column/reader.h" #include "parquet/column/scanner.h" #include "parquet/file/reader-internal.h" #include "parquet/file/reader.h" -#include "parquet/util/input.h" -#include "parquet/util/mem-allocator.h" +#include "parquet/util/memory.h" using std::string; namespace parquet { +using ReadableFile = ::arrow::io::ReadableFile; + const char* data_dir = std::getenv("PARQUET_TEST_DATA"); std::string alltypes_plain() { @@ -159,7 +162,7 @@ TEST_F(TestAllTypesPlain, ColumnSelectionOutOfRange) { ASSERT_THROW(reader_->DebugPrint(ss, columns), ParquetException); } -class TestLocalFileSource : public ::testing::Test { +class TestLocalFile : public ::testing::Test { public: void SetUp() { std::string dir_string(data_dir); @@ -168,24 +171,25 @@ class TestLocalFileSource : public ::testing::Test { ss << dir_string << "/" << "alltypes_plain.parquet"; - file.reset(new LocalFileSource()); - file->Open(ss.str()); + PARQUET_THROW_NOT_OK(ReadableFile::Open(ss.str(), &handle)); + fileno = handle->file_descriptor(); } void TearDown() {} protected: - 
std::unique_ptr file; + int fileno; + std::shared_ptr<::arrow::io::ReadableFile> handle; }; -TEST_F(TestLocalFileSource, FileClosedOnDestruction) { - int file_desc = file->file_descriptor(); +TEST_F(TestLocalFile, FileClosedOnDestruction) { { - auto contents = SerializedFile::Open(std::move(file)); + auto contents = SerializedFile::Open( + std::unique_ptr(new ArrowInputFile(handle))); std::unique_ptr result(new ParquetFileReader()); result->Open(std::move(contents)); } - ASSERT_EQ(-1, fcntl(file_desc, F_GETFD)); + ASSERT_EQ(-1, fcntl(fileno, F_GETFD)); ASSERT_EQ(EBADF, errno); } diff --git a/cpp/src/parquet/thrift/util.h b/cpp/src/parquet/thrift/util.h index 180043592b2f5..9d2b66fbe3b72 100644 --- a/cpp/src/parquet/thrift/util.h +++ b/cpp/src/parquet/thrift/util.h @@ -37,7 +37,7 @@ #include "parquet/exception.h" #include "parquet/thrift/parquet_types.h" #include "parquet/util/logging.h" -#include "parquet/util/output.h" +#include "parquet/util/memory.h" namespace parquet { diff --git a/cpp/src/parquet/util/CMakeLists.txt b/cpp/src/parquet/util/CMakeLists.txt index 3a4b1c9caf28d..7a9ccbadd7f89 100644 --- a/cpp/src/parquet/util/CMakeLists.txt +++ b/cpp/src/parquet/util/CMakeLists.txt @@ -20,17 +20,13 @@ install(FILES bit-stream-utils.h bit-stream-utils.inline.h bit-util.h - buffer.h buffer-builder.h compiler-util.h cpu-info.h hash-util.h - input.h logging.h macros.h - mem-allocator.h - mem-pool.h - output.h + memory.h rle-encoding.h stopwatch.h sse-util.h @@ -70,9 +66,6 @@ if (PARQUET_BUILD_BENCHMARKS) endif() ADD_PARQUET_TEST(bit-util-test) -ADD_PARQUET_TEST(buffer-test) ADD_PARQUET_TEST(comparison-test) -ADD_PARQUET_TEST(input-output-test) -ADD_PARQUET_TEST(mem-allocator-test) -ADD_PARQUET_TEST(mem-pool-test) +ADD_PARQUET_TEST(memory-test) ADD_PARQUET_TEST(rle-test) diff --git a/cpp/src/parquet/util/buffer-test.cc b/cpp/src/parquet/util/buffer-test.cc deleted file mode 100644 index ee5b00023f4be..0000000000000 --- a/cpp/src/parquet/util/buffer-test.cc +++ /dev/null @@ -1,65 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
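Outside of the tests, the in-memory round trip performed by the updated file-serialize-test looks roughly like the sketch below; the GroupNode schema is taken as a parameter here, and row-group/column writing is elided:

#include <memory>

#include "arrow/io/memory.h"
#include "parquet/file/reader.h"
#include "parquet/file/writer.h"
#include "parquet/util/memory.h"

// Write an (empty) file into an InMemoryOutputStream and reopen it through an
// ::arrow::io::BufferReader, mirroring the updated file-serialize-test.
std::unique_ptr<parquet::ParquetFileReader> RoundTripInMemory(
    const std::shared_ptr<parquet::schema::GroupNode>& schema) {
  auto sink =
      std::make_shared<parquet::InMemoryOutputStream>(parquet::default_allocator());

  std::unique_ptr<parquet::ParquetFileWriter> writer =
      parquet::ParquetFileWriter::Open(sink, schema);
  // ... AppendRowGroup() / column writes would go here ...
  writer->Close();

  auto buffer = sink->GetBuffer();
  auto source = std::make_shared<::arrow::io::BufferReader>(buffer);
  return parquet::ParquetFileReader::Open(source);
}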
- -#include -#include -#include -#include -#include -#include -#include - -#include "parquet/exception.h" -#include "parquet/util/buffer.h" - -using std::string; - -namespace parquet { - -class TestBuffer : public ::testing::Test {}; - -TEST_F(TestBuffer, Resize) { - OwnedMutableBuffer buf; - - ASSERT_EQ(0, buf.size()); - ASSERT_NO_THROW(buf.Resize(100)); - ASSERT_EQ(100, buf.size()); - ASSERT_NO_THROW(buf.Resize(200)); - ASSERT_EQ(200, buf.size()); - - // Make it smaller, too - ASSERT_NO_THROW(buf.Resize(50)); - ASSERT_EQ(50, buf.size()); -} - -TEST_F(TestBuffer, ResizeOOM) { -// Tests that deliberately throw Exceptions foul up valgrind and report -// red herring memory leaks -#ifndef PARQUET_VALGRIND - OwnedMutableBuffer buf; - ASSERT_NO_THROW(buf.Resize(100)); - int64_t to_alloc = std::numeric_limits::max(); - try { - buf.Resize(to_alloc); - FAIL() << "Exception not thrown"; - } catch (const ParquetException& e) { - // pass - } catch (const std::exception& e) { FAIL() << "Different exception thrown"; } -#endif -} - -} // namespace parquet diff --git a/cpp/src/parquet/util/buffer.cc b/cpp/src/parquet/util/buffer.cc deleted file mode 100644 index 0b7100c8f3b77..0000000000000 --- a/cpp/src/parquet/util/buffer.cc +++ /dev/null @@ -1,123 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
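The resize behaviour that the removed TestBuffer cases covered now goes through the buffers returned by AllocateBuffer, with failures surfaced as a Status and converted to exceptions at the call site. A sketch of the pattern used throughout this patch, assuming AllocateBuffer and PARQUET_THROW_NOT_OK are exported by parquet/util/memory.h and parquet/exception.h as the call sites above suggest:

#include <cstdint>
#include <memory>

#include "parquet/exception.h"
#include "parquet/util/memory.h"

void GrowAndShrink() {
  // Replaces direct construction of OwnedMutableBuffer.
  auto buffer = parquet::AllocateBuffer(parquet::default_allocator(), 100);

  // Resize now returns a Status instead of throwing directly.
  PARQUET_THROW_NOT_OK(buffer->Resize(200));  // grow
  PARQUET_THROW_NOT_OK(buffer->Resize(50));   // shrink

  uint8_t* data = buffer->mutable_data();  // valid for buffer->size() bytes
  (void)data;
}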
- -#include "parquet/util/buffer.h" - -#include -#include - -#include "parquet/exception.h" -#include "parquet/types.h" - -namespace parquet { - -Buffer::Buffer(const std::shared_ptr& parent, int64_t offset, int64_t size) { - data_ = parent->data() + offset; - size_ = size; - parent_ = parent; -} - -std::shared_ptr MutableBuffer::GetImmutableView() { - return std::make_shared(this->get_shared_ptr(), 0, size()); -} - -OwnedMutableBuffer::OwnedMutableBuffer(int64_t size, MemoryAllocator* allocator) - : ResizableBuffer(nullptr, 0), allocator_(allocator) { - Resize(size); -} - -OwnedMutableBuffer::~OwnedMutableBuffer() { - if (mutable_data_) { allocator_->Free(mutable_data_, capacity_); } -} - -void OwnedMutableBuffer::Reserve(int64_t new_capacity) { - if (!mutable_data_ || new_capacity > capacity_) { - if (mutable_data_) { - uint8_t* new_data = allocator_->Malloc(new_capacity); - memcpy(new_data, mutable_data_, size_); - allocator_->Free(mutable_data_, capacity_); - mutable_data_ = new_data; - } else { - mutable_data_ = allocator_->Malloc(new_capacity); - } - data_ = mutable_data_; - capacity_ = new_capacity; - } -} - -void OwnedMutableBuffer::Resize(int64_t new_size) { - Reserve(new_size); - size_ = new_size; -} - -uint8_t& OwnedMutableBuffer::operator[](int64_t i) { - return mutable_data_[i]; -} - -template -Vector::Vector(int64_t size, MemoryAllocator* allocator) - : buffer_(new OwnedMutableBuffer(size * sizeof(T), allocator)), - size_(size), - capacity_(size) { - if (size > 0) { - data_ = reinterpret_cast(buffer_->mutable_data()); - } else { - data_ = nullptr; - } -} - -template -void Vector::Reserve(int64_t new_capacity) { - if (new_capacity > capacity_) { - buffer_->Resize(new_capacity * sizeof(T)); - data_ = reinterpret_cast(buffer_->mutable_data()); - capacity_ = new_capacity; - } -} - -template -void Vector::Resize(int64_t new_size) { - Reserve(new_size); - size_ = new_size; -} - -template -void Vector::Assign(int64_t size, const T val) { - Resize(size); - for (int64_t i = 0; i < size_; i++) { - data_[i] = val; - } -} - -template -void Vector::Swap(Vector& v) { - buffer_.swap(v.buffer_); - std::swap(size_, v.size_); - std::swap(capacity_, v.capacity_); - std::swap(data_, v.data_); -} - -template class Vector; -template class Vector; -template class Vector; -template class Vector; -template class Vector; -template class Vector; -template class Vector; -template class Vector; - -} // namespace parquet diff --git a/cpp/src/parquet/util/buffer.h b/cpp/src/parquet/util/buffer.h deleted file mode 100644 index 58a5f5e2783d2..0000000000000 --- a/cpp/src/parquet/util/buffer.h +++ /dev/null @@ -1,149 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef PARQUET_UTIL_BUFFER_H -#define PARQUET_UTIL_BUFFER_H - -#include -#include -#include -#include -#include - -#include "parquet/util/macros.h" -#include "parquet/util/mem-allocator.h" -#include "parquet/util/visibility.h" - -namespace parquet { - -// ---------------------------------------------------------------------- -// Buffer classes - -// Immutable API for a chunk of bytes which may or may not be owned by the -// class instance -class PARQUET_EXPORT Buffer : public std::enable_shared_from_this { - public: - Buffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {} - - // An offset into data that is owned by another buffer, but we want to be - // able to retain a valid pointer to it even after other shared_ptr's to the - // parent buffer have been destroyed - Buffer(const std::shared_ptr& parent, int64_t offset, int64_t size); - - std::shared_ptr get_shared_ptr() { return shared_from_this(); } - - // Return true if both buffers are the same size and contain the same bytes - // up to the number of compared bytes - bool Equals(const Buffer& other, int64_t nbytes) const { - return this == &other || (size_ >= nbytes && other.size_ >= nbytes && - !memcmp(data_, other.data_, nbytes)); - } - - bool Equals(const Buffer& other) const { - return this == &other || (size_ == other.size_ && !memcmp(data_, other.data_, size_)); - } - - const uint8_t* data() const { return data_; } - - int64_t size() const { return size_; } - - // Returns true if this Buffer is referencing memory (possibly) owned by some - // other buffer - bool is_shared() const { return static_cast(parent_); } - - const std::shared_ptr parent() const { return parent_; } - - protected: - const uint8_t* data_; - int64_t size_; - - // nullptr by default, but may be set - std::shared_ptr parent_; - - private: - DISALLOW_COPY_AND_ASSIGN(Buffer); -}; - -// A Buffer whose contents can be mutated. May or may not own its data. -class PARQUET_EXPORT MutableBuffer : public Buffer { - public: - MutableBuffer(uint8_t* data, int64_t size) : Buffer(data, size) { - mutable_data_ = data; - } - - uint8_t* mutable_data() { return mutable_data_; } - - // Get a read-only view of this buffer - std::shared_ptr GetImmutableView(); - - protected: - MutableBuffer() : Buffer(nullptr, 0), mutable_data_(nullptr) {} - - uint8_t* mutable_data_; -}; - -class PARQUET_EXPORT ResizableBuffer : public MutableBuffer { - public: - virtual void Resize(int64_t new_size) = 0; - - protected: - ResizableBuffer(uint8_t* data, int64_t size) - : MutableBuffer(data, size), capacity_(size) {} - int64_t capacity_; -}; - -// A ResizableBuffer whose memory is owned by the class instance. 
For example, -// for reading data out of files that you want to deallocate when this class is -// garbage-collected -class PARQUET_EXPORT OwnedMutableBuffer : public ResizableBuffer { - public: - explicit OwnedMutableBuffer( - int64_t size = 0, MemoryAllocator* allocator = default_allocator()); - virtual ~OwnedMutableBuffer(); - void Resize(int64_t new_size) override; - void Reserve(int64_t new_capacity); - uint8_t& operator[](int64_t i); - - private: - // TODO: aligned allocations - MemoryAllocator* allocator_; - - DISALLOW_COPY_AND_ASSIGN(OwnedMutableBuffer); -}; - -template -class Vector { - public: - explicit Vector(int64_t size, MemoryAllocator* allocator); - void Resize(int64_t new_size); - void Reserve(int64_t new_capacity); - void Assign(int64_t size, const T val); - void Swap(Vector& v); - inline T& operator[](int64_t i) const { return data_[i]; } - - private: - std::unique_ptr buffer_; - int64_t size_; - int64_t capacity_; - T* data_; - - DISALLOW_COPY_AND_ASSIGN(Vector); -}; - -} // namespace parquet - -#endif // PARQUET_UTIL_BUFFER_H diff --git a/cpp/src/parquet/util/input-output-test.cc b/cpp/src/parquet/util/input-output-test.cc deleted file mode 100644 index 72aad9ce1e1bc..0000000000000 --- a/cpp/src/parquet/util/input-output-test.cc +++ /dev/null @@ -1,244 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
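The stream semantics exercised by the deleted tests below are unchanged; InMemoryInputStream and the InputStream interface are assumed to move into parquet/util/memory.h along with the rest of these utilities, as the surviving call sites imply. A sketch of the Peek/Read/Advance contract:

#include <cstdint>
#include <memory>

#include "parquet/util/memory.h"

void ScanStream(const std::shared_ptr<parquet::Buffer>& buffer) {
  parquet::InMemoryInputStream stream(buffer);

  int64_t bytes_read = 0;
  // Peek does not advance the stream; repeated calls return the same bytes.
  const uint8_t* peeked = stream.Peek(4, &bytes_read);
  // Read returns the same pointer but advances the position by *num_bytes.
  const uint8_t* read = stream.Read(4, &bytes_read);
  // Advance skips bytes without copying them anywhere.
  stream.Advance(8);
  (void)peeked;
  (void)read;
}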
- -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "parquet/exception.h" -#include "parquet/util/buffer.h" -#include "parquet/util/input.h" -#include "parquet/util/mem-allocator.h" -#include "parquet/util/output.h" -#include "parquet/util/test-common.h" - -namespace parquet { - -TEST(TestBufferedInputStream, Basics) { - int64_t source_size = 256; - int64_t stream_offset = 10; - int64_t stream_size = source_size - stream_offset; - int64_t chunk_size = 50; - auto buf = std::make_shared(source_size); - ASSERT_EQ(source_size, buf->size()); - for (int i = 0; i < source_size; i++) { - buf->mutable_data()[i] = i; - } - - std::unique_ptr source(new BufferReader(buf)); - std::unique_ptr allocator(new TrackingAllocator()); - std::unique_ptr stream(new BufferedInputStream( - allocator.get(), chunk_size, source.get(), stream_offset, stream_size)); - - const uint8_t* output; - int64_t bytes_read; - - // source is at offset 10 - output = stream->Peek(10, &bytes_read); - ASSERT_EQ(10, bytes_read); - for (int i = 0; i < 10; i++) { - ASSERT_EQ(10 + i, output[i]) << i; - } - output = stream->Read(10, &bytes_read); - ASSERT_EQ(10, bytes_read); - for (int i = 0; i < 10; i++) { - ASSERT_EQ(10 + i, output[i]) << i; - } - output = stream->Read(10, &bytes_read); - ASSERT_EQ(10, bytes_read); - for (int i = 0; i < 10; i++) { - ASSERT_EQ(20 + i, output[i]) << i; - } - stream->Advance(5); - stream->Advance(5); - // source is at offset 40 - // read across buffer boundary. buffer size is 50 - output = stream->Read(20, &bytes_read); - ASSERT_EQ(20, bytes_read); - for (int i = 0; i < 20; i++) { - ASSERT_EQ(40 + i, output[i]) << i; - } - // read more than original chunk_size - output = stream->Read(60, &bytes_read); - ASSERT_EQ(60, bytes_read); - for (int i = 0; i < 60; i++) { - ASSERT_EQ(60 + i, output[i]) << i; - } - - stream->Advance(120); - // source is at offset 240 - // read outside of source boundary. 
source size is 256 - output = stream->Read(30, &bytes_read); - ASSERT_EQ(16, bytes_read); - for (int i = 0; i < 16; i++) { - ASSERT_EQ(240 + i, output[i]) << i; - } -} - -TEST(TestInMemoryOutputStream, Basics) { - std::unique_ptr stream(new InMemoryOutputStream(8)); - - std::vector data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - - stream->Write(&data[0], 4); - ASSERT_EQ(4, stream->Tell()); - stream->Write(&data[4], data.size() - 4); - - std::shared_ptr buffer = stream->GetBuffer(); - - Buffer data_buf(data.data(), data.size()); - - ASSERT_TRUE(data_buf.Equals(*buffer)); -} - -TEST(TestBufferedReader, Basics) { - std::vector data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - auto buffer = std::make_shared(data.data(), data.size()); - BufferReader reader(buffer); - - uint8_t out[4]; - ASSERT_EQ(4, reader.Read(4, out)); - ASSERT_EQ(4, reader.Tell()); - ASSERT_EQ(0, out[0]); - ASSERT_EQ(1, out[1]); - ASSERT_EQ(2, out[2]); - ASSERT_EQ(3, out[3]); - - reader.Seek(8); - ASSERT_EQ(8, reader.Tell()); - - auto out_buffer = reader.Read(5); - ASSERT_EQ(8, out_buffer->data()[0]); - ASSERT_EQ(9, out_buffer->data()[1]); - ASSERT_EQ(10, out_buffer->data()[2]); - ASSERT_EQ(11, out_buffer->data()[3]); - ASSERT_EQ(12, out_buffer->data()[4]); - - // Read past the end of the buffer - ASSERT_EQ(13, reader.Tell()); - ASSERT_EQ(0, reader.Read(4, out)); - ASSERT_EQ(0, reader.Read(4)->size()); - - reader.Close(); -} - -static bool file_exists(const std::string& path) { - return std::ifstream(path.c_str()).good(); -} - -template -class TestFileReaders : public ::testing::Test { - public: - void SetUp() { - test_path_ = "parquet-input-output-test.txt"; - if (file_exists(test_path_)) { std::remove(test_path_.c_str()); } - test_data_ = "testingdata"; - - std::ofstream stream; - stream.open(test_path_.c_str()); - stream << test_data_; - filesize_ = test_data_.size(); - } - - void TearDown() { DeleteTestFile(); } - - void DeleteTestFile() { - if (file_exists(test_path_)) { std::remove(test_path_.c_str()); } - } - - protected: - ReaderType source; - std::string test_path_; - std::string test_data_; - int filesize_; -}; - -typedef ::testing::Types ReaderTypes; - -TYPED_TEST_CASE(TestFileReaders, ReaderTypes); - -TYPED_TEST(TestFileReaders, NonExistentFile) { - ASSERT_THROW(this->source.Open("0xDEADBEEF.txt"), ParquetException); -} - -TYPED_TEST(TestFileReaders, Read) { - this->source.Open(this->test_path_); - - ASSERT_EQ(this->filesize_, this->source.Size()); - - std::shared_ptr buffer = this->source.Read(4); - ASSERT_EQ(4, buffer->size()); - ASSERT_EQ(0, memcmp(this->test_data_.c_str(), buffer->data(), 4)); - - // Read past EOF - buffer = this->source.Read(10); - ASSERT_EQ(7, buffer->size()); - ASSERT_EQ(0, memcmp(this->test_data_.c_str() + 4, buffer->data(), 7)); -} - -TYPED_TEST(TestFileReaders, FileDisappeared) { - this->source.Open(this->test_path_); - this->source.Seek(4); - this->DeleteTestFile(); - this->source.Close(); -} - -TYPED_TEST(TestFileReaders, BadSeek) { - this->source.Open(this->test_path_); - ASSERT_THROW(this->source.Seek(this->filesize_ + 1), ParquetException); -} - -class TestFileWriter : public ::testing::Test { - public: - void SetUp() { - test_path_ = "parquet-input-output-test.txt"; - if (file_exists(test_path_)) { std::remove(test_path_.c_str()); } - } - - void TearDown() { DeleteTestFile(); } - - void DeleteTestFile() { - if (file_exists(test_path_)) { std::remove(test_path_.c_str()); } - } - - protected: - std::string test_path_; - uint8_t test_data_[4] = {1, 2, 3, 4}; -}; - 
-TEST_F(TestFileWriter, Write) { - LocalFileOutputStream sink(test_path_); - ASSERT_EQ(0, sink.Tell()); - sink.Write(test_data_, 4); - ASSERT_EQ(4, sink.Tell()); - sink.Close(); - - // Check that the correct content was written - LocalFileSource source; - source.Open(test_path_); - std::shared_ptr buffer = source.Read(4); - ASSERT_EQ(4, buffer->size()); - ASSERT_EQ(0, memcmp(test_data_, buffer->data(), 4)); -} - -} // namespace parquet diff --git a/cpp/src/parquet/util/input.cc b/cpp/src/parquet/util/input.cc deleted file mode 100644 index 127b90c09b6ef..0000000000000 --- a/cpp/src/parquet/util/input.cc +++ /dev/null @@ -1,285 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "parquet/util/input.h" - -#include -#include -#include -#include - -#include "parquet/exception.h" -#include "parquet/util/buffer.h" -#include "parquet/util/logging.h" - -namespace parquet { - -// ---------------------------------------------------------------------- -// RandomAccessSource - -std::shared_ptr RandomAccessSource::ReadAt(int64_t pos, int64_t nbytes) { - Seek(pos); - return Read(nbytes); -} - -int64_t RandomAccessSource::Size() const { - return size_; -} - -// ---------------------------------------------------------------------- -// LocalFileSource - -LocalFileSource::~LocalFileSource() { - CloseFile(); -} - -void LocalFileSource::Open(const std::string& path) { - path_ = path; - file_ = fopen(path_.c_str(), "rb"); - if (file_ == nullptr || ferror(file_)) { - std::stringstream ss; - ss << "Unable to open file: " << path; - throw ParquetException(ss.str()); - } - is_open_ = true; - SeekFile(0, SEEK_END); - size_ = LocalFileSource::Tell(); - Seek(0); -} - -void LocalFileSource::SeekFile(int64_t pos, int origin) { - if (origin == SEEK_SET && (pos < 0 || pos >= size_)) { - std::stringstream ss; - ss << "Position " << pos << " is not in range."; - throw ParquetException(ss.str()); - } - - if (0 != fseek(file_, pos, origin)) { - std::stringstream ss; - ss << "File seek to position " << pos << " failed."; - throw ParquetException(ss.str()); - } -} - -void LocalFileSource::Close() { - // Pure virtual - CloseFile(); -} - -void LocalFileSource::CloseFile() { - if (is_open_) { - fclose(file_); - is_open_ = false; - } -} - -void LocalFileSource::Seek(int64_t pos) { - SeekFile(pos); -} - -int64_t LocalFileSource::Tell() const { - int64_t position = ftell(file_); - if (position < 0) { throw ParquetException("ftell failed, did the file disappear?"); } - return position; -} - -int LocalFileSource::file_descriptor() const { - return fileno(file_); -} - -int64_t LocalFileSource::Read(int64_t nbytes, uint8_t* buffer) { - return fread(buffer, 1, nbytes, file_); -} - -std::shared_ptr LocalFileSource::Read(int64_t nbytes) { - auto result = std::make_shared(0, 
allocator_); - result->Resize(nbytes); - - int64_t bytes_read = Read(nbytes, result->mutable_data()); - if (bytes_read < nbytes) { result->Resize(bytes_read); } - return result; -} -// ---------------------------------------------------------------------- -// MemoryMapSource methods - -MemoryMapSource::~MemoryMapSource() { - CloseFile(); -} - -void MemoryMapSource::Open(const std::string& path) { - LocalFileSource::Open(path); - data_ = reinterpret_cast( - mmap(nullptr, size_, PROT_READ, MAP_SHARED, fileno(file_), 0)); - if (data_ == nullptr) { throw ParquetException("Memory mapping file failed"); } - pos_ = 0; -} - -void MemoryMapSource::Close() { - // Pure virtual - CloseFile(); -} - -void MemoryMapSource::CloseFile() { - if (data_ != nullptr) { - munmap(data_, size_); - data_ = nullptr; - } - - LocalFileSource::CloseFile(); -} - -void MemoryMapSource::Seek(int64_t pos) { - if (pos < 0 || pos >= size_) { - std::stringstream ss; - ss << "Position " << pos << " is not in range."; - throw ParquetException(ss.str()); - } - - pos_ = pos; -} - -int64_t MemoryMapSource::Tell() const { - return pos_; -} - -int64_t MemoryMapSource::Read(int64_t nbytes, uint8_t* buffer) { - int64_t bytes_available = std::min(nbytes, size_ - pos_); - memcpy(buffer, data_ + pos_, bytes_available); - pos_ += bytes_available; - return bytes_available; -} - -std::shared_ptr MemoryMapSource::Read(int64_t nbytes) { - int64_t bytes_available = std::min(nbytes, size_ - pos_); - auto result = std::make_shared(data_ + pos_, bytes_available); - pos_ += bytes_available; - return result; -} - -// ---------------------------------------------------------------------- -// BufferReader - -BufferReader::BufferReader(const std::shared_ptr& buffer) - : buffer_(buffer), data_(buffer->data()), pos_(0) { - size_ = buffer->size(); -} - -int64_t BufferReader::Tell() const { - return pos_; -} - -void BufferReader::Seek(int64_t pos) { - if (pos < 0 || pos >= size_) { - std::stringstream ss; - ss << "Cannot seek to " << pos << "File is length " << size_; - throw ParquetException(ss.str()); - } - pos_ = pos; -} - -int64_t BufferReader::Read(int64_t nbytes, uint8_t* out) { - int64_t bytes_available = std::min(nbytes, size_ - pos_); - memcpy(out, Head(), bytes_available); - pos_ += bytes_available; - return bytes_available; -} - -std::shared_ptr BufferReader::Read(int64_t nbytes) { - int64_t bytes_available = std::min(nbytes, size_ - pos_); - auto result = std::make_shared(Head(), bytes_available); - pos_ += bytes_available; - return result; -} - -// ---------------------------------------------------------------------- -// InMemoryInputStream - -InMemoryInputStream::InMemoryInputStream(const std::shared_ptr& buffer) - : buffer_(buffer), offset_(0) { - len_ = buffer_->size(); -} - -InMemoryInputStream::InMemoryInputStream( - RandomAccessSource* source, int64_t start, int64_t num_bytes) - : offset_(0) { - buffer_ = source->ReadAt(start, num_bytes); - if (buffer_->size() < num_bytes) { - throw ParquetException("Unable to read column chunk data"); - } - len_ = buffer_->size(); -} - -const uint8_t* InMemoryInputStream::Peek(int64_t num_to_peek, int64_t* num_bytes) { - *num_bytes = std::min(static_cast(num_to_peek), len_ - offset_); - return buffer_->data() + offset_; -} - -const uint8_t* InMemoryInputStream::Read(int64_t num_to_read, int64_t* num_bytes) { - const uint8_t* result = Peek(num_to_read, num_bytes); - offset_ += *num_bytes; - return result; -} - -void InMemoryInputStream::Advance(int64_t num_bytes) { - offset_ += num_bytes; -} - -// 
---------------------------------------------------------------------- -// BufferedInputStream -BufferedInputStream::BufferedInputStream(MemoryAllocator* pool, int64_t buffer_size, - RandomAccessSource* source, int64_t start, int64_t num_bytes) - : source_(source), stream_offset_(start), stream_end_(start + num_bytes) { - buffer_ = std::make_shared(buffer_size, pool); - buffer_size_ = buffer_->size(); - // Required to force a lazy read - buffer_offset_ = buffer_size_; -} - -const uint8_t* BufferedInputStream::Peek(int64_t num_to_peek, int64_t* num_bytes) { - *num_bytes = std::min(num_to_peek, stream_end_ - stream_offset_); - // increase the buffer size if needed - if (*num_bytes > buffer_size_) { - buffer_->Resize(*num_bytes); - buffer_size_ = buffer_->size(); - DCHECK(buffer_size_ >= *num_bytes); - } - // Read more data when buffer has insufficient left or when resized - if (*num_bytes > (buffer_size_ - buffer_offset_)) { - source_->Seek(stream_offset_); - buffer_size_ = std::min(buffer_size_, stream_end_ - stream_offset_); - int64_t bytes_read = source_->Read(buffer_size_, buffer_->mutable_data()); - if (bytes_read < *num_bytes) { - throw ParquetException("Failed reading column data from source"); - } - buffer_offset_ = 0; - } - return buffer_->data() + buffer_offset_; -} - -const uint8_t* BufferedInputStream::Read(int64_t num_to_read, int64_t* num_bytes) { - const uint8_t* result = Peek(num_to_read, num_bytes); - stream_offset_ += *num_bytes; - buffer_offset_ += *num_bytes; - return result; -} - -void BufferedInputStream::Advance(int64_t num_bytes) { - stream_offset_ += num_bytes; - buffer_offset_ += num_bytes; -} - -} // namespace parquet diff --git a/cpp/src/parquet/util/input.h b/cpp/src/parquet/util/input.h deleted file mode 100644 index 1bb41e38e7d09..0000000000000 --- a/cpp/src/parquet/util/input.h +++ /dev/null @@ -1,211 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_UTIL_INPUT_H -#define PARQUET_UTIL_INPUT_H - -#include -#include -#include -#include -#include - -#include "parquet/util/mem-allocator.h" -#include "parquet/util/visibility.h" - -namespace parquet { - -class Buffer; -class OwnedMutableBuffer; - -// ---------------------------------------------------------------------- -// Random access input (e.g. 
file-like) - -// Random -class PARQUET_EXPORT RandomAccessSource { - public: - virtual ~RandomAccessSource() {} - - virtual void Close() = 0; - virtual int64_t Tell() const = 0; - virtual void Seek(int64_t pos) = 0; - int64_t Size() const; - - // Returns actual number of bytes read - virtual int64_t Read(int64_t nbytes, uint8_t* out) = 0; - - virtual std::shared_ptr Read(int64_t nbytes) = 0; - std::shared_ptr ReadAt(int64_t pos, int64_t nbytes); - - protected: - int64_t size_; -}; - -// ---------------------------------------------------------------------- -// Implementations of RandomAccessSource used for testing and internal CLI tools. -// May not be sufficiently robust for general production use. - -class PARQUET_EXPORT LocalFileSource : public RandomAccessSource { - public: - explicit LocalFileSource(MemoryAllocator* allocator = default_allocator()) - : file_(nullptr), is_open_(false), allocator_(allocator) {} - - virtual ~LocalFileSource(); - - virtual void Open(const std::string& path); - - virtual void Close(); - virtual int64_t Tell() const; - virtual void Seek(int64_t pos); - - // Returns actual number of bytes read - virtual int64_t Read(int64_t nbytes, uint8_t* out); - - virtual std::shared_ptr Read(int64_t nbytes); - - bool is_open() const { return is_open_; } - const std::string& path() const { return path_; } - - // Return the integer file descriptor - int file_descriptor() const; - - protected: - void CloseFile(); - void SeekFile(int64_t pos, int origin = SEEK_SET); - - std::string path_; - FILE* file_; - bool is_open_; - MemoryAllocator* allocator_; -}; - -class PARQUET_EXPORT MemoryMapSource : public LocalFileSource { - public: - explicit MemoryMapSource(MemoryAllocator* allocator = default_allocator()) - : LocalFileSource(allocator), data_(nullptr), pos_(0) {} - - virtual ~MemoryMapSource(); - - virtual void Close(); - virtual void Open(const std::string& path); - - virtual int64_t Tell() const; - virtual void Seek(int64_t pos); - - // Copy data from memory map into out (must be already allocated memory) - // @returns: actual number of bytes read - virtual int64_t Read(int64_t nbytes, uint8_t* out); - - // Return a buffer referencing memory-map (no copy) - virtual std::shared_ptr Read(int64_t nbytes); - - private: - void CloseFile(); - - uint8_t* data_; - int64_t pos_; -}; - -// ---------------------------------------------------------------------- -// A file-like object that reads from virtual address space - -class PARQUET_EXPORT BufferReader : public RandomAccessSource { - public: - explicit BufferReader(const std::shared_ptr& buffer); - virtual void Close() {} - virtual int64_t Tell() const; - virtual void Seek(int64_t pos); - - virtual int64_t Read(int64_t nbytes, uint8_t* out); - - virtual std::shared_ptr Read(int64_t nbytes); - - protected: - const uint8_t* Head() { return data_ + pos_; } - - std::shared_ptr buffer_; - const uint8_t* data_; - int64_t pos_; -}; - -// ---------------------------------------------------------------------- -// Streaming input interfaces - -// Interface for the column reader to get the bytes. The interface is a stream -// interface, meaning the bytes in order and once a byte is read, it does not -// need to be read again. -class InputStream { - public: - // Returns the next 'num_to_peek' without advancing the current position. - // *num_bytes will contain the number of bytes returned which can only be - // less than num_to_peek at end of stream cases. - // Since the position is not advanced, calls to this function are idempotent. 
- // The buffer returned to the caller is still owned by the input stream and must - // stay valid until the next call to Peek() or Read(). - virtual const uint8_t* Peek(int64_t num_to_peek, int64_t* num_bytes) = 0; - - // Identical to Peek(), except the current position in the stream is advanced by - // *num_bytes. - virtual const uint8_t* Read(int64_t num_to_read, int64_t* num_bytes) = 0; - - // Advance the stream without reading - virtual void Advance(int64_t num_bytes) = 0; - - virtual ~InputStream() {} - - protected: - InputStream() {} -}; - -// Implementation of an InputStream when all the bytes are in memory. -class InMemoryInputStream : public InputStream { - public: - InMemoryInputStream(RandomAccessSource* source, int64_t start, int64_t end); - explicit InMemoryInputStream(const std::shared_ptr& buffer); - virtual const uint8_t* Peek(int64_t num_to_peek, int64_t* num_bytes); - virtual const uint8_t* Read(int64_t num_to_read, int64_t* num_bytes); - - virtual void Advance(int64_t num_bytes); - - private: - std::shared_ptr buffer_; - int64_t len_; - int64_t offset_; -}; - -// Implementation of an InputStream when only some of the bytes are in memory. -class BufferedInputStream : public InputStream { - public: - BufferedInputStream(MemoryAllocator* pool, int64_t buffer_size, - RandomAccessSource* source, int64_t start, int64_t end); - virtual const uint8_t* Peek(int64_t num_to_peek, int64_t* num_bytes); - virtual const uint8_t* Read(int64_t num_to_read, int64_t* num_bytes); - - virtual void Advance(int64_t num_bytes); - - private: - std::shared_ptr buffer_; - RandomAccessSource* source_; - int64_t stream_offset_; - int64_t stream_end_; - int64_t buffer_offset_; - int64_t buffer_size_; -}; - -} // namespace parquet - -#endif // PARQUET_UTIL_INPUT_H diff --git a/cpp/src/parquet/util/mem-allocator-test.cc b/cpp/src/parquet/util/mem-allocator-test.cc deleted file mode 100644 index 336d3b45de45f..0000000000000 --- a/cpp/src/parquet/util/mem-allocator-test.cc +++ /dev/null @@ -1,67 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
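The allocator behaviour exercised by the deleted test below is unchanged in substance; TrackingAllocator and the MemoryAllocator interface are assumed to now live in parquet/util/memory.h. A sketch:

#include <cstdint>

#include "parquet/util/memory.h"

void TrackAllocations() {
  parquet::TrackingAllocator allocator;

  // Malloc throws ParquetException on failure and returns nullptr for size 0.
  uint8_t* data = allocator.Malloc(100);
  // TotalMemory() reflects live allocations; MaxMemory() is the high-water mark.
  int64_t live = allocator.TotalMemory();  // 100
  int64_t peak = allocator.MaxMemory();    // 100
  allocator.Free(data, 100);               // size must match the allocation
  (void)live;
  (void)peak;
}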
- -#include - -#include "parquet/exception.h" -#include "parquet/util/mem-allocator.h" - -namespace parquet { - -TEST(TestAllocator, AllocateFree) { - TrackingAllocator allocator; - - uint8_t* data = allocator.Malloc(100); - ASSERT_TRUE(nullptr != data); - data[99] = 55; - allocator.Free(data, 100); - - data = allocator.Malloc(0); - ASSERT_EQ(nullptr, data); - allocator.Free(data, 0); - - data = allocator.Malloc(1); - ASSERT_THROW(allocator.Free(data, 2), ParquetException); - ASSERT_NO_THROW(allocator.Free(data, 1)); - - int64_t to_alloc = std::numeric_limits::max(); - ASSERT_THROW(allocator.Malloc(to_alloc), ParquetException); -} - -TEST(TestAllocator, TotalMax) { - TrackingAllocator allocator; - ASSERT_EQ(0, allocator.TotalMemory()); - ASSERT_EQ(0, allocator.MaxMemory()); - - uint8_t* data = allocator.Malloc(100); - ASSERT_EQ(100, allocator.TotalMemory()); - ASSERT_EQ(100, allocator.MaxMemory()); - - uint8_t* data2 = allocator.Malloc(10); - ASSERT_EQ(110, allocator.TotalMemory()); - ASSERT_EQ(110, allocator.MaxMemory()); - - allocator.Free(data, 100); - ASSERT_EQ(10, allocator.TotalMemory()); - ASSERT_EQ(110, allocator.MaxMemory()); - - allocator.Free(data2, 10); - ASSERT_EQ(0, allocator.TotalMemory()); - ASSERT_EQ(110, allocator.MaxMemory()); -} - -} // namespace parquet diff --git a/cpp/src/parquet/util/mem-allocator.cc b/cpp/src/parquet/util/mem-allocator.cc deleted file mode 100644 index 2b6592d052493..0000000000000 --- a/cpp/src/parquet/util/mem-allocator.cc +++ /dev/null @@ -1,61 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "parquet/util/mem-allocator.h" - -#include - -#include "parquet/exception.h" - -namespace parquet { - -MemoryAllocator::~MemoryAllocator() {} - -uint8_t* TrackingAllocator::Malloc(int64_t size) { - if (0 == size) { return nullptr; } - - uint8_t* p = static_cast(std::malloc(size)); - if (!p) { throw ParquetException("OOM: memory allocation failed"); } - { - std::lock_guard lock(stats_mutex_); - total_memory_ += size; - if (total_memory_ > max_memory_) { max_memory_ = total_memory_; } - } - return p; -} - -void TrackingAllocator::Free(uint8_t* p, int64_t size) { - if (nullptr != p && size > 0) { - { - std::lock_guard lock(stats_mutex_); - if (total_memory_ < size) { - throw ParquetException("Attempting to free too much memory"); - } - total_memory_ -= size; - } - std::free(p); - } -} - -TrackingAllocator::~TrackingAllocator() {} - -MemoryAllocator* default_allocator() { - static TrackingAllocator default_allocator; - return &default_allocator; -} - -} // namespace parquet diff --git a/cpp/src/parquet/util/mem-allocator.h b/cpp/src/parquet/util/mem-allocator.h deleted file mode 100644 index a0f3693307e2c..0000000000000 --- a/cpp/src/parquet/util/mem-allocator.h +++ /dev/null @@ -1,59 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_UTIL_MEMORY_POOL_H -#define PARQUET_UTIL_MEMORY_POOL_H - -#include -#include - -#include "parquet/util/visibility.h" - -namespace parquet { - -class PARQUET_EXPORT MemoryAllocator { - public: - virtual ~MemoryAllocator(); - - // Returns nullptr if size is 0 - virtual uint8_t* Malloc(int64_t size) = 0; - virtual void Free(uint8_t* p, int64_t size) = 0; -}; - -PARQUET_EXPORT MemoryAllocator* default_allocator(); - -class PARQUET_EXPORT TrackingAllocator : public MemoryAllocator { - public: - TrackingAllocator() : total_memory_(0), max_memory_(0) {} - virtual ~TrackingAllocator(); - - uint8_t* Malloc(int64_t size) override; - void Free(uint8_t* p, int64_t size) override; - - int64_t TotalMemory() { return total_memory_; } - - int64_t MaxMemory() { return max_memory_; } - - private: - std::mutex stats_mutex_; - int64_t total_memory_; - int64_t max_memory_; -}; - -} // namespace parquet - -#endif // PARQUET_UTIL_MEMORY_POOL_H diff --git a/cpp/src/parquet/util/mem-pool.cc b/cpp/src/parquet/util/mem-pool.cc deleted file mode 100644 index 1ab40bcf5b8aa..0000000000000 --- a/cpp/src/parquet/util/mem-pool.cc +++ /dev/null @@ -1,264 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Initially imported from Apache Impala on 2016-02-23, and has been modified -// since for parquet-cpp - -#include "parquet/util/mem-pool.h" - -#include - -#include -#include -#include -#include - -#include "parquet/util/bit-util.h" -#include "parquet/util/logging.h" - -namespace parquet { - -const int MemPool::INITIAL_CHUNK_SIZE; -const int MemPool::MAX_CHUNK_SIZE; - -MemPool::MemPool(MemoryAllocator* allocator) - : current_chunk_idx_(-1), - next_chunk_size_(INITIAL_CHUNK_SIZE), - total_allocated_bytes_(0), - peak_allocated_bytes_(0), - total_reserved_bytes_(0), - allocator_(allocator) {} - -MemPool::ChunkInfo::ChunkInfo(int64_t size, uint8_t* buf) - : data(buf), size(size), allocated_bytes(0) {} - -MemPool::~MemPool() { - int64_t total_bytes_released = 0; - for (size_t i = 0; i < chunks_.size(); ++i) { - total_bytes_released += chunks_[i].size; - allocator_->Free(chunks_[i].data, chunks_[i].size); - } - - DCHECK(chunks_.empty()) << "Must call FreeAll() or AcquireData() for this pool"; -} - -void MemPool::ReturnPartialAllocation(int byte_size) { - DCHECK_GE(byte_size, 0); - DCHECK(current_chunk_idx_ != -1); - ChunkInfo& info = chunks_[current_chunk_idx_]; - DCHECK_GE(info.allocated_bytes, byte_size); - info.allocated_bytes -= byte_size; - total_allocated_bytes_ -= byte_size; -} - -template -uint8_t* MemPool::Allocate(int size) { - if (size == 0) return NULL; - - int64_t num_bytes = BitUtil::RoundUp(size, 8); - if (current_chunk_idx_ == -1 || - num_bytes + chunks_[current_chunk_idx_].allocated_bytes > - chunks_[current_chunk_idx_].size) { - // If we couldn't allocate a new chunk, return NULL. - if (UNLIKELY(!FindChunk(num_bytes))) return NULL; - } - ChunkInfo& info = chunks_[current_chunk_idx_]; - uint8_t* result = info.data + info.allocated_bytes; - DCHECK_LE(info.allocated_bytes + num_bytes, info.size); - info.allocated_bytes += num_bytes; - total_allocated_bytes_ += num_bytes; - DCHECK_LE(current_chunk_idx_, static_cast(chunks_.size()) - 1); - peak_allocated_bytes_ = std::max(total_allocated_bytes_, peak_allocated_bytes_); - return result; -} - -uint8_t* MemPool::Allocate(int size) { - return Allocate(size); -} - -void MemPool::Clear() { - current_chunk_idx_ = -1; - for (auto chunk = chunks_.begin(); chunk != chunks_.end(); ++chunk) { - chunk->allocated_bytes = 0; - } - total_allocated_bytes_ = 0; - DCHECK(CheckIntegrity(false)); -} - -void MemPool::FreeAll() { - int64_t total_bytes_released = 0; - for (size_t i = 0; i < chunks_.size(); ++i) { - total_bytes_released += chunks_[i].size; - allocator_->Free(chunks_[i].data, chunks_[i].size); - } - chunks_.clear(); - next_chunk_size_ = INITIAL_CHUNK_SIZE; - current_chunk_idx_ = -1; - total_allocated_bytes_ = 0; - total_reserved_bytes_ = 0; -} - -bool MemPool::FindChunk(int64_t min_size) { - // Try to allocate from a free chunk. The first free chunk, if any, will be immediately - // after the current chunk. 
- int first_free_idx = current_chunk_idx_ + 1; - // (cast size() to signed int in order to avoid everything else being cast to - // unsigned long, in particular -1) - while (++current_chunk_idx_ < static_cast(chunks_.size())) { - // we found a free chunk - DCHECK_EQ(chunks_[current_chunk_idx_].allocated_bytes, 0); - - if (chunks_[current_chunk_idx_].size >= min_size) { - // This chunk is big enough. Move it before the other free chunks. - if (current_chunk_idx_ != first_free_idx) { - std::swap(chunks_[current_chunk_idx_], chunks_[first_free_idx]); - current_chunk_idx_ = first_free_idx; - } - break; - } - } - - if (current_chunk_idx_ == static_cast(chunks_.size())) { - // need to allocate new chunk. - int64_t chunk_size; - DCHECK_GE(next_chunk_size_, INITIAL_CHUNK_SIZE); - DCHECK_LE(next_chunk_size_, MAX_CHUNK_SIZE); - - chunk_size = std::max(min_size, next_chunk_size_); - - // Allocate a new chunk. Return early if malloc fails. - uint8_t* buf = allocator_->Malloc(chunk_size); - if (UNLIKELY(buf == NULL)) { - DCHECK_EQ(current_chunk_idx_, static_cast(chunks_.size())); - current_chunk_idx_ = static_cast(chunks_.size()) - 1; - return false; - } - - // If there are no free chunks put it at the end, otherwise before the first free. - if (first_free_idx == static_cast(chunks_.size())) { - chunks_.push_back(ChunkInfo(chunk_size, buf)); - } else { - current_chunk_idx_ = first_free_idx; - auto insert_chunk = chunks_.begin() + current_chunk_idx_; - chunks_.insert(insert_chunk, ChunkInfo(chunk_size, buf)); - } - total_reserved_bytes_ += chunk_size; - // Don't increment the chunk size until the allocation succeeds: if an attempted - // large allocation fails we don't want to increase the chunk size further. - next_chunk_size_ = - static_cast(std::min(chunk_size * 2, MAX_CHUNK_SIZE)); - } - - DCHECK_LT(current_chunk_idx_, static_cast(chunks_.size())); - DCHECK(CheckIntegrity(true)); - return true; -} - -void MemPool::AcquireData(MemPool* src, bool keep_current) { - DCHECK(src->CheckIntegrity(false)); - int num_acquired_chunks; - if (keep_current) { - num_acquired_chunks = src->current_chunk_idx_; - } else if (src->GetFreeOffset() == 0) { - // nothing in the last chunk - num_acquired_chunks = src->current_chunk_idx_; - } else { - num_acquired_chunks = src->current_chunk_idx_ + 1; - } - - if (num_acquired_chunks <= 0) { - if (!keep_current) src->FreeAll(); - return; - } - - auto end_chunk = src->chunks_.begin() + num_acquired_chunks; - int64_t total_transfered_bytes = 0; - for (auto i = src->chunks_.begin(); i != end_chunk; ++i) { - total_transfered_bytes += i->size; - } - src->total_reserved_bytes_ -= total_transfered_bytes; - total_reserved_bytes_ += total_transfered_bytes; - - // insert new chunks after current_chunk_idx_ - auto insert_chunk = chunks_.begin() + current_chunk_idx_ + 1; - chunks_.insert(insert_chunk, src->chunks_.begin(), end_chunk); - src->chunks_.erase(src->chunks_.begin(), end_chunk); - current_chunk_idx_ += num_acquired_chunks; - - if (keep_current) { - src->current_chunk_idx_ = 0; - DCHECK(src->chunks_.size() == 1 || src->chunks_[1].allocated_bytes == 0); - total_allocated_bytes_ += src->total_allocated_bytes_ - src->GetFreeOffset(); - src->total_allocated_bytes_ = src->GetFreeOffset(); - } else { - src->current_chunk_idx_ = -1; - total_allocated_bytes_ += src->total_allocated_bytes_; - src->total_allocated_bytes_ = 0; - } - peak_allocated_bytes_ = std::max(total_allocated_bytes_, peak_allocated_bytes_); - - if (!keep_current) src->FreeAll(); - DCHECK(CheckIntegrity(false)); -} - 
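AcquireData() above transfers every chunk that holds data into the destination pool; with keep_current == false the source is then emptied entirely. A small sketch of those semantics, written against ChunkedAllocator, the name this class receives later in the patch; TransferExample is illustrative only:

#include <cstring>
#include "parquet/util/memory.h"

void TransferExample() {
  parquet::ChunkedAllocator src;
  uint8_t* v = src.Allocate(1024);
  std::memset(v, 0, 1024);

  parquet::ChunkedAllocator dst;
  dst.AcquireData(&src, /*keep_current=*/false);
  // dst now owns the chunk backing 'v'; src.total_allocated_bytes() == 0.
  dst.FreeAll();  // a pool must be drained with FreeAll() (or AcquireData) before destruction
}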
-std::string MemPool::DebugString() { - std::stringstream out; - char str[16]; - out << "MemPool(#chunks=" << chunks_.size() << " ["; - for (size_t i = 0; i < chunks_.size(); ++i) { - sprintf(str, "0x%lx=", reinterpret_cast(chunks_[i].data)); // NOLINT - out << (i > 0 ? " " : "") << str << chunks_[i].size << "/" - << chunks_[i].allocated_bytes; - } - out << "] current_chunk=" << current_chunk_idx_ - << " total_sizes=" << GetTotalChunkSizes() - << " total_alloc=" << total_allocated_bytes_ << ")"; - return out.str(); -} - -int64_t MemPool::GetTotalChunkSizes() const { - int64_t result = 0; - for (size_t i = 0; i < chunks_.size(); ++i) { - result += chunks_[i].size; - } - return result; -} - -bool MemPool::CheckIntegrity(bool current_chunk_empty) { - // check that current_chunk_idx_ points to the last chunk with allocated data - DCHECK_LT(current_chunk_idx_, static_cast(chunks_.size())); - int64_t total_allocated = 0; - for (int i = 0; i < static_cast(chunks_.size()); ++i) { - DCHECK_GT(chunks_[i].size, 0); - if (i < current_chunk_idx_) { - DCHECK_GT(chunks_[i].allocated_bytes, 0); - } else if (i == current_chunk_idx_) { - if (current_chunk_empty) { - DCHECK_EQ(chunks_[i].allocated_bytes, 0); - } else { - DCHECK_GT(chunks_[i].allocated_bytes, 0); - } - } else { - DCHECK_EQ(chunks_[i].allocated_bytes, 0); - } - total_allocated += chunks_[i].allocated_bytes; - } - DCHECK_EQ(total_allocated, total_allocated_bytes_); - return true; -} - -} // namespace parquet diff --git a/cpp/src/parquet/util/mem-pool.h b/cpp/src/parquet/util/mem-pool.h deleted file mode 100644 index 5f6afa993cd6e..0000000000000 --- a/cpp/src/parquet/util/mem-pool.h +++ /dev/null @@ -1,179 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Initially imported from Apache Impala on 2016-02-23, and has been modified -// since for parquet-cpp - -#ifndef PARQUET_UTIL_MEM_POOL_H -#define PARQUET_UTIL_MEM_POOL_H - -#include -#include -#include -#include -#include - -#include "parquet/util/mem-allocator.h" - -namespace parquet { - -/// A MemPool maintains a list of memory chunks from which it allocates memory -/// in response to Allocate() calls; -/// Chunks stay around for the lifetime of the mempool or until they are passed on to -/// another mempool. -// -/// An Allocate() call will attempt to allocate memory from the chunk that was most -/// recently added; if that chunk doesn't have enough memory to -/// satisfy the allocation request, the free chunks are searched for one that is -/// big enough otherwise a new chunk is added to the list. -/// The current_chunk_idx_ always points to the last chunk with allocated memory. -/// In order to keep allocation overhead low, chunk sizes double with each new one -/// added, until they hit a maximum size. 
-// -/// Example: -/// MemPool* p = new MemPool(); -/// for (int i = 0; i < 1024; ++i) { -/// returns 8-byte aligned memory (effectively 24 bytes): -/// .. = p->Allocate(17); -/// } -/// at this point, 17K have been handed out in response to Allocate() calls and -/// 28K of chunks have been allocated (chunk sizes: 4K, 8K, 16K) -/// We track total and peak allocated bytes. At this point they would be the same: -/// 28k bytes. A call to Clear will return the allocated memory so -/// total_allocate_bytes_ -/// becomes 0 while peak_allocate_bytes_ remains at 28k. -/// p->Clear(); -/// the entire 1st chunk is returned: -/// .. = p->Allocate(4 * 1024); -/// 4K of the 2nd chunk are returned: -/// .. = p->Allocate(4 * 1024); -/// a new 20K chunk is created -/// .. = p->Allocate(20 * 1024); -// -/// MemPool* p2 = new MemPool(); -/// the new mempool receives all chunks containing data from p -/// p2->AcquireData(p, false); -/// At this point p.total_allocated_bytes_ would be 0 while p.peak_allocated_bytes_ -/// remains unchanged. -/// The one remaining (empty) chunk is released: -/// delete p; - -class MemPool { - public: - explicit MemPool(MemoryAllocator* allocator = default_allocator()); - - /// Frees all chunks of memory and subtracts the total allocated bytes - /// from the registered limits. - ~MemPool(); - - /// Allocates 8-byte aligned section of memory of 'size' bytes at the end - /// of the the current chunk. Creates a new chunk if there aren't any chunks - /// with enough capacity. - uint8_t* Allocate(int size); - - /// Returns 'byte_size' to the current chunk back to the mem pool. This can - /// only be used to return either all or part of the previous allocation returned - /// by Allocate(). - void ReturnPartialAllocation(int byte_size); - - /// Makes all allocated chunks available for re-use, but doesn't delete any chunks. - void Clear(); - - /// Deletes all allocated chunks. FreeAll() or AcquireData() must be called for - /// each mem pool - void FreeAll(); - - /// Absorb all chunks that hold data from src. If keep_current is true, let src hold on - /// to its last allocated chunk that contains data. - /// All offsets handed out by calls to GetCurrentOffset() for 'src' become invalid. - void AcquireData(MemPool* src, bool keep_current); - - std::string DebugString(); - - int64_t total_allocated_bytes() const { return total_allocated_bytes_; } - int64_t peak_allocated_bytes() const { return peak_allocated_bytes_; } - int64_t total_reserved_bytes() const { return total_reserved_bytes_; } - - /// Return sum of chunk_sizes_. - int64_t GetTotalChunkSizes() const; - - private: - friend class MemPoolTest; - static const int INITIAL_CHUNK_SIZE = 4 * 1024; - - /// The maximum size of chunk that should be allocated. Allocations larger than this - /// size will get their own individual chunk. - static const int MAX_CHUNK_SIZE = 1024 * 1024; - - struct ChunkInfo { - uint8_t* data; // Owned by the ChunkInfo. - int64_t size; // in bytes - - /// bytes allocated via Allocate() in this chunk - int64_t allocated_bytes; - - explicit ChunkInfo(int64_t size, uint8_t* buf); - - ChunkInfo() : data(NULL), size(0), allocated_bytes(0) {} - }; - - /// chunk from which we served the last Allocate() call; - /// always points to the last chunk that contains allocated data; - /// chunks 0..current_chunk_idx_ are guaranteed to contain data - /// (chunks_[i].allocated_bytes > 0 for i: 0..current_chunk_idx_); - /// -1 if no chunks present - int current_chunk_idx_; - - /// The size of the next chunk to allocate. 
- int64_t next_chunk_size_; - - /// sum of allocated_bytes_ - int64_t total_allocated_bytes_; - - /// Maximum number of bytes allocated from this pool at one time. - int64_t peak_allocated_bytes_; - - /// sum of all bytes allocated in chunks_ - int64_t total_reserved_bytes_; - - std::vector chunks_; - - MemoryAllocator* allocator_; - - /// Find or allocated a chunk with at least min_size spare capacity and update - /// current_chunk_idx_. Also updates chunks_, chunk_sizes_ and allocated_bytes_ - /// if a new chunk needs to be created. - bool FindChunk(int64_t min_size); - - /// Check integrity of the supporting data structures; always returns true but DCHECKs - /// all invariants. - /// If 'current_chunk_empty' is false, checks that the current chunk contains data. - bool CheckIntegrity(bool current_chunk_empty); - - /// Return offset to unoccpied space in current chunk. - int GetFreeOffset() const { - if (current_chunk_idx_ == -1) return 0; - return chunks_[current_chunk_idx_].allocated_bytes; - } - - template - uint8_t* Allocate(int size); -}; - -} // namespace parquet - -#endif // PARQUET_UTIL_MEM_POOL_H diff --git a/cpp/src/parquet/util/mem-pool-test.cc b/cpp/src/parquet/util/memory-test.cc similarity index 56% rename from cpp/src/parquet/util/mem-pool-test.cc rename to cpp/src/parquet/util/memory-test.cc index 3f3424b02aa1b..45aa819ed5caf 100644 --- a/cpp/src/parquet/util/mem-pool-test.cc +++ b/cpp/src/parquet/util/memory-test.cc @@ -15,37 +15,82 @@ // specific language governing permissions and limitations // under the License. -// Initially imported from Apache Impala on 2016-02-23, and has been modified -// since for parquet-cpp - #include -#include -#include +#include +#include #include +#include + +#include -#include "parquet/util/bit-util.h" -#include "parquet/util/mem-pool.h" +#include "parquet/exception.h" +#include "parquet/util/memory.h" +#include "parquet/util/test-common.h" namespace parquet { +class TestBuffer : public ::testing::Test {}; + +TEST(TestAllocator, AllocateFree) { + TrackingAllocator allocator; + + uint8_t* data; + + ASSERT_TRUE(allocator.Allocate(100, &data).ok()); + ASSERT_TRUE(nullptr != data); + data[99] = 55; + allocator.Free(data, 100); + + ASSERT_TRUE(allocator.Allocate(0, &data).ok()); + ASSERT_EQ(nullptr, data); + allocator.Free(data, 0); + + int64_t to_alloc = std::numeric_limits::max(); + ASSERT_FALSE(allocator.Allocate(to_alloc, &data).ok()); +} + +TEST(TestAllocator, TotalMax) { + TrackingAllocator allocator; + ASSERT_EQ(0, allocator.bytes_allocated()); + ASSERT_EQ(0, allocator.max_memory()); + + uint8_t* data; + uint8_t* data2; + ASSERT_TRUE(allocator.Allocate(100, &data).ok()); + ASSERT_EQ(100, allocator.bytes_allocated()); + ASSERT_EQ(100, allocator.max_memory()); + + ASSERT_TRUE(allocator.Allocate(10, &data2).ok()); + ASSERT_EQ(110, allocator.bytes_allocated()); + ASSERT_EQ(110, allocator.max_memory()); + + allocator.Free(data, 100); + ASSERT_EQ(10, allocator.bytes_allocated()); + ASSERT_EQ(110, allocator.max_memory()); + + allocator.Free(data2, 10); + ASSERT_EQ(0, allocator.bytes_allocated()); + ASSERT_EQ(110, allocator.max_memory()); +} + // Utility class to call private functions on MemPool. 
-class MemPoolTest { +class ChunkedAllocatorTest { public: - static bool CheckIntegrity(MemPool* pool, bool current_chunk_empty) { + static bool CheckIntegrity(ChunkedAllocator* pool, bool current_chunk_empty) { return pool->CheckIntegrity(current_chunk_empty); } - static const int INITIAL_CHUNK_SIZE = MemPool::INITIAL_CHUNK_SIZE; - static const int MAX_CHUNK_SIZE = MemPool::MAX_CHUNK_SIZE; + static const int INITIAL_CHUNK_SIZE = ChunkedAllocator::INITIAL_CHUNK_SIZE; + static const int MAX_CHUNK_SIZE = ChunkedAllocator::MAX_CHUNK_SIZE; }; -const int MemPoolTest::INITIAL_CHUNK_SIZE; -const int MemPoolTest::MAX_CHUNK_SIZE; +const int ChunkedAllocatorTest::INITIAL_CHUNK_SIZE; +const int ChunkedAllocatorTest::MAX_CHUNK_SIZE; -TEST(MemPoolTest, Basic) { - MemPool p; - MemPool p2; - MemPool p3; +TEST(ChunkedAllocatorTest, Basic) { + ChunkedAllocator p; + ChunkedAllocator p2; + ChunkedAllocator p3; for (int iter = 0; iter < 2; ++iter) { // allocate a total of 24K in 32-byte pieces (for which we only request 25 bytes) @@ -135,8 +180,8 @@ TEST(MemPoolTest, Basic) { // This case verifies that when chunks are acquired by another memory pool the // remaining chunks are consistent if there were more than one used chunk and some // free chunks. -TEST(MemPoolTest, Keep) { - MemPool p; +TEST(ChunkedAllocatorTest, Keep) { + ChunkedAllocator p; p.Allocate(4 * 1024); p.Allocate(8 * 1024); p.Allocate(16 * 1024); @@ -150,7 +195,7 @@ TEST(MemPoolTest, Keep) { EXPECT_EQ((1 + 4) * 1024, p.total_allocated_bytes()); EXPECT_EQ((4 + 8 + 16) * 1024, p.GetTotalChunkSizes()); - MemPool p2; + ChunkedAllocator p2; p2.AcquireData(&p, true); EXPECT_EQ(4 * 1024, p.total_allocated_bytes()); EXPECT_EQ((8 + 16) * 1024, p.GetTotalChunkSizes()); @@ -162,8 +207,8 @@ TEST(MemPoolTest, Keep) { } // Tests that we can return partial allocations. -TEST(MemPoolTest, ReturnPartial) { - MemPool p; +TEST(ChunkedAllocatorTest, ReturnPartial) { + ChunkedAllocator p; uint8_t* ptr = p.Allocate(1024); EXPECT_EQ(1024, p.total_allocated_bytes()); memset(ptr, 0, 1024); @@ -198,11 +243,11 @@ TEST(MemPoolTest, ReturnPartial) { p.FreeAll(); } -// Test that the MemPool overhead is bounded when we make allocations of +// Test that the ChunkedAllocator overhead is bounded when we make allocations of // INITIAL_CHUNK_SIZE. -TEST(MemPoolTest, MemoryOverhead) { - MemPool p; - const int alloc_size = MemPoolTest::INITIAL_CHUNK_SIZE; +TEST(ChunkedAllocatorTest, MemoryOverhead) { + ChunkedAllocator p; + const int alloc_size = ChunkedAllocatorTest::INITIAL_CHUNK_SIZE; const int num_allocs = 1000; int64_t total_allocated = 0; @@ -214,7 +259,7 @@ TEST(MemPoolTest, MemoryOverhead) { int64_t wasted_memory = p.GetTotalChunkSizes() - total_allocated; // The initial chunk fits evenly into MAX_CHUNK_SIZE, so should have at most // one empty chunk at the end. - EXPECT_LE(wasted_memory, MemPoolTest::MAX_CHUNK_SIZE); + EXPECT_LE(wasted_memory, ChunkedAllocatorTest::MAX_CHUNK_SIZE); // The chunk doubling algorithm should not allocate chunks larger than the total // amount of memory already allocated. EXPECT_LE(wasted_memory, total_allocated); @@ -223,25 +268,118 @@ TEST(MemPoolTest, MemoryOverhead) { p.FreeAll(); } -// Test that the MemPool overhead is bounded when we make alternating large and small -// allocations. -TEST(MemPoolTest, FragmentationOverhead) { - MemPool p; +// Test that the ChunkedAllocator overhead is bounded when we make alternating +// large and small allocations. 
+TEST(ChunkedAllocatorTest, FragmentationOverhead) { + ChunkedAllocator p; const int num_allocs = 100; int64_t total_allocated = 0; for (int i = 0; i < num_allocs; ++i) { - int alloc_size = i % 2 == 0 ? 1 : MemPoolTest::MAX_CHUNK_SIZE; + int alloc_size = i % 2 == 0 ? 1 : ChunkedAllocatorTest::MAX_CHUNK_SIZE; uint8_t* mem = p.Allocate(alloc_size); ASSERT_TRUE(mem != NULL); total_allocated += alloc_size; int64_t wasted_memory = p.GetTotalChunkSizes() - total_allocated; // Fragmentation should not waste more than half of each completed chunk. - EXPECT_LE(wasted_memory, total_allocated + MemPoolTest::MAX_CHUNK_SIZE); + EXPECT_LE(wasted_memory, total_allocated + ChunkedAllocatorTest::MAX_CHUNK_SIZE); } p.FreeAll(); } +TEST(TestBufferedInputStream, Basics) { + int64_t source_size = 256; + int64_t stream_offset = 10; + int64_t stream_size = source_size - stream_offset; + int64_t chunk_size = 50; + std::shared_ptr buf = AllocateBuffer(default_allocator(), source_size); + ASSERT_EQ(source_size, buf->size()); + for (int i = 0; i < source_size; i++) { + buf->mutable_data()[i] = i; + } + + auto wrapper = + std::make_shared(std::make_shared<::arrow::io::BufferReader>(buf)); + + TrackingAllocator allocator; + std::unique_ptr stream(new BufferedInputStream( + &allocator, chunk_size, wrapper.get(), stream_offset, stream_size)); + + const uint8_t* output; + int64_t bytes_read; + + // source is at offset 10 + output = stream->Peek(10, &bytes_read); + ASSERT_EQ(10, bytes_read); + for (int i = 0; i < 10; i++) { + ASSERT_EQ(10 + i, output[i]) << i; + } + output = stream->Read(10, &bytes_read); + ASSERT_EQ(10, bytes_read); + for (int i = 0; i < 10; i++) { + ASSERT_EQ(10 + i, output[i]) << i; + } + output = stream->Read(10, &bytes_read); + ASSERT_EQ(10, bytes_read); + for (int i = 0; i < 10; i++) { + ASSERT_EQ(20 + i, output[i]) << i; + } + stream->Advance(5); + stream->Advance(5); + // source is at offset 40 + // read across buffer boundary. buffer size is 50 + output = stream->Read(20, &bytes_read); + ASSERT_EQ(20, bytes_read); + for (int i = 0; i < 20; i++) { + ASSERT_EQ(40 + i, output[i]) << i; + } + // read more than original chunk_size + output = stream->Read(60, &bytes_read); + ASSERT_EQ(60, bytes_read); + for (int i = 0; i < 60; i++) { + ASSERT_EQ(60 + i, output[i]) << i; + } + + stream->Advance(120); + // source is at offset 240 + // read outside of source boundary. 
source size is 256 + output = stream->Read(30, &bytes_read); + ASSERT_EQ(16, bytes_read); + for (int i = 0; i < 16; i++) { + ASSERT_EQ(240 + i, output[i]) << i; + } +} + +TEST(TestArrowInputFile, Basics) { + std::string data = "this is the data"; + auto data_buffer = reinterpret_cast(data.c_str()); + + auto file = std::make_shared<::arrow::io::BufferReader>(data_buffer, data.size()); + auto source = std::make_shared(file); + + ASSERT_EQ(0, source->Tell()); + ASSERT_NO_THROW(source->Seek(5)); + ASSERT_EQ(5, source->Tell()); + ASSERT_NO_THROW(source->Seek(0)); + + // Seek out of bounds + ASSERT_THROW(source->Seek(100), ParquetException); + + uint8_t buffer[50]; + + ASSERT_NO_THROW(source->Read(4, buffer)); + ASSERT_EQ(0, std::memcmp(buffer, "this", 4)); + ASSERT_EQ(4, source->Tell()); + + std::shared_ptr pq_buffer; + + ASSERT_NO_THROW(pq_buffer = source->Read(7)); + + auto expected_buffer = std::make_shared(data_buffer + 4, 7); + + ASSERT_TRUE(expected_buffer->Equals(*pq_buffer.get())); +} + } // namespace parquet diff --git a/cpp/src/parquet/util/memory.cc b/cpp/src/parquet/util/memory.cc new file mode 100644 index 0000000000000..9ad033628f90c --- /dev/null +++ b/cpp/src/parquet/util/memory.cc @@ -0,0 +1,543 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
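Both tests above lean on the error-translation idiom this patch standardizes: Arrow I/O calls return ::arrow::Status, and the Parquet wrappers (ArrowInputFile, BufferedInputStream, and friends in memory.cc below) surface failures as ParquetException. A hedged sketch of that idiom, spelled out without the PARQUET_THROW_NOT_OK macro it is normally wrapped in; SeekOrThrow is an illustrative name:

#include <cstdint>
#include "arrow/io/interfaces.h"
#include "parquet/exception.h"

void SeekOrThrow(::arrow::io::ReadableFileInterface* file, int64_t position) {
  ::arrow::Status s = file->Seek(position);  // Arrow reports errors as Status...
  if (!s.ok()) {
    // ...which the Parquet layer rethrows as an exception, as the
    // ASSERT_THROW(source->Seek(100), ParquetException) check above expects.
    throw parquet::ParquetException("Arrow error: " + s.ToString());
  }
}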
+ +#include "parquet/util/memory.h" + +#include +#include +#include +#include + +#include "parquet/exception.h" +#include "parquet/types.h" +#include "parquet/util/bit-util.h" +#include "parquet/util/logging.h" + +namespace parquet { + +::arrow::Status TrackingAllocator::Allocate(int64_t size, uint8_t** out) { + if (0 == size) { + *out = nullptr; + return ::arrow::Status::OK(); + } + + uint8_t* p = static_cast(std::malloc(size)); + if (!p) { return ::arrow::Status::OutOfMemory("memory allocation failed"); } + { + std::lock_guard lock(stats_mutex_); + total_memory_ += size; + if (total_memory_ > max_memory_) { max_memory_ = total_memory_; } + } + *out = p; + return ::arrow::Status::OK(); +} + +void TrackingAllocator::Free(uint8_t* p, int64_t size) { + if (nullptr != p && size > 0) { + { + std::lock_guard lock(stats_mutex_); + DCHECK_GE(total_memory_, size) << "Attempting to free too much memory"; + total_memory_ -= size; + } + std::free(p); + } +} + +MemoryAllocator* default_allocator() { + static TrackingAllocator allocator; + return &allocator; +} + +template +Vector::Vector(int64_t size, MemoryAllocator* allocator) + : buffer_(AllocateUniqueBuffer(allocator, size * sizeof(T))), + size_(size), + capacity_(size) { + if (size > 0) { + data_ = reinterpret_cast(buffer_->mutable_data()); + } else { + data_ = nullptr; + } +} + +template +void Vector::Reserve(int64_t new_capacity) { + if (new_capacity > capacity_) { + PARQUET_THROW_NOT_OK(buffer_->Resize(new_capacity * sizeof(T))); + data_ = reinterpret_cast(buffer_->mutable_data()); + capacity_ = new_capacity; + } +} + +template +void Vector::Resize(int64_t new_size) { + Reserve(new_size); + size_ = new_size; +} + +template +void Vector::Assign(int64_t size, const T val) { + Resize(size); + for (int64_t i = 0; i < size_; i++) { + data_[i] = val; + } +} + +template +void Vector::Swap(Vector& v) { + buffer_.swap(v.buffer_); + std::swap(size_, v.size_); + std::swap(capacity_, v.capacity_); + std::swap(data_, v.data_); +} + +template class Vector; +template class Vector; +template class Vector; +template class Vector; +template class Vector; +template class Vector; +template class Vector; +template class Vector; + +const int ChunkedAllocator::INITIAL_CHUNK_SIZE; +const int ChunkedAllocator::MAX_CHUNK_SIZE; + +ChunkedAllocator::ChunkedAllocator(MemoryAllocator* allocator) + : current_chunk_idx_(-1), + next_chunk_size_(INITIAL_CHUNK_SIZE), + total_allocated_bytes_(0), + peak_allocated_bytes_(0), + total_reserved_bytes_(0), + allocator_(allocator) {} + +ChunkedAllocator::ChunkInfo::ChunkInfo(int64_t size, uint8_t* buf) + : data(buf), size(size), allocated_bytes(0) {} + +ChunkedAllocator::~ChunkedAllocator() { + int64_t total_bytes_released = 0; + for (size_t i = 0; i < chunks_.size(); ++i) { + total_bytes_released += chunks_[i].size; + allocator_->Free(chunks_[i].data, chunks_[i].size); + } + + DCHECK(chunks_.empty()) << "Must call FreeAll() or AcquireData() for this pool"; +} + +void ChunkedAllocator::ReturnPartialAllocation(int byte_size) { + DCHECK_GE(byte_size, 0); + DCHECK(current_chunk_idx_ != -1); + ChunkInfo& info = chunks_[current_chunk_idx_]; + DCHECK_GE(info.allocated_bytes, byte_size); + info.allocated_bytes -= byte_size; + total_allocated_bytes_ -= byte_size; +} + +template +uint8_t* ChunkedAllocator::Allocate(int size) { + if (size == 0) return NULL; + + int64_t num_bytes = BitUtil::RoundUp(size, 8); + if (current_chunk_idx_ == -1 || + num_bytes + chunks_[current_chunk_idx_].allocated_bytes > + chunks_[current_chunk_idx_].size) { + // If 
we couldn't allocate a new chunk, return NULL. + if (UNLIKELY(!FindChunk(num_bytes))) return NULL; + } + ChunkInfo& info = chunks_[current_chunk_idx_]; + uint8_t* result = info.data + info.allocated_bytes; + DCHECK_LE(info.allocated_bytes + num_bytes, info.size); + info.allocated_bytes += num_bytes; + total_allocated_bytes_ += num_bytes; + DCHECK_LE(current_chunk_idx_, static_cast(chunks_.size()) - 1); + peak_allocated_bytes_ = std::max(total_allocated_bytes_, peak_allocated_bytes_); + return result; +} + +uint8_t* ChunkedAllocator::Allocate(int size) { + return Allocate(size); +} + +void ChunkedAllocator::Clear() { + current_chunk_idx_ = -1; + for (auto chunk = chunks_.begin(); chunk != chunks_.end(); ++chunk) { + chunk->allocated_bytes = 0; + } + total_allocated_bytes_ = 0; + DCHECK(CheckIntegrity(false)); +} + +void ChunkedAllocator::FreeAll() { + int64_t total_bytes_released = 0; + for (size_t i = 0; i < chunks_.size(); ++i) { + total_bytes_released += chunks_[i].size; + allocator_->Free(chunks_[i].data, chunks_[i].size); + } + chunks_.clear(); + next_chunk_size_ = INITIAL_CHUNK_SIZE; + current_chunk_idx_ = -1; + total_allocated_bytes_ = 0; + total_reserved_bytes_ = 0; +} + +bool ChunkedAllocator::FindChunk(int64_t min_size) { + // Try to allocate from a free chunk. The first free chunk, if any, will be immediately + // after the current chunk. + int first_free_idx = current_chunk_idx_ + 1; + // (cast size() to signed int in order to avoid everything else being cast to + // unsigned long, in particular -1) + while (++current_chunk_idx_ < static_cast(chunks_.size())) { + // we found a free chunk + DCHECK_EQ(chunks_[current_chunk_idx_].allocated_bytes, 0); + + if (chunks_[current_chunk_idx_].size >= min_size) { + // This chunk is big enough. Move it before the other free chunks. + if (current_chunk_idx_ != first_free_idx) { + std::swap(chunks_[current_chunk_idx_], chunks_[first_free_idx]); + current_chunk_idx_ = first_free_idx; + } + break; + } + } + + if (current_chunk_idx_ == static_cast(chunks_.size())) { + // need to allocate new chunk. + int64_t chunk_size; + DCHECK_GE(next_chunk_size_, INITIAL_CHUNK_SIZE); + DCHECK_LE(next_chunk_size_, MAX_CHUNK_SIZE); + + chunk_size = std::max(min_size, next_chunk_size_); + + // Allocate a new chunk. Return early if malloc fails. + uint8_t* buf = nullptr; + PARQUET_THROW_NOT_OK(allocator_->Allocate(chunk_size, &buf)); + if (UNLIKELY(buf == NULL)) { + DCHECK_EQ(current_chunk_idx_, static_cast(chunks_.size())); + current_chunk_idx_ = static_cast(chunks_.size()) - 1; + return false; + } + + // If there are no free chunks put it at the end, otherwise before the first free. + if (first_free_idx == static_cast(chunks_.size())) { + chunks_.push_back(ChunkInfo(chunk_size, buf)); + } else { + current_chunk_idx_ = first_free_idx; + auto insert_chunk = chunks_.begin() + current_chunk_idx_; + chunks_.insert(insert_chunk, ChunkInfo(chunk_size, buf)); + } + total_reserved_bytes_ += chunk_size; + // Don't increment the chunk size until the allocation succeeds: if an attempted + // large allocation fails we don't want to increase the chunk size further. 
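+    // Worked example of the growth policy (illustrative numbers taken from the
+    // constants above): new chunks grow 4 KB -> 8 KB -> 16 KB -> ... and are
+    // capped at MAX_CHUNK_SIZE (1 MB); a single request larger than
+    // next_chunk_size_ simply gets a chunk of exactly min_size.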
+ next_chunk_size_ = + static_cast(std::min(chunk_size * 2, MAX_CHUNK_SIZE)); + } + + DCHECK_LT(current_chunk_idx_, static_cast(chunks_.size())); + DCHECK(CheckIntegrity(true)); + return true; +} + +void ChunkedAllocator::AcquireData(ChunkedAllocator* src, bool keep_current) { + DCHECK(src->CheckIntegrity(false)); + int num_acquired_chunks; + if (keep_current) { + num_acquired_chunks = src->current_chunk_idx_; + } else if (src->GetFreeOffset() == 0) { + // nothing in the last chunk + num_acquired_chunks = src->current_chunk_idx_; + } else { + num_acquired_chunks = src->current_chunk_idx_ + 1; + } + + if (num_acquired_chunks <= 0) { + if (!keep_current) src->FreeAll(); + return; + } + + auto end_chunk = src->chunks_.begin() + num_acquired_chunks; + int64_t total_transfered_bytes = 0; + for (auto i = src->chunks_.begin(); i != end_chunk; ++i) { + total_transfered_bytes += i->size; + } + src->total_reserved_bytes_ -= total_transfered_bytes; + total_reserved_bytes_ += total_transfered_bytes; + + // insert new chunks after current_chunk_idx_ + auto insert_chunk = chunks_.begin() + current_chunk_idx_ + 1; + chunks_.insert(insert_chunk, src->chunks_.begin(), end_chunk); + src->chunks_.erase(src->chunks_.begin(), end_chunk); + current_chunk_idx_ += num_acquired_chunks; + + if (keep_current) { + src->current_chunk_idx_ = 0; + DCHECK(src->chunks_.size() == 1 || src->chunks_[1].allocated_bytes == 0); + total_allocated_bytes_ += src->total_allocated_bytes_ - src->GetFreeOffset(); + src->total_allocated_bytes_ = src->GetFreeOffset(); + } else { + src->current_chunk_idx_ = -1; + total_allocated_bytes_ += src->total_allocated_bytes_; + src->total_allocated_bytes_ = 0; + } + peak_allocated_bytes_ = std::max(total_allocated_bytes_, peak_allocated_bytes_); + + if (!keep_current) src->FreeAll(); + DCHECK(CheckIntegrity(false)); +} + +std::string ChunkedAllocator::DebugString() { + std::stringstream out; + char str[16]; + out << "ChunkedAllocator(#chunks=" << chunks_.size() << " ["; + for (size_t i = 0; i < chunks_.size(); ++i) { + sprintf(str, "0x%lx=", reinterpret_cast(chunks_[i].data)); // NOLINT + out << (i > 0 ? 
" " : "") << str << chunks_[i].size << "/" + << chunks_[i].allocated_bytes; + } + out << "] current_chunk=" << current_chunk_idx_ + << " total_sizes=" << GetTotalChunkSizes() + << " total_alloc=" << total_allocated_bytes_ << ")"; + return out.str(); +} + +int64_t ChunkedAllocator::GetTotalChunkSizes() const { + int64_t result = 0; + for (size_t i = 0; i < chunks_.size(); ++i) { + result += chunks_[i].size; + } + return result; +} + +bool ChunkedAllocator::CheckIntegrity(bool current_chunk_empty) { + // check that current_chunk_idx_ points to the last chunk with allocated data + DCHECK_LT(current_chunk_idx_, static_cast(chunks_.size())); + int64_t total_allocated = 0; + for (int i = 0; i < static_cast(chunks_.size()); ++i) { + DCHECK_GT(chunks_[i].size, 0); + if (i < current_chunk_idx_) { + DCHECK_GT(chunks_[i].allocated_bytes, 0); + } else if (i == current_chunk_idx_) { + if (current_chunk_empty) { + DCHECK_EQ(chunks_[i].allocated_bytes, 0); + } else { + DCHECK_GT(chunks_[i].allocated_bytes, 0); + } + } else { + DCHECK_EQ(chunks_[i].allocated_bytes, 0); + } + total_allocated += chunks_[i].allocated_bytes; + } + DCHECK_EQ(total_allocated, total_allocated_bytes_); + return true; +} + +// ---------------------------------------------------------------------- +// Arrow IO wrappers + +// Close the output stream +void ArrowFileMethods::Close() { + PARQUET_THROW_NOT_OK(file_interface()->Close()); +} + +// Return the current position in the output stream relative to the start +int64_t ArrowFileMethods::Tell() { + int64_t position = 0; + PARQUET_THROW_NOT_OK(file_interface()->Tell(&position)); + return position; +} + +ArrowInputFile::ArrowInputFile( + const std::shared_ptr<::arrow::io::ReadableFileInterface>& file) + : file_(file) {} + +::arrow::io::FileInterface* ArrowInputFile::file_interface() { + return file_.get(); +} + +int64_t ArrowInputFile::Size() const { + int64_t size; + PARQUET_THROW_NOT_OK(file_->GetSize(&size)); + return size; +} + +void ArrowInputFile::Seek(int64_t position) { + PARQUET_THROW_NOT_OK(file_->Seek(position)); +} + +// Returns bytes read +int64_t ArrowInputFile::Read(int64_t nbytes, uint8_t* out) { + int64_t bytes_read = 0; + PARQUET_THROW_NOT_OK(file_->Read(nbytes, &bytes_read, out)); + return bytes_read; +} + +std::shared_ptr ArrowInputFile::Read(int64_t nbytes) { + std::shared_ptr out; + PARQUET_THROW_NOT_OK(file_->Read(nbytes, &out)); + return out; +} + +std::shared_ptr ArrowInputFile::ReadAt(int64_t position, int64_t nbytes) { + std::shared_ptr out; + PARQUET_THROW_NOT_OK(file_->ReadAt(position, nbytes, &out)); + return out; +} + +ArrowOutputStream::ArrowOutputStream( + const std::shared_ptr<::arrow::io::OutputStream> file) + : file_(file) {} + +::arrow::io::FileInterface* ArrowOutputStream::file_interface() { + return file_.get(); +} + +// Copy bytes into the output stream +void ArrowOutputStream::Write(const uint8_t* data, int64_t length) { + PARQUET_THROW_NOT_OK(file_->Write(data, length)); +} + +// ---------------------------------------------------------------------- +// InMemoryInputStream + +InMemoryInputStream::InMemoryInputStream(const std::shared_ptr& buffer) + : buffer_(buffer), offset_(0) { + len_ = buffer_->size(); +} + +InMemoryInputStream::InMemoryInputStream( + RandomAccessSource* source, int64_t start, int64_t num_bytes) + : offset_(0) { + buffer_ = source->ReadAt(start, num_bytes); + if (buffer_->size() < num_bytes) { + throw ParquetException("Unable to read column chunk data"); + } + len_ = buffer_->size(); +} + +const uint8_t* 
InMemoryInputStream::Peek(int64_t num_to_peek, int64_t* num_bytes) { + *num_bytes = std::min(static_cast(num_to_peek), len_ - offset_); + return buffer_->data() + offset_; +} + +const uint8_t* InMemoryInputStream::Read(int64_t num_to_read, int64_t* num_bytes) { + const uint8_t* result = Peek(num_to_read, num_bytes); + offset_ += *num_bytes; + return result; +} + +void InMemoryInputStream::Advance(int64_t num_bytes) { + offset_ += num_bytes; +} + +// ---------------------------------------------------------------------- +// In-memory output stream + +InMemoryOutputStream::InMemoryOutputStream( + MemoryAllocator* allocator, int64_t initial_capacity) + : size_(0), capacity_(initial_capacity) { + if (initial_capacity == 0) { initial_capacity = kInMemoryDefaultCapacity; } + buffer_ = AllocateBuffer(allocator, initial_capacity); +} + +InMemoryOutputStream::~InMemoryOutputStream() {} + +uint8_t* InMemoryOutputStream::Head() { + return buffer_->mutable_data() + size_; +} + +void InMemoryOutputStream::Write(const uint8_t* data, int64_t length) { + if (size_ + length > capacity_) { + int64_t new_capacity = capacity_ * 2; + while (new_capacity < size_ + length) { + new_capacity *= 2; + } + PARQUET_THROW_NOT_OK(buffer_->Resize(new_capacity)); + capacity_ = new_capacity; + } + memcpy(Head(), data, length); + size_ += length; +} + +int64_t InMemoryOutputStream::Tell() { + return size_; +} + +std::shared_ptr InMemoryOutputStream::GetBuffer() { + PARQUET_THROW_NOT_OK(buffer_->Resize(size_)); + std::shared_ptr result = buffer_; + buffer_ = nullptr; + return result; +} + +// ---------------------------------------------------------------------- +// BufferedInputStream + +BufferedInputStream::BufferedInputStream(MemoryAllocator* pool, int64_t buffer_size, + RandomAccessSource* source, int64_t start, int64_t num_bytes) + : source_(source), stream_offset_(start), stream_end_(start + num_bytes) { + buffer_ = AllocateBuffer(pool, buffer_size); + buffer_size_ = buffer_->size(); + // Required to force a lazy read + buffer_offset_ = buffer_size_; +} + +const uint8_t* BufferedInputStream::Peek(int64_t num_to_peek, int64_t* num_bytes) { + *num_bytes = std::min(num_to_peek, stream_end_ - stream_offset_); + // increase the buffer size if needed + if (*num_bytes > buffer_size_) { + PARQUET_THROW_NOT_OK(buffer_->Resize(*num_bytes)); + buffer_size_ = buffer_->size(); + DCHECK(buffer_size_ >= *num_bytes); + } + // Read more data when buffer has insufficient left or when resized + if (*num_bytes > (buffer_size_ - buffer_offset_)) { + source_->Seek(stream_offset_); + buffer_size_ = std::min(buffer_size_, stream_end_ - stream_offset_); + int64_t bytes_read = source_->Read(buffer_size_, buffer_->mutable_data()); + if (bytes_read < *num_bytes) { + throw ParquetException("Failed reading column data from source"); + } + buffer_offset_ = 0; + } + return buffer_->data() + buffer_offset_; +} + +const uint8_t* BufferedInputStream::Read(int64_t num_to_read, int64_t* num_bytes) { + const uint8_t* result = Peek(num_to_read, num_bytes); + stream_offset_ += *num_bytes; + buffer_offset_ += *num_bytes; + return result; +} + +void BufferedInputStream::Advance(int64_t num_bytes) { + stream_offset_ += num_bytes; + buffer_offset_ += num_bytes; +} + +std::shared_ptr AllocateBuffer(MemoryAllocator* allocator, int64_t size) { + auto result = std::make_shared(allocator); + if (size > 0) { PARQUET_THROW_NOT_OK(result->Resize(size)); } + return result; +} + +std::unique_ptr AllocateUniqueBuffer( + MemoryAllocator* allocator, int64_t size) { + 
std::unique_ptr result(new PoolBuffer(allocator)); + if (size > 0) { PARQUET_THROW_NOT_OK(result->Resize(size)); } + return result; +} + +} // namespace parquet diff --git a/cpp/src/parquet/util/memory.h b/cpp/src/parquet/util/memory.h new file mode 100644 index 0000000000000..1ffca3516d525 --- /dev/null +++ b/cpp/src/parquet/util/memory.h @@ -0,0 +1,440 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PARQUET_UTIL_MEMORY_H +#define PARQUET_UTIL_MEMORY_H + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/io/interfaces.h" +#include "arrow/io/memory.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" + +#include "parquet/exception.h" +#include "parquet/util/macros.h" +#include "parquet/util/visibility.h" + +#define PARQUET_CATCH_NOT_OK(s) \ + try { \ + (s); \ + } catch (const ::parquet::ParquetException& e) { \ + return ::arrow::Status::IOError(e.what()); \ + } + +#define PARQUET_IGNORE_NOT_OK(s) \ + try { \ + (s); \ + } catch (const ::parquet::ParquetException& e) {} + +#define PARQUET_THROW_NOT_OK(s) \ + do { \ + ::arrow::Status _s = (s); \ + if (!_s.ok()) { \ + std::stringstream ss; \ + ss << "Arrow error: " << _s.ToString(); \ + ::parquet::ParquetException::Throw(ss.str()); \ + } \ + } while (0); + +namespace parquet { + +static constexpr int64_t kInMemoryDefaultCapacity = 1024; + +using Buffer = ::arrow::Buffer; +using MutableBuffer = ::arrow::MutableBuffer; +using ResizableBuffer = ::arrow::ResizableBuffer; +using PoolBuffer = ::arrow::PoolBuffer; +using MemoryAllocator = ::arrow::MemoryPool; + +PARQUET_EXPORT MemoryAllocator* default_allocator(); + +class PARQUET_EXPORT TrackingAllocator : public MemoryAllocator { + public: + TrackingAllocator() : total_memory_(0), max_memory_(0) {} + + ::arrow::Status Allocate(int64_t size, uint8_t** out) override; + void Free(uint8_t* p, int64_t size) override; + + int64_t bytes_allocated() const override { return total_memory_; } + + int64_t max_memory() { return max_memory_; } + + private: + std::mutex stats_mutex_; + int64_t total_memory_; + int64_t max_memory_; +}; + +template +class Vector { + public: + explicit Vector(int64_t size, MemoryAllocator* allocator); + void Resize(int64_t new_size); + void Reserve(int64_t new_capacity); + void Assign(int64_t size, const T val); + void Swap(Vector& v); + inline T& operator[](int64_t i) const { return data_[i]; } + + private: + std::unique_ptr buffer_; + int64_t size_; + int64_t capacity_; + T* data_; + + DISALLOW_COPY_AND_ASSIGN(Vector); +}; + +/// A ChunkedAllocator maintains a list of memory chunks from which it +/// allocates memory in response to Allocate() calls; Chunks stay around for +/// the lifetime of the allocator or until they are passed on to another 
+/// allocator. +// +/// An Allocate() call will attempt to allocate memory from the chunk that was most +/// recently added; if that chunk doesn't have enough memory to +/// satisfy the allocation request, the free chunks are searched for one that is +/// big enough otherwise a new chunk is added to the list. +/// The current_chunk_idx_ always points to the last chunk with allocated memory. +/// In order to keep allocation overhead low, chunk sizes double with each new one +/// added, until they hit a maximum size. +// +/// Example: +/// ChunkedAllocator* p = new ChunkedAllocator(); +/// for (int i = 0; i < 1024; ++i) { +/// returns 8-byte aligned memory (effectively 24 bytes): +/// .. = p->Allocate(17); +/// } +/// at this point, 17K have been handed out in response to Allocate() calls and +/// 28K of chunks have been allocated (chunk sizes: 4K, 8K, 16K) +/// We track total and peak allocated bytes. At this point they would be the same: +/// 28k bytes. A call to Clear will return the allocated memory so +/// total_allocate_bytes_ +/// becomes 0 while peak_allocate_bytes_ remains at 28k. +/// p->Clear(); +/// the entire 1st chunk is returned: +/// .. = p->Allocate(4 * 1024); +/// 4K of the 2nd chunk are returned: +/// .. = p->Allocate(4 * 1024); +/// a new 20K chunk is created +/// .. = p->Allocate(20 * 1024); +// +/// ChunkedAllocator* p2 = new ChunkedAllocator(); +/// the new ChunkedAllocator receives all chunks containing data from p +/// p2->AcquireData(p, false); +/// At this point p.total_allocated_bytes_ would be 0 while p.peak_allocated_bytes_ +/// remains unchanged. +/// The one remaining (empty) chunk is released: +/// delete p; + +class ChunkedAllocator { + public: + explicit ChunkedAllocator(MemoryAllocator* allocator = default_allocator()); + + /// Frees all chunks of memory and subtracts the total allocated bytes + /// from the registered limits. + ~ChunkedAllocator(); + + /// Allocates 8-byte aligned section of memory of 'size' bytes at the end + /// of the the current chunk. Creates a new chunk if there aren't any chunks + /// with enough capacity. + uint8_t* Allocate(int size); + + /// Returns 'byte_size' to the current chunk back to the mem pool. This can + /// only be used to return either all or part of the previous allocation returned + /// by Allocate(). + void ReturnPartialAllocation(int byte_size); + + /// Makes all allocated chunks available for re-use, but doesn't delete any chunks. + void Clear(); + + /// Deletes all allocated chunks. FreeAll() or AcquireData() must be called for + /// each mem pool + void FreeAll(); + + /// Absorb all chunks that hold data from src. If keep_current is true, let src hold on + /// to its last allocated chunk that contains data. + /// All offsets handed out by calls to GetCurrentOffset() for 'src' become invalid. + void AcquireData(ChunkedAllocator* src, bool keep_current); + + std::string DebugString(); + + int64_t total_allocated_bytes() const { return total_allocated_bytes_; } + int64_t peak_allocated_bytes() const { return peak_allocated_bytes_; } + int64_t total_reserved_bytes() const { return total_reserved_bytes_; } + + /// Return sum of chunk_sizes_. + int64_t GetTotalChunkSizes() const; + + private: + friend class ChunkedAllocatorTest; + static const int INITIAL_CHUNK_SIZE = 4 * 1024; + + /// The maximum size of chunk that should be allocated. Allocations larger than this + /// size will get their own individual chunk. 
+ static const int MAX_CHUNK_SIZE = 1024 * 1024; + + struct ChunkInfo { + uint8_t* data; // Owned by the ChunkInfo. + int64_t size; // in bytes + + /// bytes allocated via Allocate() in this chunk + int64_t allocated_bytes; + + explicit ChunkInfo(int64_t size, uint8_t* buf); + + ChunkInfo() : data(NULL), size(0), allocated_bytes(0) {} + }; + + /// chunk from which we served the last Allocate() call; + /// always points to the last chunk that contains allocated data; + /// chunks 0..current_chunk_idx_ are guaranteed to contain data + /// (chunks_[i].allocated_bytes > 0 for i: 0..current_chunk_idx_); + /// -1 if no chunks present + int current_chunk_idx_; + + /// The size of the next chunk to allocate. + int64_t next_chunk_size_; + + /// sum of allocated_bytes_ + int64_t total_allocated_bytes_; + + /// Maximum number of bytes allocated from this pool at one time. + int64_t peak_allocated_bytes_; + + /// sum of all bytes allocated in chunks_ + int64_t total_reserved_bytes_; + + std::vector chunks_; + + MemoryAllocator* allocator_; + + /// Find or allocated a chunk with at least min_size spare capacity and update + /// current_chunk_idx_. Also updates chunks_, chunk_sizes_ and allocated_bytes_ + /// if a new chunk needs to be created. + bool FindChunk(int64_t min_size); + + /// Check integrity of the supporting data structures; always returns true but DCHECKs + /// all invariants. + /// If 'current_chunk_empty' is false, checks that the current chunk contains data. + bool CheckIntegrity(bool current_chunk_empty); + + /// Return offset to unoccpied space in current chunk. + int GetFreeOffset() const { + if (current_chunk_idx_ == -1) return 0; + return chunks_[current_chunk_idx_].allocated_bytes; + } + + template + uint8_t* Allocate(int size); +}; + +// File input and output interfaces that translate arrow::Status to exceptions + +class PARQUET_EXPORT FileInterface { + public: + // Close the file + virtual void Close() = 0; + + // Return the current position in the file relative to the start + virtual int64_t Tell() = 0; +}; + +class PARQUET_EXPORT RandomAccessSource : virtual public FileInterface { + public: + virtual ~RandomAccessSource() {} + + virtual int64_t Size() const = 0; + + virtual void Seek(int64_t position) = 0; + + // Returns bytes read + virtual int64_t Read(int64_t nbytes, uint8_t* out) = 0; + + virtual std::shared_ptr Read(int64_t nbytes) = 0; + + virtual std::shared_ptr ReadAt(int64_t position, int64_t nbytes) = 0; +}; + +class PARQUET_EXPORT OutputStream : virtual public FileInterface { + public: + virtual ~OutputStream() {} + + // Copy bytes into the output stream + virtual void Write(const uint8_t* data, int64_t length) = 0; +}; + +class PARQUET_EXPORT ArrowFileMethods : virtual public FileInterface { + public: + void Close() override; + int64_t Tell() override; + + protected: + virtual ::arrow::io::FileInterface* file_interface() = 0; +}; + +class PARQUET_EXPORT ArrowInputFile : public ArrowFileMethods, public RandomAccessSource { + public: + explicit ArrowInputFile( + const std::shared_ptr<::arrow::io::ReadableFileInterface>& file); + + int64_t Size() const override; + + void Seek(int64_t position) override; + + // Returns bytes read + int64_t Read(int64_t nbytes, uint8_t* out) override; + + std::shared_ptr Read(int64_t nbytes) override; + + std::shared_ptr ReadAt(int64_t position, int64_t nbytes) override; + + std::shared_ptr<::arrow::io::ReadableFileInterface> file() const { return file_; } + + // Diamond inheritance + using ArrowFileMethods::Close; + using 
ArrowFileMethods::Tell; + + private: + ::arrow::io::FileInterface* file_interface() override; + std::shared_ptr<::arrow::io::ReadableFileInterface> file_; +}; + +class PARQUET_EXPORT ArrowOutputStream : public ArrowFileMethods, public OutputStream { + public: + explicit ArrowOutputStream(const std::shared_ptr<::arrow::io::OutputStream> file); + + // Copy bytes into the output stream + void Write(const uint8_t* data, int64_t length) override; + + std::shared_ptr<::arrow::io::OutputStream> file() { return file_; } + + // Diamond inheritance + using ArrowFileMethods::Close; + using ArrowFileMethods::Tell; + + private: + ::arrow::io::FileInterface* file_interface() override; + std::shared_ptr<::arrow::io::OutputStream> file_; +}; + +class PARQUET_EXPORT InMemoryOutputStream : public OutputStream { + public: + explicit InMemoryOutputStream(MemoryAllocator* allocator = default_allocator(), + int64_t initial_capacity = kInMemoryDefaultCapacity); + + virtual ~InMemoryOutputStream(); + + // Close is currently a no-op with the in-memory stream + virtual void Close() {} + + virtual int64_t Tell(); + + virtual void Write(const uint8_t* data, int64_t length); + + // Return complete stream as Buffer + std::shared_ptr GetBuffer(); + + private: + // Mutable pointer to the current write position in the stream + uint8_t* Head(); + + std::shared_ptr buffer_; + int64_t size_; + int64_t capacity_; + + DISALLOW_COPY_AND_ASSIGN(InMemoryOutputStream); +}; + +// ---------------------------------------------------------------------- +// Streaming input interfaces + +// Interface for the column reader to get the bytes. The interface is a stream +// interface, meaning the bytes in order and once a byte is read, it does not +// need to be read again. +class InputStream { + public: + // Returns the next 'num_to_peek' without advancing the current position. + // *num_bytes will contain the number of bytes returned which can only be + // less than num_to_peek at end of stream cases. + // Since the position is not advanced, calls to this function are idempotent. + // The buffer returned to the caller is still owned by the input stream and must + // stay valid until the next call to Peek() or Read(). + virtual const uint8_t* Peek(int64_t num_to_peek, int64_t* num_bytes) = 0; + + // Identical to Peek(), except the current position in the stream is advanced by + // *num_bytes. + virtual const uint8_t* Read(int64_t num_to_read, int64_t* num_bytes) = 0; + + // Advance the stream without reading + virtual void Advance(int64_t num_bytes) = 0; + + virtual ~InputStream() {} + + protected: + InputStream() {} +}; + +// Implementation of an InputStream when all the bytes are in memory. +class InMemoryInputStream : public InputStream { + public: + InMemoryInputStream(RandomAccessSource* source, int64_t start, int64_t end); + explicit InMemoryInputStream(const std::shared_ptr& buffer); + virtual const uint8_t* Peek(int64_t num_to_peek, int64_t* num_bytes); + virtual const uint8_t* Read(int64_t num_to_read, int64_t* num_bytes); + + virtual void Advance(int64_t num_bytes); + + private: + std::shared_ptr buffer_; + int64_t len_; + int64_t offset_; +}; + +// Implementation of an InputStream when only some of the bytes are in memory. 
+// Implementation of an InputStream when only some of the bytes are in memory.
+class BufferedInputStream : public InputStream {
+ public:
+  BufferedInputStream(MemoryAllocator* pool, int64_t buffer_size,
+      RandomAccessSource* source, int64_t start, int64_t end);
+  virtual const uint8_t* Peek(int64_t num_to_peek, int64_t* num_bytes);
+  virtual const uint8_t* Read(int64_t num_to_read, int64_t* num_bytes);
+
+  virtual void Advance(int64_t num_bytes);
+
+ private:
+  std::shared_ptr buffer_;
+  RandomAccessSource* source_;
+  int64_t stream_offset_;
+  int64_t stream_end_;
+  int64_t buffer_offset_;
+  int64_t buffer_size_;
+};
+
+std::shared_ptr AllocateBuffer(MemoryAllocator* allocator, int64_t size = 0);
+
+std::unique_ptr AllocateUniqueBuffer(
+    MemoryAllocator* allocator, int64_t size = 0);
+
+}  // namespace parquet
+
+#endif  // PARQUET_UTIL_MEMORY_H
diff --git a/cpp/src/parquet/util/output.cc b/cpp/src/parquet/util/output.cc
deleted file mode 100644
index 422000f190155..0000000000000
--- a/cpp/src/parquet/util/output.cc
+++ /dev/null
@@ -1,118 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
- -#include "parquet/util/output.h" - -#include -#include -#include - -#include "parquet/exception.h" -#include "parquet/util/buffer.h" -#include "parquet/util/logging.h" - -namespace parquet { - -// ---------------------------------------------------------------------- -// OutputStream - -OutputStream::~OutputStream() {} - -// ---------------------------------------------------------------------- -// In-memory output stream - -InMemoryOutputStream::InMemoryOutputStream( - int64_t initial_capacity, MemoryAllocator* allocator) - : size_(0), capacity_(initial_capacity) { - if (initial_capacity == 0) { initial_capacity = IN_MEMORY_DEFAULT_CAPACITY; } - buffer_.reset(new OwnedMutableBuffer(initial_capacity, allocator)); -} - -InMemoryOutputStream::~InMemoryOutputStream() {} - -uint8_t* InMemoryOutputStream::Head() { - return buffer_->mutable_data() + size_; -} - -void InMemoryOutputStream::Write(const uint8_t* data, int64_t length) { - if (size_ + length > capacity_) { - int64_t new_capacity = capacity_ * 2; - while (new_capacity < size_ + length) { - new_capacity *= 2; - } - buffer_->Resize(new_capacity); - capacity_ = new_capacity; - } - memcpy(Head(), data, length); - size_ += length; -} - -int64_t InMemoryOutputStream::Tell() { - return size_; -} - -std::shared_ptr InMemoryOutputStream::GetBuffer() { - buffer_->Resize(size_); - std::shared_ptr result = buffer_; - buffer_ = nullptr; - return result; -} - -// ---------------------------------------------------------------------- -// local file output stream - -LocalFileOutputStream::LocalFileOutputStream(const std::string& path) : is_open_(true) { - file_ = fopen(path.c_str(), "wb"); - if (file_ == nullptr || ferror(file_)) { - std::stringstream ss; - ss << "Unable to open file: " << path; - throw ParquetException(ss.str()); - } -} - -LocalFileOutputStream::~LocalFileOutputStream() { - CloseFile(); -} - -void LocalFileOutputStream::Close() { - CloseFile(); -} - -int64_t LocalFileOutputStream::Tell() { - DCHECK(is_open_); - int64_t position = ftell(file_); - if (position < 0) { throw ParquetException("ftell failed, did the file disappear?"); } - return position; -} - -void LocalFileOutputStream::Write(const uint8_t* data, int64_t length) { - DCHECK(is_open_); - int64_t bytes_written = fwrite(data, sizeof(uint8_t), length, file_); - if (bytes_written != length) { - int error_code = ferror(file_); - throw ParquetException("fwrite failed, error code: " + std::to_string(error_code)); - } -} - -void LocalFileOutputStream::CloseFile() { - if (is_open_) { - fclose(file_); - is_open_ = false; - } -} - -} // namespace parquet diff --git a/cpp/src/parquet/util/output.h b/cpp/src/parquet/util/output.h deleted file mode 100644 index 9b2c2d3debece..0000000000000 --- a/cpp/src/parquet/util/output.h +++ /dev/null @@ -1,107 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
diff --git a/cpp/src/parquet/util/output.h b/cpp/src/parquet/util/output.h
deleted file mode 100644
index 9b2c2d3debece..0000000000000
--- a/cpp/src/parquet/util/output.h
+++ /dev/null
@@ -1,107 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef PARQUET_UTIL_OUTPUT_H
-#define PARQUET_UTIL_OUTPUT_H
-
-#include
-#include
-#include
-
-#include "parquet/util/macros.h"
-#include "parquet/util/mem-allocator.h"
-#include "parquet/util/visibility.h"
-
-namespace parquet {
-
-class Buffer;
-class ResizableBuffer;
-
-// ----------------------------------------------------------------------
-// Output stream classes
-
-// Abstract output stream
-class PARQUET_EXPORT OutputStream {
- public:
-  virtual ~OutputStream();
-
-  // Close the output stream
-  virtual void Close() = 0;
-
-  // Return the current position in the output stream relative to the start
-  virtual int64_t Tell() = 0;
-
-  // Copy bytes into the output stream
-  virtual void Write(const uint8_t* data, int64_t length) = 0;
-};
-
-static constexpr int64_t IN_MEMORY_DEFAULT_CAPACITY = 1024;
-
-// An output stream that writes to an in-memory buffer
-class PARQUET_EXPORT InMemoryOutputStream : public OutputStream {
- public:
-  explicit InMemoryOutputStream(int64_t initial_capacity = IN_MEMORY_DEFAULT_CAPACITY,
-      MemoryAllocator* allocator = default_allocator());
-
-  virtual ~InMemoryOutputStream();
-
-  // Close is currently a no-op with the in-memory stream
-  virtual void Close() {}
-
-  virtual int64_t Tell();
-
-  virtual void Write(const uint8_t* data, int64_t length);
-
-  // Return complete stream as Buffer
-  std::shared_ptr<Buffer> GetBuffer();
-
- private:
-  // Mutable pointer to the current write position in the stream
-  uint8_t* Head();
-
-  std::shared_ptr<ResizableBuffer> buffer_;
-  int64_t size_;
-  int64_t capacity_;
-
-  DISALLOW_COPY_AND_ASSIGN(InMemoryOutputStream);
-};
-
-class PARQUET_EXPORT LocalFileOutputStream : public OutputStream {
- public:
-  explicit LocalFileOutputStream(const std::string& path);
-
-  virtual ~LocalFileOutputStream();
-
-  // Close the output stream
-  void Close() override;
-
-  // Return the current position in the output stream relative to the start
-  int64_t Tell() override;
-
-  // Copy bytes into the output stream
-  void Write(const uint8_t* data, int64_t length) override;
-
- private:
-  void CloseFile();
-
-  FILE* file_;
-  bool is_open_;
-};
-
-}  // namespace parquet
-
-#endif  // PARQUET_UTIL_OUTPUT_H
diff --git a/cpp/src/parquet/util/rle-encoding.h b/cpp/src/parquet/util/rle-encoding.h
index 7aba813224a76..d4be1fc7f5b9b 100644
--- a/cpp/src/parquet/util/rle-encoding.h
+++ b/cpp/src/parquet/util/rle-encoding.h
@@ -25,8 +25,8 @@
 #include "parquet/util/bit-stream-utils.inline.h"
 #include "parquet/util/bit-util.h"
-#include "parquet/util/buffer.h"
 #include "parquet/util/compiler-util.h"
+#include "parquet/util/memory.h"
 
 namespace parquet {