GH-36765: [Python][Dataset] Change default of pre_buffer to True for reading Parquet files #37854

Merged
7 changes: 4 additions & 3 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -2354,10 +2354,11 @@ void TestGetRecordBatchReader(

 TEST(TestArrowReadWrite, GetRecordBatchReader) { TestGetRecordBatchReader(); }

-// Same as the test above, but using coalesced reads.
-TEST(TestArrowReadWrite, CoalescedReads) {
+// Same as the test above, but using non-coalesced reads.
+TEST(TestArrowReadWrite, NoneCoalescedReads) {
   ArrowReaderProperties arrow_properties = default_arrow_reader_properties();
-  arrow_properties.set_pre_buffer(true);
+  arrow_properties.set_pre_buffer(false);
+  arrow_properties.set_cache_options(::arrow::io::CacheOptions::Defaults())
   TestGetRecordBatchReader(arrow_properties);
 }

A project member commented on the added set_cache_options line: This might cause a compile failure; I've submitted a fix: #38069.

4 changes: 2 additions & 2 deletions cpp/src/parquet/properties.h
@@ -831,8 +831,8 @@ class PARQUET_EXPORT ArrowReaderProperties {
       : use_threads_(use_threads),
         read_dict_indices_(),
         batch_size_(kArrowDefaultBatchSize),
-        pre_buffer_(false),
-        cache_options_(::arrow::io::CacheOptions::Defaults()),
+        pre_buffer_(true),
+        cache_options_(::arrow::io::CacheOptions::LazyDefaults()),
         coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO) {}

   /// \brief Set whether to use the IO thread pool to parse columns in parallel.
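These C++ defaults flow straight through to the Python bindings. As a minimal sketch of what the new default means for a plain read (the file name example.parquet is hypothetical):

import pyarrow.parquet as pq

# With this change, pre-buffering is on by default: Arrow coalesces
# column-chunk reads and issues them in parallel on a background
# I/O thread pool, using lazy cache options.
table = pq.read_table("example.parquet")

# Passing pre_buffer=False restores the previous behavior of one
# read per column chunk, trading speed for lower peak memory use.
table_low_mem = pq.read_table("example.parquet", pre_buffer=False)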
9 changes: 6 additions & 3 deletions python/pyarrow/_dataset_parquet.pyx
@@ -666,10 +666,13 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
         Disabled by default.
     buffer_size : int, default 8192
         Size of buffered stream, if enabled. Default is 8KB.
-    pre_buffer : bool, default False
+    pre_buffer : bool, default True
         If enabled, pre-buffer the raw Parquet data instead of issuing one
         read per column chunk. This can improve performance on high-latency
-        filesystems.
+        filesystems (e.g. S3, GCS) by coalescing and issuing file reads in
+        parallel using a background I/O thread pool.
+        Set to False if you want to prioritize minimal memory usage
+        over maximum speed.
     thrift_string_size_limit : int, default None
         If not None, override the maximum total string size allocated
         when decoding Thrift structures. The default limit should be
@@ -688,7 +691,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):

     def __init__(self, *, bint use_buffered_stream=False,
                  buffer_size=8192,
-                 bint pre_buffer=False,
+                 bint pre_buffer=True,
                  thrift_string_size_limit=None,
                  thrift_container_size_limit=None):
         self.init(shared_ptr[CFragmentScanOptions](
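As a usage sketch of the option documented above (the data/ path is hypothetical, assuming a directory of Parquet files), scan options can be passed per scan:

import pyarrow.dataset as ds

# pre_buffer now defaults to True; opt out explicitly to prioritize
# minimal memory usage over maximum speed (e.g. on fast local disks).
scan_opts = ds.ParquetFragmentScanOptions(pre_buffer=False)

dataset = ds.dataset("data/", format="parquet")
table = dataset.to_table(fragment_scan_options=scan_opts)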
5 changes: 3 additions & 2 deletions python/pyarrow/parquet/core.py
@@ -1742,11 +1742,12 @@ class ParquetDataset:
         different partitioning schemes, etc.
     pre_buffer : bool, default True
         Coalesce and issue file reads in parallel to improve performance on
-        high-latency filesystems (e.g. S3). If True, Arrow will use a
+        high-latency filesystems (e.g. S3, GCS). If True, Arrow will use a
         background I/O thread pool. This option is only supported for
         use_legacy_dataset=False. If using a filesystem layer that itself
         performs readahead (e.g. fsspec's S3FS), disable readahead for best
-        results.
+        results. Set to False if you want to prioritize minimal memory usage
+        over maximum speed.
     coerce_int96_timestamp_unit : str, default None
         Cast timestamps that are stored in INT96 format to a particular resolution
         (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96
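A short sketch of the ParquetDataset behavior this docstring describes; the dataset path is hypothetical, and no argument is needed to get the new coalesced, parallel reads:

import pyarrow.parquet as pq

# pre_buffer=True (now the default) coalesces reads and issues them in
# parallel, which matters most on high-latency filesystems like S3/GCS.
dataset = pq.ParquetDataset("my-bucket/my-dataset/")

# Disable it when minimal memory usage matters more than speed.
dataset_low_mem = pq.ParquetDataset("my-bucket/my-dataset/",
                                    pre_buffer=False)

table = dataset_low_mem.read()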
10 changes: 5 additions & 5 deletions python/pyarrow/tests/test_dataset.py
@@ -784,28 +784,28 @@ def test_parquet_scan_options():
     opts2 = ds.ParquetFragmentScanOptions(buffer_size=4096)
     opts3 = ds.ParquetFragmentScanOptions(
         buffer_size=2**13, use_buffered_stream=True)
-    opts4 = ds.ParquetFragmentScanOptions(buffer_size=2**13, pre_buffer=True)
+    opts4 = ds.ParquetFragmentScanOptions(buffer_size=2**13, pre_buffer=False)
     opts5 = ds.ParquetFragmentScanOptions(
         thrift_string_size_limit=123456,
         thrift_container_size_limit=987654,)

     assert opts1.use_buffered_stream is False
     assert opts1.buffer_size == 2**13
-    assert opts1.pre_buffer is False
+    assert opts1.pre_buffer is True
     assert opts1.thrift_string_size_limit == 100_000_000  # default in C++
     assert opts1.thrift_container_size_limit == 1_000_000  # default in C++

     assert opts2.use_buffered_stream is False
     assert opts2.buffer_size == 2**12
-    assert opts2.pre_buffer is False
+    assert opts2.pre_buffer is True

     assert opts3.use_buffered_stream is True
     assert opts3.buffer_size == 2**13
-    assert opts3.pre_buffer is False
+    assert opts3.pre_buffer is True

     assert opts4.use_buffered_stream is False
     assert opts4.buffer_size == 2**13
-    assert opts4.pre_buffer is True
+    assert opts4.pre_buffer is False

     assert opts5.thrift_string_size_limit == 123456
     assert opts5.thrift_container_size_limit == 987654