diff --git a/parquet/src/arrow/arrow_reader.rs b/parquet/src/arrow/arrow_reader.rs
index fada28daaae2..259a3c08e586 100644
--- a/parquet/src/arrow/arrow_reader.rs
+++ b/parquet/src/arrow/arrow_reader.rs
@@ -578,6 +578,11 @@ mod tests {
         record_batch_size: usize,
         /// Percentage of nulls in column or None if required
         null_percent: Option<usize>,
+        /// Set write batch size
+        ///
+        /// This is the number of rows that are written at once to a page and
+        /// therefore acts as a bound on the page granularity of a row group
+        write_batch_size: usize,
         /// Maximum size of page in bytes
         max_data_page_size: usize,
         /// Maximum size of dictionary page in bytes
@@ -595,6 +600,7 @@ mod tests {
             num_rows: 100,
             record_batch_size: 15,
             null_percent: None,
+            write_batch_size: 64,
             max_data_page_size: 1024 * 1024,
             max_dict_page_size: 1024 * 1024,
             writer_version: WriterVersion::PARQUET_1_0,
@@ -637,6 +643,7 @@ mod tests {
         fn writer_props(&self) -> WriterProperties {
             let builder = WriterProperties::builder()
                 .set_data_pagesize_limit(self.max_data_page_size)
+                .set_write_batch_size(self.write_batch_size)
                 .set_writer_version(self.writer_version);
 
             let builder = match self.encoding {
diff --git a/parquet/src/column/writer.rs b/parquet/src/column/writer.rs
index 87b25b4d3c70..1db0ea0ccb1b 100644
--- a/parquet/src/column/writer.rs
+++ b/parquet/src/column/writer.rs
@@ -567,6 +567,14 @@ impl<T: DataType> ColumnWriterImpl<T> {
     /// Returns true if there is enough data for a data page, false otherwise.
     #[inline]
     fn should_add_data_page(&self) -> bool {
+        // This is necessary in the event of a much larger dictionary size than page size
+        //
+        // In such a scenario the dictionary encoder may return an estimated encoded
+        // size in excess of the page size limit, even when there are no buffered values
+        if self.num_buffered_values == 0 {
+            return false;
+        }
+
         match self.dict_encoder {
             Some(ref encoder) => {
                 encoder.estimated_data_encoded_size() >= self.props.data_pagesize_limit()