Skip to content

Commit ce58932

Browse files
authored
parquet: Add ArrowWriterOptions to skip embedding the arrow metadata (#5299)
* feat(parquet): Add ArrowWriterOptions * test(parquet): test skip_arrow_metadata * feat(parquet): Add try_new_with_options to async writer * refactor: move WriterProperties to ArrowWriterOptions
1 parent b03613e commit ce58932

File tree

2 files changed

+106
-4
lines changed

2 files changed

+106
-4
lines changed

parquet/src/arrow/arrow_writer/mod.rs

+84-3
Original file line numberDiff line numberDiff line change
@@ -119,11 +119,27 @@ impl<W: Write + Send> ArrowWriter<W> {
119119
writer: W,
120120
arrow_schema: SchemaRef,
121121
props: Option<WriterProperties>,
122+
) -> Result<Self> {
123+
let options = ArrowWriterOptions::new().with_properties(props.unwrap_or_default());
124+
Self::try_new_with_options(writer, arrow_schema, options)
125+
}
126+
127+
/// Try to create a new Arrow writer with [`ArrowWriterOptions`].
128+
///
129+
/// The writer will fail if:
130+
/// * a `SerializedFileWriter` cannot be created from the ParquetWriter
131+
/// * the Arrow schema contains unsupported datatypes such as Unions
132+
pub fn try_new_with_options(
133+
writer: W,
134+
arrow_schema: SchemaRef,
135+
options: ArrowWriterOptions,
122136
) -> Result<Self> {
123137
let schema = arrow_to_parquet_schema(&arrow_schema)?;
124-
// add serialized arrow schema
125-
let mut props = props.unwrap_or_default();
126-
add_encoded_arrow_schema_to_metadata(&arrow_schema, &mut props);
138+
let mut props = options.properties;
139+
if !options.skip_arrow_metadata {
140+
// add serialized arrow schema
141+
add_encoded_arrow_schema_to_metadata(&arrow_schema, &mut props);
142+
}
127143

128144
let max_row_group_size = props.max_row_group_size();
129145

@@ -245,6 +261,38 @@ impl<W: Write + Send> RecordBatchWriter for ArrowWriter<W> {
245261
}
246262
}
247263

264+
/// Arrow-specific configuration settings for writing parquet files.
265+
///
266+
/// See [`ArrowWriter`] for how to configure the writer.
267+
#[derive(Debug, Clone, Default)]
268+
pub struct ArrowWriterOptions {
269+
properties: WriterProperties,
270+
skip_arrow_metadata: bool,
271+
}
272+
273+
impl ArrowWriterOptions {
274+
/// Creates a new [`ArrowWriterOptions`] with the default settings.
275+
pub fn new() -> Self {
276+
Self::default()
277+
}
278+
279+
/// Sets the [`WriterProperties`] for writing parquet files.
280+
pub fn with_properties(self, properties: WriterProperties) -> Self {
281+
Self { properties, ..self }
282+
}
283+
284+
/// Parquet files generated by the [`ArrowWriter`] contain an embedded arrow schema
285+
/// by default.
286+
///
287+
/// Set `skip_arrow_metadata` to true to skip encoding this.
288+
pub fn with_skip_arrow_metadata(self, skip_arrow_metadata: bool) -> Self {
289+
Self {
290+
skip_arrow_metadata,
291+
..self
292+
}
293+
}
294+
}
295+
248296
/// A single column chunk produced by [`ArrowColumnWriter`]
249297
#[derive(Default)]
250298
struct ArrowColumnChunkData {
@@ -904,6 +952,7 @@ mod tests {
904952
use std::sync::Arc;
905953

906954
use crate::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
955+
use crate::arrow::ARROW_SCHEMA_META_KEY;
907956
use arrow::datatypes::ToByteSlice;
908957
use arrow::datatypes::{DataType, Field, Schema, UInt32Type, UInt8Type};
909958
use arrow::error::Result as ArrowResult;
@@ -2882,4 +2931,36 @@ mod tests {
28822931
let b_idx = &column_index[0][1];
28832932
assert!(matches!(b_idx, Index::NONE), "{b_idx:?}");
28842933
}
2934+
2935+
#[test]
2936+
fn test_arrow_writer_skip_metadata() {
2937+
let batch_schema = Schema::new(vec![Field::new("int32", DataType::Int32, false)]);
2938+
let file_schema = Arc::new(batch_schema.clone());
2939+
2940+
let batch = RecordBatch::try_new(
2941+
Arc::new(batch_schema),
2942+
vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as _],
2943+
)
2944+
.unwrap();
2945+
let skip_options = ArrowWriterOptions::new().with_skip_arrow_metadata(true);
2946+
2947+
let mut buf = Vec::with_capacity(1024);
2948+
let mut writer =
2949+
ArrowWriter::try_new_with_options(&mut buf, file_schema.clone(), skip_options).unwrap();
2950+
writer.write(&batch).unwrap();
2951+
writer.close().unwrap();
2952+
2953+
let bytes = Bytes::from(buf);
2954+
let reader_builder = ParquetRecordBatchReaderBuilder::try_new(bytes).unwrap();
2955+
assert_eq!(file_schema, *reader_builder.schema());
2956+
if let Some(key_value_metadata) = reader_builder
2957+
.metadata()
2958+
.file_metadata()
2959+
.key_value_metadata()
2960+
{
2961+
assert!(!key_value_metadata
2962+
.iter()
2963+
.any(|kv| kv.key.as_str() == ARROW_SCHEMA_META_KEY));
2964+
}
2965+
}
28852966
}

parquet/src/arrow/async_writer/mod.rs

+22-1
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
use std::{io::Write, sync::Arc};
5555

5656
use crate::{
57+
arrow::arrow_writer::ArrowWriterOptions,
5758
arrow::ArrowWriter,
5859
errors::{ParquetError, Result},
5960
file::properties::WriterProperties,
@@ -97,9 +98,29 @@ impl<W: AsyncWrite + Unpin + Send> AsyncArrowWriter<W> {
9798
arrow_schema: SchemaRef,
9899
buffer_size: usize,
99100
props: Option<WriterProperties>,
101+
) -> Result<Self> {
102+
let options = ArrowWriterOptions::new().with_properties(props.unwrap_or_default());
103+
Self::try_new_with_options(writer, arrow_schema, buffer_size, options)
104+
}
105+
106+
/// Try to create a new Async Arrow Writer with [`ArrowWriterOptions`].
107+
///
108+
/// `buffer_size` determines the number of bytes to buffer before flushing
109+
/// to the underlying [`AsyncWrite`]
110+
///
111+
/// The intermediate buffer will automatically be resized if necessary
112+
///
113+
/// [`Self::write`] will flush this intermediate buffer if it is at least
114+
/// half full
115+
pub fn try_new_with_options(
116+
writer: W,
117+
arrow_schema: SchemaRef,
118+
buffer_size: usize,
119+
options: ArrowWriterOptions,
100120
) -> Result<Self> {
101121
let shared_buffer = SharedBuffer::new(buffer_size);
102-
let sync_writer = ArrowWriter::try_new(shared_buffer.clone(), arrow_schema, props)?;
122+
let sync_writer =
123+
ArrowWriter::try_new_with_options(shared_buffer.clone(), arrow_schema, options)?;
103124

104125
Ok(Self {
105126
sync_writer,

0 commit comments

Comments
 (0)