@@ -119,11 +119,27 @@ impl<W: Write + Send> ArrowWriter<W> {
         writer: W,
         arrow_schema: SchemaRef,
         props: Option<WriterProperties>,
+    ) -> Result<Self> {
+        let options = ArrowWriterOptions::new().with_properties(props.unwrap_or_default());
+        Self::try_new_with_options(writer, arrow_schema, options)
+    }
+
+    /// Try to create a new Arrow writer with [`ArrowWriterOptions`].
+    ///
+    /// The writer will fail if:
+    /// * a `SerializedFileWriter` cannot be created from the ParquetWriter
+    /// * the Arrow schema contains unsupported datatypes such as Unions
+    pub fn try_new_with_options(
+        writer: W,
+        arrow_schema: SchemaRef,
+        options: ArrowWriterOptions,
     ) -> Result<Self> {
         let schema = arrow_to_parquet_schema(&arrow_schema)?;
-        // add serialized arrow schema
-        let mut props = props.unwrap_or_default();
-        add_encoded_arrow_schema_to_metadata(&arrow_schema, &mut props);
+        let mut props = options.properties;
+        if !options.skip_arrow_metadata {
+            // add serialized arrow schema
+            add_encoded_arrow_schema_to_metadata(&arrow_schema, &mut props);
+        }

         let max_row_group_size = props.max_row_group_size();
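For context, a minimal sketch of the new entry point, assuming the `parquet` crate with its `arrow` feature enabled and the module paths used elsewhere in this crate (the `id` column and the in-memory buffer are illustrative):

```rust
use std::sync::Arc;

use arrow::array::Int32Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use parquet::arrow::arrow_writer::{ArrowWriter, ArrowWriterOptions};
use parquet::errors::Result;

fn write_batch() -> Result<Vec<u8>> {
    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as _],
    )?;

    // With default options this behaves exactly like `ArrowWriter::try_new`,
    // since `try_new` now delegates to `try_new_with_options`.
    let options = ArrowWriterOptions::new();

    let mut buf = Vec::new();
    let mut writer = ArrowWriter::try_new_with_options(&mut buf, schema, options)?;
    writer.write(&batch)?;
    writer.close()?;
    Ok(buf)
}
```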
@@ -245,6 +261,38 @@ impl<W: Write + Send> RecordBatchWriter for ArrowWriter<W> {
     }
 }

+/// Arrow-specific configuration settings for writing parquet files.
+///
+/// See [`ArrowWriter`] for how to configure the writer.
+#[derive(Debug, Clone, Default)]
+pub struct ArrowWriterOptions {
+    properties: WriterProperties,
+    skip_arrow_metadata: bool,
+}
+
+impl ArrowWriterOptions {
+    /// Creates a new [`ArrowWriterOptions`] with the default settings.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Sets the [`WriterProperties`] for writing parquet files.
+    pub fn with_properties(self, properties: WriterProperties) -> Self {
+        Self { properties, ..self }
+    }
+
+    /// Parquet files generated by the [`ArrowWriter`] contain an embedded
+    /// Arrow schema by default.
+    ///
+    /// Set `skip_arrow_metadata` to `true` to skip encoding it.
+    pub fn with_skip_arrow_metadata(self, skip_arrow_metadata: bool) -> Self {
+        Self {
+            skip_arrow_metadata,
+            ..self
+        }
+    }
+}
+
 /// A single column chunk produced by [`ArrowColumnWriter`]
 #[derive(Default)]
 struct ArrowColumnChunkData {
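Since `ArrowWriterOptions` derives `Default` and its setters consume `self`, the options compose as a simple builder chain. A sketch of configuring it in isolation (the row-group size is an arbitrary illustration, not a recommendation):

```rust
use parquet::arrow::arrow_writer::ArrowWriterOptions;
use parquet::file::properties::WriterProperties;

fn example_options() -> ArrowWriterOptions {
    // `new()` is equivalent to `ArrowWriterOptions::default()`: default
    // `WriterProperties`, and the serialized Arrow schema *is* embedded.
    ArrowWriterOptions::new()
        // Tune the underlying Parquet writer as usual...
        .with_properties(
            WriterProperties::builder()
                .set_max_row_group_size(64 * 1024)
                .build(),
        )
        // ...and opt out of the `ARROW:schema` key-value metadata entry.
        .with_skip_arrow_metadata(true)
}
```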
@@ -904,6 +952,7 @@ mod tests {
     use std::sync::Arc;

     use crate::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
+    use crate::arrow::ARROW_SCHEMA_META_KEY;
     use arrow::datatypes::ToByteSlice;
     use arrow::datatypes::{DataType, Field, Schema, UInt32Type, UInt8Type};
     use arrow::error::Result as ArrowResult;
@@ -2882,4 +2931,36 @@ mod tests {
         let b_idx = &column_index[0][1];
         assert!(matches!(b_idx, Index::NONE), "{b_idx:?}");
     }
+
+    #[test]
+    fn test_arrow_writer_skip_metadata() {
+        let batch_schema = Schema::new(vec![Field::new("int32", DataType::Int32, false)]);
+        let file_schema = Arc::new(batch_schema.clone());
+
+        let batch = RecordBatch::try_new(
+            Arc::new(batch_schema),
+            vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as _],
+        )
+        .unwrap();
+        let skip_options = ArrowWriterOptions::new().with_skip_arrow_metadata(true);
+
+        let mut buf = Vec::with_capacity(1024);
+        let mut writer =
+            ArrowWriter::try_new_with_options(&mut buf, file_schema.clone(), skip_options).unwrap();
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+
+        let bytes = Bytes::from(buf);
+        let reader_builder = ParquetRecordBatchReaderBuilder::try_new(bytes).unwrap();
+        assert_eq!(file_schema, *reader_builder.schema());
+        if let Some(key_value_metadata) = reader_builder
+            .metadata()
+            .file_metadata()
+            .key_value_metadata()
+        {
+            assert!(!key_value_metadata
+                .iter()
+                .any(|kv| kv.key.as_str() == ARROW_SCHEMA_META_KEY));
+        }
+    }
 }