Skip to content

Commit

Permalink
[FEA] Expose stripe_size_rows setting for ORCWriterOptions (#17927)
Browse files Browse the repository at this point in the history
closes #17785

This PR exposes the `stripe_size_rows` setting through the `ORCWriterOptions` Java API.

The setting is exposed solely to make certain tests more convenient to write.

Authors:
  - https://github.com/ustcfy
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Chong Gao (https://github.com/res-life)

URL: #17927
  • Loading branch information
ustcfy authored Feb 24, 2025
1 parent 4be30a1 commit d0e219e
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 1 deletion.
19 changes: 18 additions & 1 deletion java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -23,17 +23,34 @@
* that will be used by the ORC writer to write the file.
*/
public class ORCWriterOptions extends CompressionMetadataWriterOptions {
private int stripeSizeRows;

/**
 * Creates the options from a configured builder. Instances are only created
 * through {@link Builder#build()}.
 *
 * @param builder builder handed to the superclass constructor and used as the
 *                source of the stripe row count
 */
private ORCWriterOptions(Builder builder) {
  super(builder);
  // No local shadows the field here, so the unqualified name refers to it.
  stripeSizeRows = builder.stripeSizeRows;
}

/**
 * Creates a new builder for {@link ORCWriterOptions}.
 *
 * @return a freshly constructed {@link Builder}
 */
public static Builder builder() {
  Builder freshBuilder = new Builder();
  return freshBuilder;
}

/**
 * Returns the stripe size, in rows, that was captured from the builder.
 *
 * @return the configured stripe row count
 */
public int getStripeSizeRows() {
  return this.stripeSizeRows;
}

public static class Builder extends CompressionMetadataWriterOptions.Builder
<Builder, ORCWriterOptions> {
// Default ORC stripe size of 1M rows, matching the default defined in cudf/cpp/include/cudf/io/orc.hpp
private int stripeSizeRows = 1000000;

/**
 * Sets the maximum stripe size, expressed in rows, for the ORC writer.
 *
 * @param stripeSizeRows the maximum stripe size in rows; must be at least 512
 * @return this builder, so calls can be chained
 * @throws IllegalArgumentException if {@code stripeSizeRows} is smaller than 512
 */
public Builder withStripeSizeRows(int stripeSizeRows) {
  // Fail fast on values below the 512-row minimum. The message keeps its
  // original wording as a prefix and appends the rejected value so the
  // failure can be diagnosed from the exception alone.
  if (stripeSizeRows < 512) {
    throw new IllegalArgumentException(
        "Maximum stripe size cannot be smaller than 512, but got " + stripeSizeRows);
  }
  this.stripeSizeRows = stripeSizeRows;
  return this;
}

public ORCWriterOptions build() {
return new ORCWriterOptions(this);
Expand Down
4 changes: 4 additions & 0 deletions java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,7 @@ private static native long writeORCFileBegin(String[] columnNames,
int compression,
int[] precisions,
boolean[] isMapValues,
int stripeSizeRows,
String filename) throws CudfException;

/**
Expand All @@ -501,6 +502,7 @@ private static native long writeORCBufferBegin(String[] columnNames,
int compression,
int[] precisions,
boolean[] isMapValues,
int stripeSizeRows,
HostBufferConsumer consumer,
HostMemoryAllocator hostMemoryAllocator
) throws CudfException;
Expand Down Expand Up @@ -1823,6 +1825,7 @@ private ORCTableWriter(ORCWriterOptions options, File outputFile) {
options.getCompressionType().nativeId,
options.getFlatPrecision(),
options.getFlatIsMap(),
options.getStripeSizeRows(),
outputFile.getAbsolutePath()));
this.consumer = null;
}
Expand All @@ -1838,6 +1841,7 @@ private ORCTableWriter(ORCWriterOptions options, HostBufferConsumer consumer,
options.getCompressionType().nativeId,
options.getFlatPrecision(),
options.getFlatIsMap(),
options.getStripeSizeRows(),
consumer, hostMemoryAllocator));
this.consumer = consumer;
}
Expand Down
4 changes: 4 additions & 0 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2480,6 +2480,7 @@ Java_ai_rapids_cudf_Table_writeORCBufferBegin(JNIEnv* env,
jint j_compression,
jintArray j_precisions,
jbooleanArray j_is_map,
jint j_stripe_size_rows,
jobject consumer,
jobject host_memory_allocator)
{
Expand Down Expand Up @@ -2535,6 +2536,7 @@ Java_ai_rapids_cudf_Table_writeORCBufferBegin(JNIEnv* env,
.enable_statistics(ORC_STATISTICS_ROW_GROUP)
.key_value_metadata(kv_metadata)
.compression_statistics(stats)
.stripe_size_rows(j_stripe_size_rows)
.build();
auto writer_ptr = std::make_unique<cudf::io::orc_chunked_writer>(opts);
cudf::jni::native_orc_writer_handle* ret = new cudf::jni::native_orc_writer_handle(
Expand All @@ -2555,6 +2557,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin(JNIEnv* env,
jint j_compression,
jintArray j_precisions,
jbooleanArray j_is_map,
jint j_stripe_size_rows,
jstring j_output_path)
{
JNI_NULL_CHECK(env, j_col_names, "null columns", 0);
Expand Down Expand Up @@ -2606,6 +2609,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin(JNIEnv* env,
.enable_statistics(ORC_STATISTICS_ROW_GROUP)
.key_value_metadata(kv_metadata)
.compression_statistics(stats)
.stripe_size_rows(j_stripe_size_rows)
.build();
auto writer_ptr = std::make_unique<cudf::io::orc_chunked_writer>(opts);
cudf::jni::native_orc_writer_handle* ret =
Expand Down

0 comments on commit d0e219e

Please sign in to comment.