[Data] Support partition_cols in write_parquet #49411
@@ -1,7 +1,8 @@
 import logging
 import posixpath
-from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, Optional
+from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional
 
+from ray.data._internal.arrow_ops.transform_pyarrow import concat
 from ray.data._internal.execution.interfaces import TaskContext
 from ray.data._internal.util import call_with_retry
 from ray.data.block import Block, BlockAccessor
@@ -24,6 +25,7 @@ def __init__(
         self,
         path: str,
         *,
+        partition_cols: Optional[List[str]] = None,
         arrow_parquet_args_fn: Optional[Callable[[], Dict[str, Any]]] = None,
         arrow_parquet_args: Optional[Dict[str, Any]] = None,
         num_rows_per_file: Optional[int] = None,
@@ -42,6 +44,7 @@ def __init__(
         self.arrow_parquet_args_fn = arrow_parquet_args_fn
         self.arrow_parquet_args = arrow_parquet_args
         self.num_rows_per_file = num_rows_per_file
+        self.partition_cols = partition_cols
 
         super().__init__(
             path,
@@ -59,7 +62,6 @@ def write(
         ctx: TaskContext,
     ) -> None:
         import pyarrow as pa
-        import pyarrow.parquet as pq
 
         blocks = list(blocks)
 
@@ -69,34 +71,102 @@ def write(
         filename = self.filename_provider.get_filename_for_block(
             blocks[0], ctx.task_idx, 0
         )
-        write_path = posixpath.join(self.path, filename)
         write_kwargs = _resolve_kwargs(
             self.arrow_parquet_args_fn, **self.arrow_parquet_args
         )
         user_schema = write_kwargs.pop("schema", None)
 
         def write_blocks_to_path():
-            with self.open_output_stream(write_path) as file:
-                tables = [BlockAccessor.for_block(block).to_arrow() for block in blocks]
-                if user_schema is None:
-                    output_schema = pa.unify_schemas([table.schema for table in tables])
-                else:
-                    output_schema = user_schema
+            tables = [BlockAccessor.for_block(block).to_arrow() for block in blocks]
+            if user_schema is None:
+                output_schema = pa.unify_schemas([table.schema for table in tables])
+            else:
+                output_schema = user_schema
 
-                with pq.ParquetWriter(file, output_schema, **write_kwargs) as writer:
-                    for table in tables:
-                        table = table.cast(output_schema)
-                        writer.write_table(table)
+            if not self.partition_cols:
+                self._write_single_file(tables, filename, output_schema, write_kwargs)
+            else:  # partition writes
+                self._write_partition_files(
+                    tables, filename, output_schema, write_kwargs
+                )
 
-        logger.debug(f"Writing {write_path} file.")
+        logger.debug(f"Writing {filename} file to {self.path}.")
+
         call_with_retry(
             write_blocks_to_path,
-            description=f"write '{write_path}'",
+            description=f"write '{filename}' to '{self.path}'",
             match=DataContext.get_current().retried_io_errors,
             max_attempts=WRITE_FILE_MAX_ATTEMPTS,
             max_backoff_s=WRITE_FILE_RETRY_MAX_BACKOFF_SECONDS,
         )
 
+    def _write_single_file(
+        self,
+        tables: List["pyarrow.Table"],
+        filename: str,
+        output_schema: "pyarrow.Schema",
+        write_kwargs: Dict[str, Any],
+    ) -> None:
+        import pyarrow.parquet as pq
+
+        write_path = posixpath.join(self.path, filename)
+        with self.open_output_stream(write_path) as file:
+            with pq.ParquetWriter(file, output_schema, **write_kwargs) as writer:
+                for table in tables:
+                    table = table.cast(output_schema)
+                    writer.write_table(table)
+
+    def _write_partition_files(
+        self,
+        tables: List["pyarrow.Table"],
+        filename: str,
+        output_schema: "pyarrow.Schema",
+        write_kwargs: Dict[str, Any],
+    ) -> None:
+        import pyarrow as pa
+        import pyarrow.parquet as pq
+
+        table = concat(tables)
+        # Create unique combinations of the partition columns
+        table_fields = [
+            field for field in output_schema if field.name not in self.partition_cols
+        ]
+        non_partition_cols = [f.name for f in table_fields]
+        output_schema = pa.schema(
+            [field for field in output_schema if field.name not in self.partition_cols]
+        )
+        # Group the table by the partition keys.
+        # For each partition key combination, fetch the list of values
+        # for the non-partition columns.
+        # Example: here the original table contains
+        # two columns (a, b). We are partitioning by column a. The schema
+        # of the grouped `groups` table is as follows:
+        #   b_list: [[[0,0],[1,1],[2,2]]]
+        #   a: [[1,2,3]]
+        groups = table.group_by(self.partition_cols).aggregate(
+            [(col_name, "list") for col_name in non_partition_cols]
+        )
+        grouped_keys = [groups.column(k) for k in self.partition_cols]
+
+        for i in range(groups.num_rows):
+            # See https://github.com/apache/arrow/issues/14882 for recommended approach
+            values = [
+                groups.column(f"{col.name}_list")[i].values for col in table_fields
+            ]
Comment on lines +153 to +155: Was confused because "col" refers to the string column name in
+            group_table = pa.Table.from_arrays(values, names=non_partition_cols)
+            partition_path = "/".join(
+                [
+                    f"{col}={values[i]}"
+                    for col, values in zip(self.partition_cols, grouped_keys)
+                ]
+            )
+            write_path = posixpath.join(self.path, partition_path)
+            self._create_dir(write_path)
+            write_path = posixpath.join(write_path, filename)
+            with self.open_output_stream(write_path) as file:
+                with pq.ParquetWriter(file, output_schema, **write_kwargs) as writer:
+                    writer.write_table(group_table)
+
     @property
     def num_rows_per_write(self) -> Optional[int]:
         return self.num_rows_per_file
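For context on the grouping step above: the partitioned write leans on PyArrow's `Table.group_by(...).aggregate([(col, "list")])` to collapse the non-partition columns into one list per partition-key combination, and then rebuilds a small table per group. The sketch below reproduces that pattern with plain PyArrow outside of Ray; the table and column names (`a`, `b`) are made up for illustration.

```python
import pyarrow as pa

# Toy table: partition by "a", keep "b" as the data column.
table = pa.table({"a": [1, 1, 2, 2, 3, 3], "b": [0, 0, 1, 1, 2, 2]})

# One output row per distinct value of "a"; "b" is collapsed into a list per
# group and exposed as a new column named "b_list".
groups = table.group_by(["a"]).aggregate([("b", "list")])

for i in range(groups.num_rows):
    key = groups.column("a")[i].as_py()
    # ListScalar.values flattens the per-group list back into a plain array.
    values = groups.column("b_list")[i].values
    group_table = pa.Table.from_arrays([values], names=["b"])
    # Hive-style directory name for this partition, e.g. "a=1".
    print(f"a={key}", "->", group_table.num_rows, "rows")
```

In `_write_partition_files`, the same loop additionally builds the Hive-style path from `f"{col}={value}"` segments and writes one Parquet file per partition directory.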
@@ -2976,6 +2976,7 @@ def write_parquet(
         self,
         path: str,
         *,
+        partition_cols: Optional[List[str]] = None,
         filesystem: Optional["pyarrow.fs.FileSystem"] = None,
         try_create_dir: bool = True,
         arrow_open_stream_args: Optional[Dict[str, Any]] = None,
@@ -3009,6 +3010,8 @@ def write_parquet(
         Args:
             path: The path to the destination root directory, where
                 parquet files are written to.
+            partition_cols: Column names by which to partition the dataset.
+                Files are writted in Hive partition style.
Comment on lines +3013 to +3014: Nit: Use active voice (from our style guide: https://developers.google.com/style/voice). Also, typo with "writted".

             filesystem: The pyarrow filesystem implementation to write to.
                 These filesystems are specified in the
                 `pyarrow docs <https://arrow.apache.org/docs\
@@ -3057,8 +3060,15 @@ def write_parquet(
         if arrow_parquet_args_fn is None:
             arrow_parquet_args_fn = lambda: {}  # noqa: E731
 
+        if partition_cols and num_rows_per_file:
+            raise ValueError(
+                "Cannot pass num_rows_per_file when partition_cols "
+                "argument is specified"
+            )
+
         datasink = ParquetDatasink(
             path,
+            partition_cols=partition_cols,
             arrow_parquet_args_fn=arrow_parquet_args_fn,
             arrow_parquet_args=arrow_parquet_args,
             num_rows_per_file=num_rows_per_file,
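For the user-facing side, here is a minimal usage sketch (the output path and column name are illustrative, not taken from the PR): each distinct value of a partition column becomes a Hive-style `<column>=<value>` directory under the destination root, and combining `partition_cols` with `num_rows_per_file` raises the `ValueError` added above.

```python
import ray

ds = ray.data.from_items(
    [{"country": "US", "value": 1}, {"country": "CA", "value": 2}]
)

# Writes e.g. /tmp/out/country=US/<filename>.parquet and /tmp/out/country=CA/<filename>.parquet
ds.write_parquet("/tmp/out", partition_cols=["country"])

# Mutually exclusive with num_rows_per_file:
# ds.write_parquet("/tmp/out", partition_cols=["country"], num_rows_per_file=10)
# -> ValueError: Cannot pass num_rows_per_file when partition_cols argument is specified
```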
@@ -79,6 +79,9 @@ def open_output_stream(self, path: str) -> "pyarrow.NativeFile":
         return self.filesystem.open_output_stream(path, **self.open_stream_args)
 
     def on_write_start(self) -> None:
+        self.has_created_dir = self._create_dir(self.path)
+
+    def _create_dir(self, dest) -> bool:
Comment: Do we use the
"""Create a directory to write files to. | ||||||
|
||||||
If ``try_create_dir`` is ``False``, this method is a no-op. | ||||||
|
@@ -96,19 +99,21 @@ def on_write_start(self) -> None:
         # a policy only allows users to write blobs prefixed with s3://bucket/foo
         # a call to create_dir for s3://bucket/foo/bar will fail even though it
         # should not.
-        parsed_uri = urlparse(self.path)
+        parsed_uri = urlparse(dest)
         is_s3_uri = parsed_uri.scheme == "s3"
         skip_create_dir_for_s3 = (
             is_s3_uri and not DataContext.get_current().s3_try_create_dir
         )
 
         if self.try_create_dir and not skip_create_dir_for_s3:
-            if self.filesystem.get_file_info(self.path).type is FileType.NotFound:
+            if self.filesystem.get_file_info(dest).type is FileType.NotFound:
                 # Arrow's S3FileSystem doesn't allow creating buckets by default, so we
                 # add a query arg enabling bucket creation if an S3 URI is provided.
-                tmp = _add_creatable_buckets_param_if_s3_uri(self.path)
+                tmp = _add_creatable_buckets_param_if_s3_uri(dest)
                 self.filesystem.create_dir(tmp, recursive=True)
-                self.has_created_dir = True
+                return True
+
+        return False
 
     def write(
         self,
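To summarize the refactor in this file: directory creation moves out of `on_write_start` into a reusable `_create_dir(dest)` helper that reports whether it actually created anything, so partitioned writes can call it once per `<col>=<value>` subdirectory. A rough, simplified sketch of that contract (illustration only; the real implementation also handles the S3 and `try_create_dir` cases shown in the diff):

```python
import posixpath


class DirCreationSketch:
    """Illustration only: models the return contract of _create_dir."""

    def __init__(self, path: str, try_create_dir: bool = True):
        self.path = path
        self.try_create_dir = try_create_dir
        self.has_created_dir = False

    def on_write_start(self) -> None:
        # The root directory is still created once, up front.
        self.has_created_dir = self._create_dir(self.path)

    def _create_dir(self, dest: str) -> bool:
        # Returns True only if a directory was actually created.
        if not self.try_create_dir:
            return False
        print(f"create_dir({dest})")
        return True

    def write_partition(self, partition_dir: str) -> None:
        # Partitioned writes reuse the helper per Hive-style subdirectory.
        self._create_dir(posixpath.join(self.path, partition_dir))
```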