Skip to content

Commit

Permalink
Bump version to 0.9.3.19; refactor schema unification logic and add u…
Browse files Browse the repository at this point in the history
…nify_schemas_pl function
  • Loading branch information
legout committed Dec 18, 2024
1 parent 09934fa commit 269b921
Show file tree
Hide file tree
Showing 5 changed files with 1,206 additions and 6 deletions.
26 changes: 26 additions & 0 deletions pydala/helpers/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from .datetime import timestamp_from_string
from .polars import pl
from ..schema import convert_large_types_to_normal

# Compile regex patterns once for efficiency
SPLIT_PATTERN = re.compile(
Expand Down Expand Up @@ -584,3 +585,28 @@ def get_nested_keys(d, parent_key=""):
if isinstance(v, dict):
keys.extend(get_nested_keys(v, new_key))
return keys


def unify_schemas_pl(
schemas: list[pa.Schema], convert_large_types: bool = True
) -> pl.Schema:
"""
Unifies a list of Pyarrow schemas into a single schema.
Args:
schemas (list[pl.Schema]): List of Polars schemas.
Returns:
pa.Schema: Unified schema.
"""
schema = (
pl.concat(
[pa.Table.from_pydict([], schema=schema) for schema in schemas],
how="vertical_relaxed",
)
.to_arrow()
.schema
)
if convert_large_types:
schema = convert_large_types_to_normal(schema)
return schema
1 change: 0 additions & 1 deletion pydala/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,6 @@ def cast_schema(
alter_schema (bool, optional): Whether to alter the schema. Defaults to False.
"""

self._to_arrow()
self._set_schema()
self._use_large_string = use_large_string
if not use_large_string:
Expand Down
12 changes: 8 additions & 4 deletions pydala/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
from loguru import logger
from .filesystem import FileSystem, clear_cache


# from .helpers.metadata import collect_parquet_metadata # , remove_from_metadata
from .helpers.misc import get_partitions_from_path, run_parallel
from .helpers.misc import get_partitions_from_path, run_parallel, unify_schemas_pl
from .schema import repair_schema, convert_large_types_to_normal # unify_schemas


Expand Down Expand Up @@ -368,10 +369,13 @@ def _get_unified_schema(

if self.has_metadata:
schemas.insert(0, self.metadata.schema.to_arrow_schema())
try:
unified_schema = convert_large_types_to_normal(
pa.unify_schemas(schemas, promote_options="permissive")
)
except pa.lib.ArrowTypeError:
unified_schema = unify_schemas_pl(schemas, convert_large_types=True)

unified_schema = convert_large_types_to_normal(
pa.unify_schemas(schemas, promote_options="permissive")
)
schemas_equal = all([unified_schema == schema for schema in schemas])
else:
unified_schema = convert_large_types_to_normal(
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ name = "pydala2"
readme = "README.md"
#repository = "https://github.com/legout/pydala2"
requires-python = ">= 3.10"
version = "0.9.3.18"
version = "0.9.3.19"

[project.optional-dependencies]
legacy = ["polars-lts-cpu>=0.20.4"]
Expand Down
Loading

0 comments on commit 269b921

Please sign in to comment.