Commit b22c10f

Update version to 0.9.3.16 and replace shrink_large_string with convert_large_types_to_normal in schema handling
1 parent d4f310c commit b22c10f

6 files changed (+84 -56 lines)

pydala/dataset.py (+7 -14)

@@ -20,7 +20,7 @@
 from .io import Writer
 from .metadata import ParquetDatasetMetadata, PydalaDatasetMetadata
 from .schema import replace_schema  # from .optimize import Optimize
-from .schema import shrink_large_string
+from .schema import convert_large_types_to_normal
 from .table import PydalaTable
 
 
@@ -630,7 +630,7 @@ def write_to_dataset(
         ts_unit: str = "us",
         tz: str | None = None,
         remove_tz: bool = False,
-        use_large_string: bool = False,
+        # use_large_string: bool = False,
         delta_subset: str | list[str] | None = None,
         alter_schema: bool = False,
         timestamp_column: str | None = None,
@@ -662,7 +662,6 @@ def write_to_dataset(
         - ts_unit: The unit of the timestamp column. Defaults to "us".
         - tz: The timezone to be used for the timestamp column. Defaults to None.
         - remove_tz: Whether to remove the timezone information from the timestamp column. Defaults to False.
-        - use_large_string: Whether to use large string type for string columns. Defaults to False.
         - delta_subset: The subset of columns to consider for delta updates. Can be a string, a list of strings, or
           None. Defaults to None.
         - alter_schema: Whether to alter the schema of the dataset. Defaults to False.
@@ -711,7 +710,7 @@ def write_to_dataset(
         )
 
         writer.cast_schema(
-            use_large_string=use_large_string,
+            # use_large_string=use_large_string,
             ts_unit=ts_unit,
             tz=tz,
             remove_tz=remove_tz,
@@ -819,7 +818,7 @@ def load(
         schema: pa.Schema | None = None,
         ts_unit: str = "us",
         tz: str | None = None,
-        use_large_string: bool = False,
+        # use_large_types: bool = False,
        format_version: str = "2.6",
         verbose: bool = False,
         **kwargs,
@@ -833,7 +832,6 @@ def load(
         schema (pa.Schema | None, optional): The schema of the data. Defaults to None.
         ts_unit (str, optional): The unit of the timestamp. Defaults to "us".
         tz (str | None, optional): The timezone. Defaults to None.
-        use_large_string (bool, optional): Whether to use large string. Defaults to False.
         format_version (str, optional): The version of the data format. Defaults to "2.6".
         **kwargs: Additional keyword arguments.
 
@@ -856,7 +854,6 @@ def load(
             schema=schema,
             ts_unit=ts_unit,
             tz=tz,
-            use_large_string=use_large_string,
             format_version=format_version,
             verbose=verbose,
             **kwargs,
@@ -974,7 +971,6 @@ def write_to_dataset(
         ts_unit: str = "us",
         tz: str | None = None,
         remove_tz: bool = False,
-        use_large_string: bool = False,
         delta_subset: str | list[str] | None = None,
         update_metadata: bool = False,
         alter_schema: bool = False,
@@ -1007,7 +1003,6 @@ def write_to_dataset(
         - ts_unit: The unit of the timestamp column. Defaults to "us".
         - tz: The timezone to be used for the timestamp column. Defaults to None.
         - remove_tz: Whether to remove the timezone information from the timestamp column. Defaults to False.
-        - use_large_string: Whether to use large string type for string columns. Defaults to False.
         - delta_subset: The subset of columns to consider for delta updates. Can be a string, a list of strings, or
           None. Defaults to None.
         - update_metadata: Whether to update the metadata table after writing. Defaults to False.
@@ -1099,7 +1094,6 @@ def write_to_dataset(
             ts_unit=ts_unit,
             tz=tz,
             remove_tz=remove_tz,
-            use_large_string=use_large_string,
            delta_subset=delta_subset,
             alter_schema=alter_schema,
             timestamp_column=timestamp_column,
@@ -1560,7 +1554,6 @@ def optimize_dtypes(
         include: str | list[str] | None = None,
         ts_unit: str | None = None,  # "us",
         tz: str | None = None,
-        use_large_string: bool = False,
         infer_schema_size: int = 10_000,
         **kwargs,
     ):
@@ -1572,8 +1565,8 @@ def optimize_dtypes(
             .to_arrow()
             .schema
         )
-        if not use_large_string:
-            optimized_schema = shrink_large_string(optimized_schema)
+
+        optimized_schema = convert_large_types_to_normal(optimized_schema)
 
         for file_path in tqdm.tqdm(self.files):
             self._optimize_dtypes(
@@ -1584,7 +1577,7 @@ def optimize_dtypes(
                 include=include,
                 ts_unit=ts_unit,
                 tz=tz,
-                use_large_string=use_large_string,
+                # use_large_string=use_large_string,
                 **kwargs,
             )

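For context, optimize_dtypes previously shrank large string types only when use_large_string was False; after this commit the inferred schema is always normalized. A minimal sketch of that normalization step (a hypothetical standalone usage, assuming pyarrow is installed and pydala.schema exports convert_large_types_to_normal as this commit introduces):

import pyarrow as pa
from pydala.schema import convert_large_types_to_normal

# a table whose schema still carries Arrow "large" types
tbl = pa.table({"name": pa.array(["a", "b"], type=pa.large_string())})

# normalize the schema and cast the table to it, as optimize_dtypes now does unconditionally
tbl = tbl.cast(convert_large_types_to_normal(tbl.schema))
print(tbl.schema)  # name: string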
pydala/filesystem.py (+4 -4)

@@ -24,7 +24,7 @@
 from loguru import logger
 
 from .helpers.misc import read_table, run_parallel
-from .schema import shrink_large_string
+from .schema import convert_large_types_to_normal
 
 
 def get_credentials_from_fssspec(fs: AbstractFileSystem) -> dict[str, str]:
@@ -452,7 +452,7 @@ def write_parquet(
 ) -> None:
     if isinstance(data, pl.DataFrame):
         data = data.to_arrow()
-        data = data.cast(shrink_large_string(data.schema))
+        data = data.cast(convert_large_types_to_normal(data.schema))
     elif isinstance(data, pd.DataFrame):
         data = pa.Table.from_pandas(data, preserve_index=False)
     elif isinstance(data, ddb.DuckDBPyRelation):
@@ -468,7 +468,7 @@ def write_json(
 ) -> None:
     if isinstance(data, pl.DataFrame):
         data = data.to_arrow()
-        data = data.cast(shrink_large_string(data.schema)).to_pydict()
+        data = data.cast(convert_large_types_to_normal(data.schema)).to_pydict()
     elif isinstance(data, pd.DataFrame):
         data = pa.Table.from_pandas(data, preserve_index=False).to_pydict()
     elif isinstance(data, ddb.DuckDBPyRelation):
@@ -523,7 +523,7 @@ def write_to_pyarrow_dataset(
 
     if isinstance(data[0], pl.DataFrame):
         data = [dd.to_arrow() for dd in data]
-        data = [dd.cast(shrink_large_string(dd.schema)) for dd in data]
+        data = [dd.cast(convert_large_types_to_normal(dd.schema)) for dd in data]
 
     elif isinstance(data[0], pd.DataFrame):
         data = [pa.Table.from_pandas(dd, preserve_index=False) for dd in data]

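All three writers (write_parquet, write_json, write_to_pyarrow_dataset) now route polars input through the new helper. A minimal sketch of why the cast matters (a hypothetical snippet; polars' to_arrow() typically emits large_string columns):

import polars as pl
from pydala.schema import convert_large_types_to_normal

df = pl.DataFrame({"name": ["a", "b"]})
tbl = df.to_arrow()  # columns usually arrive as large_string
tbl = tbl.cast(convert_large_types_to_normal(tbl.schema))
# tbl now uses standard string/binary/list types, matching non-polars inputs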
pydala/io.py (+2 -2)

@@ -17,7 +17,7 @@
 from .filesystem import clear_cache
 from .helpers.datetime import get_timestamp_column
 from .helpers.polars import pl
-from .schema import convert_timestamp, replace_schema, shrink_large_string
+from .schema import convert_timestamp, replace_schema, convert_large_types_to_normal
 from .table import PydalaTable
 
 
@@ -270,7 +270,7 @@ def cast_schema(
         self._set_schema()
         self._use_large_string = use_large_string
         if not use_large_string:
-            self.schema = shrink_large_string(self.schema)
+            self.schema = convert_large_types_to_normal(self.schema)
 
         if tz is not None or ts_unit is not None or remove_tz:
             self.schema = convert_timestamp(

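Note that cast_schema keeps its use_large_string flag here; only the helper behind it changed. Compared with the retired shrink_large_string, which rewrote only large_string, the new helper also covers large_binary and large_list. A small sketch of that behavioral difference (assuming pyarrow; the commented expectation reflects the two implementations shown in this commit's schema.py diff):

import pyarrow as pa
from pydala.schema import convert_large_types_to_normal

schema = pa.schema([("s", pa.large_string()), ("b", pa.large_binary())])
# shrink_large_string(schema) would have returned:  s: string, b: large_binary
print(convert_large_types_to_normal(schema))
# s: string
# b: binary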
pydala/metadata.py (+18 -18)

@@ -16,7 +16,7 @@
 
 # from .helpers.metadata import collect_parquet_metadata  # , remove_from_metadata
 from .helpers.misc import get_partitions_from_path, run_parallel
-from .schema import repair_schema  # unify_schemas
+from .schema import repair_schema, convert_large_types_to_normal  # unify_schemas
 
 
 def collect_parquet_metadata(
@@ -308,7 +308,12 @@ def update_file_metadata(
         new_files += files
 
         if new_files:
+            if verbose:
+                logger.info(f"Collecting metadata for {len(new_files)} new files.")
             self._collect_file_metadata(files=new_files, verbose=verbose, **kwargs)
+        else:
+            if verbose:
+                logger.info("No new files to collect metadata for.")
 
         if rm_files:
             self._rm_file_metadata(files=rm_files)
@@ -341,10 +346,6 @@ def reset(self):
     def _get_unified_schema(
         self,
         verbose: bool = False,
-        # ts_unit: str | None = None,
-        # tz: str | None = None,
-        # use_large_string: bool = False,
-        # sort: bool | list[str] = False,
     ) -> tuple[pa.Schema, bool]:
         """
         Returns the unified schema for the dataset.
@@ -370,10 +371,14 @@ def _get_unified_schema(
             if self.has_metadata:
                 schemas.insert(0, self.metadata.schema.to_arrow_schema())
 
-            unified_schema = pa.unify_schemas(schemas, promote_options="permissive")
+            unified_schema = convert_large_types_to_normal(
+                pa.unify_schemas(schemas, promote_options="permissive")
+            )
             schemas_equal = all([unified_schema == schema for schema in schemas])
         else:
-            unified_schema = self.metadata.schema.to_arrow_schema()
+            unified_schema = convert_large_types_to_normal(
+                self.metadata.schema.to_arrow_schema()
+            )
             schemas_equal = True
         if verbose:
             logger.info(f"Schema is equal: {schemas_equal}")
@@ -386,8 +391,6 @@ def _repair_file_schemas(
         format_version: str | None = None,
         tz: str | None = None,
         ts_unit: str | None = None,
-        use_large_string: bool = False,
-        # sort: bool | list[str] = False,
         alter_schema: bool = True,
         verbose: bool = False,
         **kwargs,
@@ -402,8 +405,6 @@ def _repair_file_schemas(
                 the format version from the metadata will be used. Defaults to None.
             tz (str | None, optional): The timezone to use for repairing the files. Defaults to None.
             ts_unit (str | None, optional): The timestamp unit to use for repairing the files. Defaults to None.
-            use_large_string (bool, optional): Whether to use large string type for repairing the files.
-                Defaults to False.
             alter_schema (bool, optional): Whether to alter the schema of the files. Defaults to True.
             **kwargs: Additional keyword arguments to pass to the repair_schema function.
 
@@ -451,7 +452,6 @@ def _repair_file_schemas(
             version=format_version,
             ts_unit=ts_unit,
             tz=tz,
-            use_large_string=use_large_string,
            alter_schema=alter_schema,
             **kwargs,
         )
@@ -495,8 +495,8 @@ def _update_metadata(self, reload: bool = False, verbose: bool = False, **kwargs
             (set(self.files_in_file_metadata) - set(self.files_in_metadata))
         )
         if verbose:
-            logger.info("Number of files to remove: ", len(rm_files))
-            logger.info("Number of files to add: ", len(new_files))
+            logger.info(f"Number of files to remove: {len(rm_files)}")
+            logger.info(f"Number of files to add: {len(new_files)}")
         if len(rm_files) or (len(new_files) and not self.has_metadata) or reload:
             if verbose:
                 logger.info("Updateing metadata: Rewrite metadata from file metadata")
@@ -507,12 +507,15 @@ def _update_metadata(self, reload: bool = False, verbose: bool = False, **kwargs
             for f in self.files_in_file_metadata[1:]:
                 self._metadata.append_row_groups(self._file_metadata[f])
 
-        else:
+        elif len(new_files):
             if verbose:
                 logger.info("Updateing metadata: Append new file metadata")
 
             for f in new_files:
                 self._metadata.append_row_groups(self.file_metadata[f])
+        else:
+            if verbose:
+                logger.info("Updateing metadata: No changes")
 
         self._write_metadata_file()
         self.load_files()
@@ -523,7 +526,6 @@ def update(
         schema: pa.Schema | None = None,
         ts_unit: str | None = None,
         tz: str | None = None,
-        use_large_string: bool = False,
         format_version: str | None = None,
         # sort: bool | list[str] = False,
         verbose: bool = False,
@@ -537,7 +539,6 @@ def update(
             schema (pa.Schema | None): The schema of the data source.
             ts_unit (str | None): The unit of the timestamp.
            tz (str | None): The time zone of the data source.
-            use_large_string (bool): Flag to indicate whether to use large string type.
            format_version (str | None): The version of the data format.
             **kwargs: Additional keyword arguments.
 
@@ -559,7 +560,6 @@ def update(
             format_version=format_version,
             tz=tz,
             ts_unit=ts_unit,
-            use_large_string=use_large_string,
             verbose=verbose,
             # sort=sort,
         )

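The unified schema is now normalized on both branches, so downstream comparisons never flag a mismatch that is only string vs large_string. A minimal sketch of the new unification path (hypothetical schemas; promote_options requires a reasonably recent pyarrow):

import pyarrow as pa
from pydala.schema import convert_large_types_to_normal

s1 = pa.schema([("id", pa.int32()), ("name", pa.large_string())])
s2 = pa.schema([("id", pa.int64()), ("name", pa.large_string())])

# permissive promotion widens int32 -> int64; normalization then drops the large_ prefix
unified = convert_large_types_to_normal(
    pa.unify_schemas([s1, s2], promote_options="permissive")
)
print(unified)
# id: int64
# name: string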
pydala/schema.py (+52 -17)

@@ -29,21 +29,57 @@ def sort_schema(schema: pa.Schema, names: list[str] | None = None) -> pa.Schema:
     )
 
 
-def shrink_large_string(schema: pa.Schema) -> pa.Schema:
-    """Convert all large_string types to string in a pyarrow.schema.
+# def shrink_large_string(schema: pa.Schema) -> pa.Schema:
+#     """Convert all large_string types to string in a pyarrow.schema.
 
-    Args:
-        schema (pa.Schema): pyarrow schema
+#     Args:
+#         schema (pa.Schema): pyarrow schema
 
-    Returns:
-        pa.Schema: converted pyarrow.schema
-    """
-    return pa.schema(
-        [
-            (n, pa.utf8()) if t == pa.large_string() else (n, t)
-            for n, t in list(zip(schema.names, schema.types))
-        ]
-    )
+#     Returns:
+#         pa.Schema: converted pyarrow.schema
+#     """
+#     return pa.schema(
+#         [
+#             (n, pa.utf8()) if t == pa.large_string() else (n, t)
+#             for n, t in list(zip(schema.names, schema.types))
+#         ]
+#     )
+
+
+def convert_large_types_to_normal(schema: pa.Schema) -> pa.Schema:
+    # Define mapping of large types to standard types
+    type_mapping = {
+        pa.large_string(): pa.string(),
+        pa.large_binary(): pa.binary(),
+        pa.large_list(pa.null()): pa.list_(pa.null()),
+    }
+
+    # Convert fields
+    new_fields = []
+    for field in schema:
+        field_type = field.type
+        # Check if type exists in mapping
+        if field_type in type_mapping:
+            new_field = pa.field(
+                name=field.name,
+                type=type_mapping[field_type],
+                nullable=field.nullable,
+                metadata=field.metadata,
+            )
+            new_fields.append(new_field)
+        # Handle large lists with nested types
+        elif isinstance(field_type, pa.LargeListType):
+            new_field = pa.field(
+                name=field.name,
+                type=pa.list_(field_type.value_type),
+                nullable=field.nullable,
+                metadata=field.metadata,
+            )
+            new_fields.append(new_field)
+        else:
+            new_fields.append(field)
+
+    return pa.schema(new_fields)
 
 
 def convert_timestamp(
@@ -449,7 +485,7 @@ def repair_schema(
     verbose: bool = True,
     ts_unit: str | None = None,  # "us",
     tz: str | None = None,
-    use_large_string: bool = False,
+    # use_large_types: bool = False,
     # sort: bool | list[str] = False,
     alter_schema: bool = True,
     **kwargs,
@@ -466,7 +502,6 @@ def repair_schema(
         verbose (bool, optional): Wheter to show the task progress using tqdm or not. Defaults to True.
         ts_unit (str|None): timestamp unit.
        tz (str|None): timezone for timestamp fields. Defaults to "UTC".
-        use_large_string (bool): Convert pyarrow.large_string() to pyarrow.string().
        **kwargs: Additional keyword arguments for pyarrow.parquet.write_table.
     """
     if files is None:
@@ -501,8 +536,8 @@ def repair_schema(
     if ts_unit is not None or tz is not None:
         schema = convert_timestamp(schema, unit=ts_unit, tz=tz)
 
-    if not use_large_string:
-        schema = shrink_large_string(schema)
+    # if not use_large_types:
+    schema = convert_large_types_to_normal(schema)
 
     schemas_equal = all([schema == schemas_ for schemas_ in file_schemas.values()])