Allow writing pa.Table that are either a subset of table schema or in arbitrary order, and support type promotion on write (#921)
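For context, a minimal usage sketch of what this change enables. The catalog, namespace, table, and column names below are illustrative only and not taken from the PR:

```python
# Hypothetical example: the dataframe's columns arrive in a different order than the
# table schema, the optional "comment" column is omitted entirely, and "id" arrives as
# int32 even though the table column is a long.
import pyarrow as pa

from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField, StringType

catalog = load_catalog("default")
table = catalog.create_table(
    "db.events",
    schema=Schema(
        NestedField(field_id=1, name="id", field_type=LongType(), required=False),
        NestedField(field_id=2, name="name", field_type=StringType(), required=False),
        NestedField(field_id=3, name="comment", field_type=StringType(), required=False),
    ),
)

df = pa.table({
    "name": pa.array(["a", "b"], type=pa.string()),
    "id": pa.array([1, 2], type=pa.int32()),
})

# Before this PR the schema check rejected the reordered / subset / narrower-typed
# dataframe; with it, the missing optional column is back-filled with nulls and the
# int32 column is promoted to long on write.
table.append(df)
```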
@@ -120,6 +120,7 @@
     Schema,
     SchemaVisitorPerPrimitiveType,
     SchemaWithPartnerVisitor,
+    _check_schema_compatible,
     pre_order_visit,
     promote,
     prune_columns,
@@ -1450,14 +1451,17 @@ def field_partner(self, partner_struct: Optional[pa.Array], field_id: int, _: st
             except ValueError:
                 return None

-            if isinstance(partner_struct, pa.StructArray):
-                return partner_struct.field(name)
-            elif isinstance(partner_struct, pa.Table):
-                return partner_struct.column(name).combine_chunks()
-            elif isinstance(partner_struct, pa.RecordBatch):
-                return partner_struct.column(name)
-            else:
-                raise ValueError(f"Cannot find {name} in expected partner_struct type {type(partner_struct)}")
+            try:
+                if isinstance(partner_struct, pa.StructArray):
+                    return partner_struct.field(name)
+                elif isinstance(partner_struct, pa.Table):
+                    return partner_struct.column(name).combine_chunks()
+                elif isinstance(partner_struct, pa.RecordBatch):
+                    return partner_struct.column(name)
+                else:
+                    raise ValueError(f"Cannot find {name} in expected partner_struct type {type(partner_struct)}")
+            except KeyError:
+                return None

         return None
@@ -1998,8 +2002,7 @@ def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteT
     )

     def write_parquet(task: WriteTask) -> DataFile:
-        table_schema = task.schema
-
+        table_schema = table_metadata.schema()
         # if schema needs to be transformed, use the transformed schema and adjust the arrow table accordingly
         # otherwise use the original schema
         if (sanitized_schema := sanitize_column_names(table_schema)) != table_schema:
@@ -2011,7 +2014,7 @@ def write_parquet(task: WriteTask) -> DataFile:
         batches = [
             _to_requested_schema(
                 requested_schema=file_schema,
-                file_schema=table_schema,
+                file_schema=task.schema,
                 batch=batch,
                 downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us,
                 include_field_ids=True,
@@ -2070,47 +2073,30 @@ def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[List[
     return bin_packed_record_batches


-def _check_schema_compatible(table_schema: Schema, other_schema: pa.Schema, downcast_ns_timestamp_to_us: bool = False) -> None:
+def _check_pyarrow_schema_compatible(
+    requested_schema: Schema, provided_schema: pa.Schema, downcast_ns_timestamp_to_us: bool = False
+) -> None:
     """
-    Check if the `table_schema` is compatible with `other_schema`.
+    Check if the `requested_schema` is compatible with `provided_schema`.

     Two schemas are considered compatible when they are equal in terms of the Iceberg Schema type.

     Raises:
         ValueError: If the schemas are not compatible.
     """
-    name_mapping = table_schema.name_mapping
+    name_mapping = requested_schema.name_mapping
     try:
-        task_schema = pyarrow_to_schema(
-            other_schema, name_mapping=name_mapping, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us
+        provided_schema = pyarrow_to_schema(
+            provided_schema, name_mapping=name_mapping, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us
         )
     except ValueError as e:
-        other_schema = _pyarrow_to_schema_without_ids(other_schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us)
-        additional_names = set(other_schema.column_names) - set(table_schema.column_names)
+        provided_schema = _pyarrow_to_schema_without_ids(provided_schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us)
+        additional_names = provided_schema.field_names - requested_schema.field_names
         raise ValueError(
             f"PyArrow table contains more columns: {', '.join(sorted(additional_names))}. Update the schema first (hint, use union_by_name)."
         ) from e

-    if table_schema.as_struct() != task_schema.as_struct():
-        from rich.console import Console
-        from rich.table import Table as RichTable
-
-        console = Console(record=True)
-
-        rich_table = RichTable(show_header=True, header_style="bold")
-        rich_table.add_column("")
-        rich_table.add_column("Table field")
-        rich_table.add_column("Dataframe field")
-
-        for lhs in table_schema.fields:
-            try:
-                rhs = task_schema.find_field(lhs.field_id)
-                rich_table.add_row("✅" if lhs == rhs else "❌", str(lhs), str(rhs))
-            except ValueError:
-                rich_table.add_row("❌", str(lhs), "Missing")
-
-        console.print(rich_table)
-        raise ValueError(f"Mismatch in fields:\n{console.export_text()}")
+    _check_schema_compatible(requested_schema, provided_schema)


 def parquet_files_to_data_files(io: FileIO, table_metadata: TableMetadata, file_paths: Iterator[str]) -> Iterator[DataFile]:
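As an aside, the `union_by_name` hint in the error message above refers to evolving the table schema before retrying the write. A rough sketch follows; the identifiers are hypothetical, and whether `union_by_name` accepts a pyarrow schema directly (rather than an Iceberg `Schema`) depends on the pyiceberg version in use:

```python
# Hypothetical recovery path for the "PyArrow table contains more columns" error:
# evolve the table schema so the extra dataframe columns exist, then retry the append.
import pyarrow as pa

from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")
table = catalog.load_table("db.events")

df = pa.table({"id": pa.array([1, 2], type=pa.int64()), "new_col": pa.array(["x", "y"])})

with table.update_schema() as update:
    # union_by_name adds any fields missing from the current table schema.
    update.union_by_name(df.schema)

table.append(df)
```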
@@ -2124,7 +2110,7 @@ def parquet_files_to_data_files(io: FileIO, table_metadata: TableMetadata, file_
                 f"Cannot add file {file_path} because it has field IDs. `add_files` only supports addition of files without field_ids"
             )
         schema = table_metadata.schema()
-        _check_schema_compatible(schema, parquet_metadata.schema.to_arrow_schema())
+        _check_pyarrow_schema_compatible(schema, parquet_metadata.schema.to_arrow_schema())

         statistics = data_file_statistics_from_parquet_metadata(
             parquet_metadata=parquet_metadata,
@@ -2205,7 +2191,7 @@ def _dataframe_to_data_files(
     Returns:
         An iterable that supplies datafiles that represent the table.
     """
-    from pyiceberg.table import PropertyUtil, TableProperties, WriteTask
+    from pyiceberg.table import DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE, PropertyUtil, TableProperties, WriteTask

     counter = counter or itertools.count(0)
     write_uuid = write_uuid or uuid.uuid4()
@@ -2214,13 +2200,16 @@ def _dataframe_to_data_files(
         property_name=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES,
         default=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT,
     )
+    name_mapping = table_metadata.schema().name_mapping
+    downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
+    task_schema = pyarrow_to_schema(df.schema, name_mapping=name_mapping, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us)

Review thread on the new `name_mapping` line:
- Shall we use table's […]
- Good question - I'm not sure actually. When we are writing a dataframe into an Iceberg table, I think we are making the assumption that its names match the current names of the Iceberg table, so I think using the […] I'm curious to hear what others' thoughts are, and whether anyone has a workflow in mind that would benefit from this change!
- Sounds great! I initially raised this because we're assigning field IDs for the input dataframe, which aligns with the general purpose of name mapping - to provide fallback IDs. On second thought, schema.name-mapping.default is more for the read side, so using it here may silently introduce unwanted side effects during write. I agree, let's hold off on this for a while and wait for more discussions.
- Sounds great 👍 thank you for the review!

Review thread on the new `task_schema` line:
- This change is necessary to ensure that we are comparing the Schema that matches the arrow table's schema versus the Table Schema in order to properly invoke […]
- Great catch 👍

     if table_metadata.spec().is_unpartitioned():
         yield from write_file(
             io=io,
             table_metadata=table_metadata,
             tasks=iter([
-                WriteTask(write_uuid=write_uuid, task_id=next(counter), record_batches=batches, schema=table_metadata.schema())
+                WriteTask(write_uuid=write_uuid, task_id=next(counter), record_batches=batches, schema=task_schema)
                 for batches in bin_pack_arrow_table(df, target_file_size)
             ]),
         )

@@ -2235,7 +2224,7 @@ def _dataframe_to_data_files(
                     task_id=next(counter),
                     record_batches=batches,
                     partition_key=partition.partition_key,
-                    schema=table_metadata.schema(),
+                    schema=task_schema,
                 )
                 for partition in partitions
                 for batches in bin_pack_arrow_table(partition.arrow_table_partition, target_file_size)
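To make the `task_schema` derivation above concrete, here is a small hedged sketch (field names are illustrative):

```python
# Illustrative only: field IDs for the incoming arrow data are taken from the table's
# name mapping, so the resulting Iceberg schema mirrors the dataframe's actual types
# (e.g. int32) rather than the table's declared types.
import pyarrow as pa

from pyiceberg.io.pyarrow import pyarrow_to_schema
from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField

table_schema = Schema(NestedField(field_id=1, name="id", field_type=LongType(), required=True))
df_schema = pa.schema([pa.field("id", pa.int32(), nullable=False)])

task_schema = pyarrow_to_schema(df_schema, name_mapping=table_schema.name_mapping)
print(task_schema)  # field 1 "id" typed as int, matching the data rather than the table
```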
@@ -324,6 +324,11 @@ def field_ids(self) -> Set[int]:
         """Return the IDs of the current schema."""
         return set(self._name_to_id.values())

+    @property
+    def field_names(self) -> Set[str]:
+        """Return the names of the current schema."""
+        return set(self._name_to_id.keys())
+
     def _validate_identifier_field(self, field_id: int) -> None:
         """Validate that the field with the given ID is a valid identifier field.
@@ -1616,3 +1621,145 @@ def _(file_type: FixedType, read_type: IcebergType) -> IcebergType:
         return read_type
     else:
         raise ResolveError(f"Cannot promote {file_type} to {read_type}")
+
+
+def _check_schema_compatible(requested_schema: Schema, provided_schema: Schema) -> None:
+    """
+    Check if the `provided_schema` is compatible with `requested_schema`.
+
+    Both schemas must have valid IDs and share the same ID for the same field names.
+
+    Two schemas are considered compatible when:
+    1. All `required` fields in `requested_schema` are present and are also `required` in the `provided_schema`
+    2. Field Types are consistent for fields that are present in both schemas. I.e. the field type
+       in the `provided_schema` can be promoted to the field type of the same field ID in `requested_schema`
+
+    Raises:
+        ValueError: If the schemas are not compatible.
+    """
+    visit(requested_schema, _SchemaCompatibilityVisitor(provided_schema))
+
+    # from rich.console import Console
+    # from rich.table import Table as RichTable
+
+    # console = Console(record=True)
+
+    # rich_table = RichTable(show_header=True, header_style="bold")
+    # rich_table.add_column("")
+    # rich_table.add_column("Table field")
+    # rich_table.add_column("Dataframe field")
+
+    # is_compatible = True
+
+    # for field_id in requested_schema.field_ids:
+    #     lhs = requested_schema.find_field(field_id)
+    #     try:
+    #         rhs = provided_schema.find_field(field_id)
+    #     except ValueError:
+    #         if lhs.required:
+    #             rich_table.add_row("❌", str(lhs), "Missing")
+    #             is_compatible = False
+    #         else:
+    #             rich_table.add_row("✅", str(lhs), "Missing")
+    #         continue
+
+    #     if lhs.required and not rhs.required:
+    #         rich_table.add_row("❌", str(lhs), "Missing")
+    #         is_compatible = False
+
+    #     if lhs.field_type == rhs.field_type:
+    #         rich_table.add_row("✅", str(lhs), str(rhs))
+    #         continue
+    #     elif any(
+    #         (isinstance(lhs.field_type, container_type) and isinstance(rhs.field_type, container_type))
+    #         for container_type in {StructType, MapType, ListType}
+    #     ):
+    #         rich_table.add_row("✅", str(lhs), str(rhs))
+    #         continue
+    #     else:
+    #         try:
+    #             promote(rhs.field_type, lhs.field_type)
+    #             rich_table.add_row("✅", str(lhs), str(rhs))
+    #         except ResolveError:
+    #             rich_table.add_row("❌", str(lhs), str(rhs))
+    #             is_compatible = False
+
+    # if not is_compatible:
+    #     console.print(rich_table)
+    #     raise ValueError(f"Mismatch in fields:\n{console.export_text()}")
+
+
+class _SchemaCompatibilityVisitor(SchemaVisitor[bool]):
+    provided_schema: Schema
+
+    def __init__(self, provided_schema: Schema):
+        from rich.console import Console
+        from rich.table import Table as RichTable
+
+        self.provided_schema = provided_schema
+        self.rich_table = RichTable(show_header=True, header_style="bold")
+        self.rich_table.add_column("")
+        self.rich_table.add_column("Table field")
+        self.rich_table.add_column("Dataframe field")
+        self.console = Console(record=True)
+
+    def _is_field_compatible(self, lhs: NestedField) -> bool:
+        # Check required field exists as required field first
+        try:
+            rhs = self.provided_schema.find_field(lhs.field_id)
+        except ValueError:
+            if lhs.required:
+                self.rich_table.add_row("❌", str(lhs), "Missing")
+                return False
+            else:
+                self.rich_table.add_row("✅", str(lhs), "Missing")
+                return True
+
+        if lhs.required and not rhs.required:
+            self.rich_table.add_row("❌", str(lhs), "Missing")
+            return False
+
+        # Check type compatibility
+        if lhs.field_type == rhs.field_type:
+            self.rich_table.add_row("✅", str(lhs), str(rhs))
+            return True
+        elif any(
+            (isinstance(lhs.field_type, container_type) and isinstance(rhs.field_type, container_type))
+            for container_type in {StructType, MapType, ListType}
+        ):
+            self.rich_table.add_row("✅", str(lhs), str(rhs))
+            return True
+        else:
+            try:
+                promote(rhs.field_type, lhs.field_type)
+                self.rich_table.add_row("✅", str(lhs), str(rhs))
+                return True
+            except ResolveError:
+                self.rich_table.add_row("❌", str(lhs), str(rhs))
+                return False
+
+    def schema(self, schema: Schema, struct_result: bool) -> bool:
+        if not struct_result:
+            self.console.print(self.rich_table)
+            raise ValueError(f"Mismatch in fields:\n{self.console.export_text()}")
+        return struct_result
+
+    def struct(self, struct: StructType, field_results: List[bool]) -> bool:
+        return all(field_results)
+
+    def field(self, field: NestedField, field_result: bool) -> bool:
+        return all([self._is_field_compatible(field), field_result])
+
+    def list(self, list_type: ListType, element_result: bool) -> bool:
+        return element_result and self._is_field_compatible(list_type.element_field)
+
+    def map(self, map_type: MapType, key_result: bool, value_result: bool) -> bool:
+        return all([
+            self._is_field_compatible(map_type.key_field),
+            self._is_field_compatible(map_type.value_field),
+            key_result,
+            value_result,
+        ])
+
+    def primitive(self, primitive: PrimitiveType) -> bool:
+        return True

Review thread on the `_SchemaCompatibilityVisitor` class:
- Hi @Fokko here's my take on using a Visitor as we've discussed - unfortunately the RichTable doesn't print in order with the nested fields because a Visitor doesn't traverse in pre-order sequence. Should we use a PreOrderSchemaVisitor here instead to ensure that the fields are logged in order of IDs?
- Yes, I think pre-order makes sense here 👍

Review thread on the `promote(rhs.field_type, lhs.field_type)` call:
- This succeeds for […]
- Do you have a test to reproduce this? This is interesting since for Python […]
- I'll write up a test 👍 The comparison isn't between Python types, but between Parquet physical types: https://github.com/apache/iceberg-python/blob/main/pyiceberg/io/pyarrow.py#L1503-L1507
- Maybe we could get away with just removing this check, since we are running a comprehensive type compatibility check already? (iceberg-python/pyiceberg/io/pyarrow.py, lines 1550 to 1554 in e27cd90)
- Test added that demonstrates this issue: https://github.com/apache/iceberg-python/pull/921/files#diff-8ca7e967a2c2ef394c75f707879f1b7e6d09226c321643140b9325f742041d67R669-R713
- Confirmed that this would work @Fokko let me know if we are good to move forward with this change!
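A hedged example of how the new check is expected to behave, with made-up field names for illustration:

```python
# Sketch only: an int field can be promoted to the requested long field, and a missing
# optional field is tolerated, so this call should return without raising. A missing
# required field, or a type that cannot be promoted, raises ValueError with the rich
# comparison table shown in the tests below.
from pyiceberg.schema import Schema, _check_schema_compatible
from pyiceberg.types import IntegerType, LongType, NestedField, StringType

requested = Schema(
    NestedField(field_id=1, name="id", field_type=LongType(), required=True),
    NestedField(field_id=2, name="comment", field_type=StringType(), required=False),
)
provided = Schema(
    NestedField(field_id=1, name="id", field_type=IntegerType(), required=True),
)

_check_schema_compatible(requested, provided)  # passes: int -> long promotion, optional field missing
```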
@@ -501,14 +501,11 @@ def test_add_files_fails_on_schema_mismatch(spark: SparkSession, session_catalog
     )

     expected = """Mismatch in fields:
-┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓
-┃    ┃ Table field              ┃ Dataframe field          ┃
-┡━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩
-│ ✅ │ 1: foo: optional boolean │ 1: foo: optional boolean │
-│ ✅ │ 2: bar: optional string  │ 2: bar: optional string  │
-│ ❌ │ 3: baz: optional int     │ 3: baz: optional string  │
-│ ✅ │ 4: qux: optional date    │ 4: qux: optional date    │
-└────┴──────────────────────────┴──────────────────────────┘
+┏━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
+┃ Field Name ┃ Category ┃ Table field  ┃ Dataframe field ┃
+┡━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
+│ baz        │ Type     │ optional int │ optional string │
+└────────────┴──────────┴──────────────┴─────────────────┘
 """

     with pytest.raises(ValueError, match=expected):

Review thread on the new expected output:
- Is it just me, or is the left easier to read? 😅
- I opted for this approach because I wanted to group the Extra Fields in the dataframe also into the table. But if we are taking the approach of using the […]
- I like the new way since it tells me exactly which field to focus on and the reason it's not compatible.
Review thread on the `field_partner` change:
- This change is necessary to support writing dataframes / record batches with a subset of the schema. Otherwise, the ArrowAccessor throws a KeyError. This way, we return None and the ArrowProjectionVisitor is responsible for checking if the field is nullable, so it can be filled in with a null array.
- Is this change responsible for schema projection / writing a subset of the schema? Do you mind expanding on the mechanism behind how this works? I'm curious.
- Yes, that's right - the ArrowProjectionVisitor is responsible for detecting that the field_partner is None and then checking if the table field is also optional before filling it in with a null array. This change is necessary so that the ArrowAccessor doesn't throw an exception if the field can't be found in the arrow component, and it enables ArrowProjectionVisitor to make use of a code pathway it wasn't able to use before: iceberg-python/pyiceberg/io/pyarrow.py, lines 1388 to 1395 in b11cdb5.
- Above we have the file_schema that should correspond with the partner_struct. I expect that when looking up the field-id, it should already return None.
- Yeah, as I pointed out in this comment: #921 (comment), I think write_parquet is using the Table Schema instead of the Schema corresponding to the data types of the PyArrow construct. I will take that to mean that this isn't intended, and that making sure we use the Schema corresponding to the data types of the PyArrow construct is what we intend to do here.
- Thanks for the context. This isn't intended; the schema should align with the data. I checked against the last commit, and it doesn't throw the KeyError anymore because of your fix. Thanks 👍
- Thank you for the suggestion - I've removed this try/except block in the latest update.
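To summarize the mechanism discussed in this thread, here is a simplified, hypothetical sketch (not the actual ArrowProjectionVisitor code) of the null back-fill pattern that becomes reachable once the accessor returns None instead of raising KeyError:

```python
# Hypothetical simplification of the projection behavior described above.
import pyarrow as pa


def project_column(batch: pa.RecordBatch, name: str, arrow_type: pa.DataType, required: bool) -> pa.Array:
    """Return the named column, back-filling an absent optional column with nulls."""
    if name in batch.schema.names:
        return batch.column(name)
    if required:
        # A required field with no partner column is a real error.
        raise ValueError(f"Missing required field: {name}")
    # Optional field missing from the written dataframe: fill with nulls.
    return pa.nulls(batch.num_rows, type=arrow_type)


batch = pa.RecordBatch.from_pydict({"id": pa.array([1, 2], type=pa.int64())})
print(project_column(batch, "comment", pa.string(), required=False))  # two nulls
```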