Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion pyiceberg/table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,7 +593,9 @@ def delete(self, delete_filter: Union[str, BooleanExpression], snapshot_properti
filtered_df = df.filter(preserve_row_filter)

# Only rewrite if there are records being deleted
if len(df) != len(filtered_df):
if len(filtered_df) == 0:
replaced_files.append((original_file.file, []))
elif len(df) != len(filtered_df):
Comment on lines +596 to +598
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: is it more readable if inlined?

if filtered_df and len(df) != len(filtered_df):

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be honest, I don't see much of a difference.

replaced_files.append((
original_file.file,
list(
Expand Down
27 changes: 26 additions & 1 deletion tests/integration/test_writes/test_writes.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,13 @@
from pyiceberg.catalog.rest import RestCatalog
from pyiceberg.catalog.sql import SqlCatalog
from pyiceberg.exceptions import NoSuchTableError
from pyiceberg.expressions import In
from pyiceberg.io.pyarrow import _dataframe_to_data_files
from pyiceberg.partitioning import PartitionField, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.table import TableProperties
from pyiceberg.transforms import IdentityTransform
from pyiceberg.types import IntegerType, LongType, NestedField
from pyiceberg.types import IntegerType, LongType, NestedField, StringType
from utils import _create_table


Expand Down Expand Up @@ -1309,3 +1310,27 @@ def test_table_v1_with_null_nested_namespace(session_catalog: Catalog, arrow_tab

# We expect no error here
session_catalog.drop_table(identifier)


@pytest.mark.integration
def test_overwrite_all_data_with_filter(session_catalog: Catalog) -> None:
schema = Schema(
NestedField(1, "id", StringType(), required=True),
NestedField(2, "name", StringType(), required=False),
identifier_field_ids=[1],
)

data = pa.Table.from_pylist(
[
{"id": "1", "name": "Amsterdam"},
{"id": "2", "name": "San Francisco"},
{"id": "3", "name": "Drachten"},
],
schema=schema.as_arrow(),
)

identifier = "default.test_overwrite_all_data_with_filter"
tbl = _create_table(session_catalog, identifier, data=[data], schema=schema)
tbl.overwrite(data, In("id", ["1", "2", "3"]))

assert len(tbl.scan().to_arrow()) == 3
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: since all data match the filter, the overwrite operation is a no-op, right? if so, can we assert that in the test? maybe show that the files are the same

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not a no-op, it's deleting the whole file. The change is in the delete method, not in the overwrite method.

I believe that testing the behavior is enough.

Copy link
Contributor

@kevinjqliu kevinjqliu Aug 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah, I see. The change is to make delete a no-op.
Sequence of operation

  • pass in `overwrite_filter which matches the entire table
  • in delete, the overwrite_filter is inversed, preserve_row_filter
  • use preserve_row_filter on data files.
  • if the result is empty, then we don't include this data file in deletion

Previously, we end up trying to write an empty data file.

Copy link
Contributor

@Fokko Fokko Aug 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes exactly. One thing to note is that it would be even more correct to add this to a DELETE snapshot, it is not replaced, but just dropped. Please note that most engines just use OVERWRITE.