1919 Any ,
2020 Dict ,
2121 List ,
22+ Mapping ,
2223 Optional ,
2324)
2425
2526from pydantic import Field , PrivateAttr , model_serializer
2627
2728from pyiceberg .io import FileIO
28- from pyiceberg .manifest import ManifestFile , read_manifest_list
29+ from pyiceberg .manifest import DataFile , DataFileContent , ManifestFile , read_manifest_list
2930from pyiceberg .typedef import IcebergBaseModel
3031
32+ ADDED_DATA_FILES = 'added-data-files'
33+ ADDED_DELETE_FILES = 'added-delete-files'
34+ ADDED_EQUALITY_DELETES = 'added-equality-deletes'
35+ ADDED_FILE_SIZE = 'added-files-size'
36+ ADDED_POSITION_DELETES = 'added-position-deletes'
37+ ADDED_POSITION_DELETE_FILES = 'added-position-delete-files'
38+ ADDED_RECORDS = 'added-records'
39+ DELETED_DATA_FILES = 'deleted-data-files'
40+ DELETED_RECORDS = 'deleted-records'
41+ ADDED_EQUALITY_DELETE_FILES = 'added-equality-delete-files'
42+ REMOVED_DELETE_FILES = 'removed-delete-files'
43+ REMOVED_EQUALITY_DELETES = 'removed-equality-deletes'
44+ REMOVED_EQUALITY_DELETE_FILES = 'removed-equality-delete-files'
45+ REMOVED_FILE_SIZE = 'removed-files-size'
46+ REMOVED_POSITION_DELETES = 'removed-position-deletes'
47+ REMOVED_POSITION_DELETE_FILES = 'removed-position-delete-files'
48+ TOTAL_EQUALITY_DELETES = 'total-equality-deletes'
49+ TOTAL_POSITION_DELETES = 'total-position-deletes'
50+ TOTAL_DATA_FILES = 'total-data-files'
51+ TOTAL_DELETE_FILES = 'total-delete-files'
52+ TOTAL_RECORDS = 'total-records'
53+ TOTAL_FILE_SIZE = 'total-files-size'
54+
55+
3156OPERATION = "operation"
3257
3358
@@ -51,7 +76,7 @@ def __repr__(self) -> str:
5176 return f"Operation.{ self .name } "
5277
5378
54- class Summary (IcebergBaseModel ):
79+ class Summary (IcebergBaseModel , Mapping [ str , str ] ):
5580 """A class that stores the summary information for a Snapshot.
5681
5782 The snapshot summary’s operation field is used by some operations,
@@ -65,6 +90,25 @@ def __init__(self, operation: Operation, **data: Any) -> None:
6590 super ().__init__ (operation = operation , ** data )
6691 self ._additional_properties = data
6792
93+ def __getitem__ (self , __key : str ) -> Optional [Any ]: # type: ignore
94+ """Return a key as it is a map."""
95+ if __key .lower () == 'operation' :
96+ return self .operation
97+ else :
98+ return self ._additional_properties .get (__key )
99+
100+ def __setitem__ (self , key : str , value : Any ) -> None :
101+ """Set a key as it is a map."""
102+ if key .lower () == 'operation' :
103+ self .operation = value
104+ else :
105+ self ._additional_properties [key ] = value
106+
107+ def __len__ (self ) -> int :
108+ """Return the number of keys in the summary."""
109+ # Operation is required
110+ return 1 + len (self ._additional_properties )
111+
68112 @model_serializer
69113 def ser_model (self ) -> Dict [str , str ]:
70114 return {
@@ -81,6 +125,14 @@ def __repr__(self) -> str:
81125 repr_properties = f", **{ repr (self ._additional_properties )} " if self ._additional_properties else ""
82126 return f"Summary({ repr (self .operation )} { repr_properties } )"
83127
128+ def __eq__ (self , other : Any ) -> bool :
129+ """Compare if the summary is equal to another summary."""
130+ return (
131+ self .operation == other .operation and self .additional_properties == other .additional_properties
132+ if isinstance (other , Summary )
133+ else False
134+ )
135+
84136
85137class Snapshot (IcebergBaseModel ):
86138 snapshot_id : int = Field (alias = "snapshot-id" )
@@ -116,3 +168,193 @@ class MetadataLogEntry(IcebergBaseModel):
116168class SnapshotLogEntry (IcebergBaseModel ):
117169 snapshot_id : int = Field (alias = "snapshot-id" )
118170 timestamp_ms : int = Field (alias = "timestamp-ms" )
171+
172+
173+ class SnapshotSummaryCollector :
174+ added_file_size : int
175+ removed_file_size : int
176+ added_data_files : int
177+ removed_data_files : int
178+ added_eq_delete_files : int
179+ removed_eq_delete_files : int
180+ added_pos_delete_files : int
181+ removed_pos_delete_files : int
182+ added_delete_files : int
183+ removed_delete_files : int
184+ added_records : int
185+ deleted_records : int
186+ added_pos_deletes : int
187+ removed_pos_deletes : int
188+ added_eq_deletes : int
189+ removed_eq_deletes : int
190+
191+ def __init__ (self ) -> None :
192+ self .added_file_size = 0
193+ self .removed_file_size = 0
194+ self .added_data_files = 0
195+ self .removed_data_files = 0
196+ self .added_eq_delete_files = 0
197+ self .removed_eq_delete_files = 0
198+ self .added_pos_delete_files = 0
199+ self .removed_pos_delete_files = 0
200+ self .added_delete_files = 0
201+ self .removed_delete_files = 0
202+ self .added_records = 0
203+ self .deleted_records = 0
204+ self .added_pos_deletes = 0
205+ self .removed_pos_deletes = 0
206+ self .added_eq_deletes = 0
207+ self .removed_eq_deletes = 0
208+
209+ def add_file (self , data_file : DataFile ) -> None :
210+ self .added_file_size += data_file .file_size_in_bytes
211+
212+ if data_file .content == DataFileContent .DATA :
213+ self .added_data_files += 1
214+ self .added_records += data_file .record_count
215+ elif data_file .content == DataFileContent .POSITION_DELETES :
216+ self .added_delete_files += 1
217+ self .added_pos_delete_files += 1
218+ self .added_pos_deletes += data_file .record_count
219+ elif data_file .content == DataFileContent .EQUALITY_DELETES :
220+ self .added_delete_files += 1
221+ self .added_eq_delete_files += 1
222+ self .added_eq_deletes += data_file .record_count
223+ else :
224+ raise ValueError (f"Unknown data file content: { data_file .content } " )
225+
226+ def remove_file (self , data_file : DataFile ) -> None :
227+ self .removed_file_size += data_file .file_size_in_bytes
228+
229+ if data_file .content == DataFileContent .DATA :
230+ self .removed_data_files += 1
231+ self .deleted_records += data_file .record_count
232+ elif data_file .content == DataFileContent .POSITION_DELETES :
233+ self .removed_delete_files += 1
234+ self .removed_pos_delete_files += 1
235+ self .removed_pos_deletes += data_file .record_count
236+ elif data_file .content == DataFileContent .EQUALITY_DELETES :
237+ self .removed_delete_files += 1
238+ self .removed_eq_delete_files += 1
239+ self .removed_eq_deletes += data_file .record_count
240+ else :
241+ raise ValueError (f"Unknown data file content: { data_file .content } " )
242+
243+ def build (self ) -> Dict [str , str ]:
244+ def set_when_positive (properties : Dict [str , str ], num : int , property_name : str ) -> None :
245+ if num > 0 :
246+ properties [property_name ] = str (num )
247+
248+ properties : Dict [str , str ] = {}
249+ set_when_positive (properties , self .added_file_size , ADDED_FILE_SIZE )
250+ set_when_positive (properties , self .removed_file_size , REMOVED_FILE_SIZE )
251+ set_when_positive (properties , self .added_data_files , ADDED_DATA_FILES )
252+ set_when_positive (properties , self .removed_data_files , DELETED_DATA_FILES )
253+ set_when_positive (properties , self .added_eq_delete_files , ADDED_EQUALITY_DELETE_FILES )
254+ set_when_positive (properties , self .removed_eq_delete_files , REMOVED_EQUALITY_DELETE_FILES )
255+ set_when_positive (properties , self .added_pos_delete_files , ADDED_POSITION_DELETE_FILES )
256+ set_when_positive (properties , self .removed_pos_delete_files , REMOVED_POSITION_DELETE_FILES )
257+ set_when_positive (properties , self .added_delete_files , ADDED_DELETE_FILES )
258+ set_when_positive (properties , self .removed_delete_files , REMOVED_DELETE_FILES )
259+ set_when_positive (properties , self .added_records , ADDED_RECORDS )
260+ set_when_positive (properties , self .deleted_records , DELETED_RECORDS )
261+ set_when_positive (properties , self .added_pos_deletes , ADDED_POSITION_DELETES )
262+ set_when_positive (properties , self .removed_pos_deletes , REMOVED_POSITION_DELETES )
263+ set_when_positive (properties , self .added_eq_deletes , ADDED_EQUALITY_DELETES )
264+ set_when_positive (properties , self .removed_eq_deletes , REMOVED_EQUALITY_DELETES )
265+
266+ return properties
267+
268+
269+ def _truncate_table_summary (summary : Summary , previous_summary : Mapping [str , str ]) -> Summary :
270+ for prop in {
271+ TOTAL_DATA_FILES ,
272+ TOTAL_DELETE_FILES ,
273+ TOTAL_RECORDS ,
274+ TOTAL_FILE_SIZE ,
275+ TOTAL_POSITION_DELETES ,
276+ TOTAL_EQUALITY_DELETES ,
277+ }:
278+ summary [prop ] = '0'
279+
280+ if value := previous_summary .get (TOTAL_DATA_FILES ):
281+ summary [DELETED_DATA_FILES ] = value
282+ if value := previous_summary .get (TOTAL_DELETE_FILES ):
283+ summary [REMOVED_DELETE_FILES ] = value
284+ if value := previous_summary .get (TOTAL_RECORDS ):
285+ summary [DELETED_RECORDS ] = value
286+ if value := previous_summary .get (TOTAL_FILE_SIZE ):
287+ summary [REMOVED_FILE_SIZE ] = value
288+ if value := previous_summary .get (TOTAL_POSITION_DELETES ):
289+ summary [REMOVED_POSITION_DELETES ] = value
290+ if value := previous_summary .get (TOTAL_EQUALITY_DELETES ):
291+ summary [REMOVED_EQUALITY_DELETES ] = value
292+
293+ return summary
294+
295+
296+ def _update_snapshot_summaries (
297+ summary : Summary , previous_summary : Optional [Mapping [str , str ]] = None , truncate_full_table : bool = False
298+ ) -> Summary :
299+ if summary .operation not in {Operation .APPEND , Operation .OVERWRITE }:
300+ raise ValueError (f"Operation not implemented: { summary .operation } " )
301+
302+ if truncate_full_table and summary .operation == Operation .OVERWRITE and previous_summary is not None :
303+ summary = _truncate_table_summary (summary , previous_summary )
304+
305+ if not previous_summary :
306+ previous_summary = {
307+ TOTAL_DATA_FILES : '0' ,
308+ TOTAL_DELETE_FILES : '0' ,
309+ TOTAL_RECORDS : '0' ,
310+ TOTAL_FILE_SIZE : '0' ,
311+ TOTAL_POSITION_DELETES : '0' ,
312+ TOTAL_EQUALITY_DELETES : '0' ,
313+ }
314+
315+ def _update_totals (total_property : str , added_property : str , removed_property : str ) -> None :
316+ if previous_total_str := previous_summary .get (total_property ):
317+ try :
318+ new_total = int (previous_total_str )
319+ if new_total >= 0 and (added := summary .get (added_property )):
320+ new_total += int (added )
321+ if new_total >= 0 and (removed := summary .get (removed_property )):
322+ new_total -= int (removed )
323+ except ValueError as e :
324+ raise ValueError (f"Could not parse summary property { total_property } to an int: { previous_total_str } " ) from e
325+
326+ if new_total >= 0 :
327+ summary [total_property ] = str (new_total )
328+
329+ _update_totals (
330+ total_property = TOTAL_DATA_FILES ,
331+ added_property = ADDED_DATA_FILES ,
332+ removed_property = DELETED_DATA_FILES ,
333+ )
334+ _update_totals (
335+ total_property = TOTAL_DELETE_FILES ,
336+ added_property = ADDED_DELETE_FILES ,
337+ removed_property = REMOVED_DELETE_FILES ,
338+ )
339+ _update_totals (
340+ total_property = TOTAL_RECORDS ,
341+ added_property = ADDED_RECORDS ,
342+ removed_property = DELETED_RECORDS ,
343+ )
344+ _update_totals (
345+ total_property = TOTAL_FILE_SIZE ,
346+ added_property = ADDED_FILE_SIZE ,
347+ removed_property = REMOVED_FILE_SIZE ,
348+ )
349+ _update_totals (
350+ total_property = TOTAL_POSITION_DELETES ,
351+ added_property = ADDED_POSITION_DELETES ,
352+ removed_property = REMOVED_POSITION_DELETES ,
353+ )
354+ _update_totals (
355+ total_property = TOTAL_EQUALITY_DELETES ,
356+ added_property = ADDED_EQUALITY_DELETES ,
357+ removed_property = REMOVED_EQUALITY_DELETES ,
358+ )
359+
360+ return summary
0 commit comments