diff --git a/CHANGELOG.md b/CHANGELOG.md
index e31a3ac6f..5436bc1db 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,11 @@
 ## Release notes
 
 ### 0.13.6 -- Jun 13, 2022
-* Add - unified package level logger for package (#667) PR #1031
-* Update - swap various datajoint messages, warnings, etc. to use the new logger. (#667) PR #1031
+* Add - Config option to set a size threshold above which filepath checksums are skipped. PR #1025
+* Add - Unified package-level logger for the package (#667) PR #1031
+* Update - Swap various DataJoint messages, warnings, etc. to use the new logger. (#667) PR #1031
+* Bugfix - Fix query caching deleting non-DataJoint files. PR #1027
+* Update - Minimum Python version for DataJoint-Python is now 3.7. PR #1027
 
 ### 0.13.5 -- May 19, 2022
 * Update - Import ABC from collections.abc for Python 3.10 compatibility
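The `filepath_checksum_size_limit` option introduced below in `datajoint/settings.py` controls when `download_filepath` verifies checksums. A minimal usage sketch, assuming only the public `dj.config` mapping; the 1 GiB value is purely illustrative:

```python
import datajoint as dj

# Files at or above this size skip checksum verification on download;
# the default, None, keeps checksums enabled for every file size.
dj.config["filepath_checksum_size_limit"] = 1024**3  # 1 GiB

# A limit of 0 disables verification entirely (no file is smaller than
# 0 bytes); the test suite below uses this to drive the skip path.
```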
diff --git a/datajoint/external.py b/datajoint/external.py
index c04cc40c4..265152cd4 100644
--- a/datajoint/external.py
+++ b/datajoint/external.py
@@ -1,6 +1,7 @@
 from pathlib import Path, PurePosixPath, PureWindowsPath
 from collections.abc import Mapping
 from tqdm import tqdm
+import logging
 from .settings import config
 from .errors import DataJointError, MissingExternalFile
 from .hash import uuid_from_buffer, uuid_from_file
@@ -10,6 +11,8 @@
 from . import s3
 from .utils import safe_write, safe_copy
 
+logger = logging.getLogger(__name__.split(".")[0])
+
 CACHE_SUBFOLDING = (
     2,
     2,
@@ -72,9 +75,7 @@ def definition(self):
 
     @property
     def table_name(self):
-        return "{external_table_root}_{store}".format(
-            external_table_root=EXTERNAL_TABLE_ROOT, store=self.store
-        )
+        return f"{EXTERNAL_TABLE_ROOT}_{self.store}"
 
     @property
     def s3(self):
@@ -276,9 +277,7 @@ def upload_filepath(self, local_filepath):
             # the tracking entry exists, check that it's the same file as before
             if contents_hash != check_hash[0]:
                 raise DataJointError(
-                    "A different version of '{file}' has already been placed.".format(
-                        file=relative_filepath
-                    )
+                    f"A different version of '{relative_filepath}' has already been placed."
                 )
         else:
             # upload the file and create its tracking entry
@@ -304,27 +303,43 @@ def download_filepath(self, filepath_hash):
         :param filepath_hash: The hash (UUID) of the relative_path
         :return: hash (UUID) of the contents of the downloaded file or None
         """
+
+        def _need_checksum(local_filepath, expected_size):
+            limit = config.get("filepath_checksum_size_limit")
+            actual_size = Path(local_filepath).stat().st_size
+            if expected_size != actual_size:
+                # this should never happen without outside interference
+                raise DataJointError(
+                    f"'{local_filepath}' downloaded but size did not match."
+                )
+            return limit is None or actual_size < limit
+
         if filepath_hash is not None:
-            relative_filepath, contents_hash = (self & {"hash": filepath_hash}).fetch1(
-                "filepath", "contents_hash"
-            )
+            relative_filepath, contents_hash, size = (
+                self & {"hash": filepath_hash}
+            ).fetch1("filepath", "contents_hash", "size")
             external_path = self._make_external_filepath(relative_filepath)
             local_filepath = Path(self.spec["stage"]).absolute() / relative_filepath
-            file_exists = (
-                Path(local_filepath).is_file()
-                and uuid_from_file(local_filepath) == contents_hash
+
+            file_exists = Path(local_filepath).is_file() and (
+                not _need_checksum(local_filepath, size)
+                or uuid_from_file(local_filepath) == contents_hash
             )
+
             if not file_exists:
                 self._download_file(external_path, local_filepath)
-                checksum = uuid_from_file(local_filepath)
                 if (
-                    checksum != contents_hash
-                ):  # this should never happen without outside interference
+                    _need_checksum(local_filepath, size)
+                    and uuid_from_file(local_filepath) != contents_hash
+                ):
+                    # this should never happen without outside interference
                     raise DataJointError(
-                        "'{file}' downloaded but did not pass checksum'".format(
-                            file=local_filepath
-                        )
+                        f"'{local_filepath}' downloaded but did not pass checksum."
                     )
+            if not _need_checksum(local_filepath, size):
+                logger.warning(
+                    f"Skipped checksum for file with hash: {contents_hash}, and path: {local_filepath}"
+                )
         return str(local_filepath), contents_hash
 
     # --- UTILITIES ---
@@ -402,7 +417,7 @@ def delete(
         delete_external_files=None,
         limit=None,
         display_progress=True,
-        errors_as_string=True
+        errors_as_string=True,
     ):
         """
 
diff --git a/datajoint/settings.py b/datajoint/settings.py
index e07642760..3da611af6 100644
--- a/datajoint/settings.py
+++ b/datajoint/settings.py
@@ -46,6 +46,7 @@
         "display.show_tuple_count": True,
         "database.use_tls": None,
         "enable_python_native_blobs": True,  # python-native/dj0 encoding support
+        "filepath_checksum_size_limit": None,  # file size limit for when to disable checksums
     }
 )
 
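Because `datajoint/external.py` above registers its logger as `logging.getLogger(__name__.split(".")[0])`, which resolves to the package root `"datajoint"`, applications can tune the new skip-checksum warnings in one place. A short sketch, standard library only; the file name is illustrative:

```python
import logging

# Surface the "Skipped checksum" warnings emitted by download_filepath:
logging.getLogger("datajoint").setLevel(logging.WARNING)

# Or route all datajoint messages into an application log file:
handler = logging.FileHandler("datajoint.log")
logging.getLogger("datajoint").addHandler(handler)
```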
diff --git a/docs-parts/intro/Releases_lang1.rst b/docs-parts/intro/Releases_lang1.rst
index 95466ea2c..6a803eebe 100644
--- a/docs-parts/intro/Releases_lang1.rst
+++ b/docs-parts/intro/Releases_lang1.rst
@@ -1,7 +1,10 @@
 0.13.6 -- Jun 13, 2022
 ----------------------
-* Add - unified package level logger for package (#667) PR #1031
-* Update - swap various datajoint messages, warnings, etc. to use the new logger. (#667) PR #1031
+* Add - Config option to set a size threshold above which filepath checksums are skipped. PR #1025
+* Add - Unified package-level logger for the package (#667) PR #1031
+* Update - Swap various DataJoint messages, warnings, etc. to use the new logger. (#667) PR #1031
+* Bugfix - Fix query caching deleting non-DataJoint files. PR #1027
+* Update - Minimum Python version for DataJoint-Python is now 3.7. PR #1027
 
 0.13.5 -- May 19, 2022
 ----------------------
diff --git a/tests/test_filepath.py b/tests/test_filepath.py
index d0834b071..3e94e4885 100644
--- a/tests/test_filepath.py
+++ b/tests/test_filepath.py
@@ -3,8 +3,11 @@
 import os
 from pathlib import Path
 import random
-
 from .schema_external import schema, Filepath, FilepathS3, stores_config
+import logging
+import io
+
+logger = logging.getLogger("datajoint")
 
 
 def setUp(self):
@@ -132,7 +135,9 @@ def test_duplicate_error_s3():
     test_duplicate_error(store="repo-s3")
 
 
-def test_filepath_class(table=Filepath(), store="repo"):
+def test_filepath_class(table=Filepath(), store="repo", verify_checksum=True):
+    if not verify_checksum:
+        dj.config["filepath_checksum_size_limit"] = 0
     dj.errors._switch_filepath_types(True)
     stage_path = dj.config["stores"][store]["stage"]
     # create a mock file
@@ -169,6 +174,7 @@
     # delete from external table
     table.external[store].delete(delete_external_files=True)
     dj.errors._switch_filepath_types(False)
+    dj.config["filepath_checksum_size_limit"] = None
 
 
 def test_filepath_class_again():
@@ -185,6 +191,24 @@ def test_filepath_class_s3_again():
     test_filepath_class(FilepathS3(), "repo-s3")
 
 
+def test_filepath_class_no_checksum():
+    log_capture = io.StringIO()
+    stream_handler = logging.StreamHandler(log_capture)
+    log_format = logging.Formatter(
+        "[%(asctime)s][%(funcName)s][%(levelname)s]: %(message)s"
+    )
+    stream_handler.setFormatter(log_format)
+    stream_handler.set_name("test_limit_warning")
+    logger.addHandler(stream_handler)
+    test_filepath_class(verify_checksum=False)
+    log_contents = log_capture.getvalue()
+    log_capture.close()
+    for handler in logger.handlers:  # Clean up handler
+        if handler.name == "test_limit_warning":
+            logger.removeHandler(handler)
+    assert "Skipped checksum for file with hash:" in log_contents
+
+
 def test_filepath_cleanup(table=Filepath(), store="repo"):
     """test deletion of filepath entries from external table"""
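Once the stored size matches, the decision implemented by `_need_checksum` reduces to `limit is None or actual_size < limit`. A standalone sketch of that rule (the `need_checksum` helper here is hypothetical, mirroring the nested function above) shows why `test_filepath_class_no_checksum` sets the limit to `0` to disable verification:

```python
def need_checksum(actual_size: int, limit) -> bool:
    """Mirror of the decision rule in _need_checksum."""
    return limit is None or actual_size < limit

assert need_checksum(10, None)      # default: always verify
assert need_checksum(10, 100)       # below the limit: verify
assert not need_checksum(100, 100)  # at or above the limit: skip
assert not need_checksum(10, 0)     # limit 0: never verify (as in the test)
```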