diff --git a/src/zimscraperlib/inputs.py b/src/zimscraperlib/inputs.py index 20326537..c6c1efd0 100644 --- a/src/zimscraperlib/inputs.py +++ b/src/zimscraperlib/inputs.py @@ -6,6 +6,7 @@ import pathlib import shutil import tempfile +from collections.abc import Iterable from zimscraperlib import logger from zimscraperlib.constants import DEFAULT_USER_AGENT @@ -111,3 +112,26 @@ def compute_descriptions( user_description = user_description[:-1] + "…" return (user_description, user_long_description) + + +def compute_tags( + default_tags: Iterable[str], + user_tags: str | None, +) -> set[str]: + """Computes a list of tags string compliant with ZIM standard. + + Based on default tags (set by the scraper) and user provided tags (usually retrived + from the CLI arguments), the function computes a tag string to be used as metadata + which is compliant with the ZIM standard. It removes duplicates and empty values, + and strip leading and trailing whitespaces. + + args: + default_tags: the list of default tags always set for a given scraper + user_tags: the tags, separated by semi-colon, as given by user at CLI args + + Returns a set of tags, ready to be passed to the creator + """ + + return { + tag.strip() for tag in list(default_tags) + (user_tags or "").split(";") if tag + } diff --git a/src/zimscraperlib/zim/metadata.py b/src/zimscraperlib/zim/metadata.py index f96f2ec3..a364fc6b 100644 --- a/src/zimscraperlib/zim/metadata.py +++ b/src/zimscraperlib/zim/metadata.py @@ -103,6 +103,17 @@ def validate_tags(name: str, value: Iterable[str] | str): or not all(isinstance(tag, str) for tag in value) ): raise ValueError(f"Invalid type(s) for {name}") + if ( + name == "Tags" + and not isinstance(value, str) + and isinstance(value, Iterable) + and len(set(value)) != len(list(value)) + ): + raise ValueError(f"Duplicate tags are not valid: {value}") + if name == "Tags" and isinstance(value, str): + values = value.split(";") + if len(set(values)) != len(list(values)): + raise 
ValueError(f"Duplicate tags are not valid: {value}") def validate_illustrations(name: str, value: bytes): diff --git a/tests/download/test_download.py b/tests/download/test_download.py index 2ad44480..9d82a72a 100644 --- a/tests/download/test_download.py +++ b/tests/download/test_download.py @@ -167,8 +167,8 @@ def test_large_download_https(tmp_path, valid_https_url): @pytest.mark.parametrize( "url,video_id", [ - ("https://vimeo.com/619427082", "619427082"), - ("https://vimeo.com/619427082", "619427082"), + ("https://tube.jeena.net/w/tyekuoPZqb7BtkyNPwVHJL", "tyekuoPZqb7BtkyNPwVHJL"), + ("https://tube.jeena.net/w/tyekuoPZqb7BtkyNPwVHJL", "tyekuoPZqb7BtkyNPwVHJL"), ], ) def test_youtube_download_serial(url, video_id, tmp_path): @@ -186,7 +186,7 @@ def test_youtube_download_serial(url, video_id, tmp_path): def test_youtube_download_nowait(tmp_path): with YoutubeDownloader(threads=1) as yt_downloader: future = yt_downloader.download( - "https://vimeo.com/619427082", + "https://tube.jeena.net/w/tyekuoPZqb7BtkyNPwVHJL", BestMp4.get_options(target_dir=tmp_path), wait=False, ) @@ -212,10 +212,11 @@ def test_youtube_download_error(): def test_youtube_download_contextmanager(tmp_path): with YoutubeDownloader(threads=1) as yt_downloader: yt_downloader.download( - "https://vimeo.com/619427082", BestWebm.get_options(target_dir=tmp_path) + "https://tube.jeena.net/w/tyekuoPZqb7BtkyNPwVHJL", + BestWebm.get_options(target_dir=tmp_path), ) assert yt_downloader.executor._shutdown - assert tmp_path.joinpath("video.mp4").exists() # videmo doesn't offer webm + assert tmp_path.joinpath("video.mp4").exists() # jeena doesn't offer webm @pytest.fixture diff --git a/tests/inputs/test_inputs.py b/tests/inputs/test_inputs.py index c0df932e..fd8a2cda 100644 --- a/tests/inputs/test_inputs.py +++ b/tests/inputs/test_inputs.py @@ -16,7 +16,11 @@ MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH as MAX_LONG_DESC_LENGTH, ) from zimscraperlib.constants import NAME as PROJECT_NAME -from zimscraperlib.inputs 
@pytest.mark.parametrize(
    "default_tags, user_tags, expected_tags",
    [
        # distinct default and user tags are simply merged
        pytest.param(
            {"tag1", "tag2"},
            "tag3;tag4",
            {"tag1", "tag2", "tag3", "tag4"},
            id="case1",
        ),
        # surrounding whitespace is stripped, duplicates collapse, and only the
        # semi-colon acts as a separator (colon and comma stay inside the tag)
        pytest.param(
            {" tag1", " tag2 "},
            " ta:g,4;tag2 ",
            {"tag1", "tag2", "ta:g,4"},
            id="case2",
        ),
    ],
)
def test_compute_tags(
    default_tags: set[str],
    user_tags: str,
    expected_tags: set[str],
):
    """Check that compute_tags merges default and user tags as expected."""
    assert compute_tags(default_tags, user_tags) == expected_tags


@pytest.mark.parametrize(
    "tags, is_valid",
    [
        pytest.param("", True, id="empty_string"),
        pytest.param("tag1", True, id="single_tag"),
        pytest.param("taaaag1", True, id="many_letters"),
        pytest.param("tag1;tag2", True, id="semi_colon_distinct_1"),
        pytest.param("tag2;tag2", False, id="semi_colon_identical"),
        pytest.param("tag,1;tug,1", True, id="semi_colon_distinct_2"),
        pytest.param(
            "tag1,tag2", True, id="comma"
        ),  # we cannot say that this ought to be a tags separator
        pytest.param({"tag1"}, True, id="one_tag_in_set"),
        pytest.param({"tag1", "tag2"}, True, id="two_tags_in_set"),
        pytest.param(1, False, id="one_int"),
        pytest.param(None, False, id="none_value"),
        pytest.param(["tag1", "tag2"], True, id="two_distinct"),
        pytest.param(["tag1", "tag1"], False, id="two_identical"),
        pytest.param(["tag1", 1], False, id="int_in_list"),
    ],
)
def test_validate_tags(tags, is_valid):
    """Check validate_tags accepts valid "Tags" values and rejects the others.

    A valid value must pass without raising; an invalid one must raise
    ValueError.
    """
    if is_valid:
        metadata.validate_tags("Tags", tags)
    else:
        with pytest.raises(ValueError):
            metadata.validate_tags("Tags", tags)