openzim · benoit74 · Jul 4, 2024 · Jun 28, 2024 · Jun 28, 2024 · Jun 28, 2024
diff --git a/src/zimscraperlib/inputs.py b/src/zimscraperlib/inputs.py
@@ -6,6 +6,7 @@
 import pathlib
 import shutil
 import tempfile
+from collections.abc import Iterable
 
 from zimscraperlib import logger
 from zimscraperlib.constants import DEFAULT_USER_AGENT
@@ -111,3 +112,26 @@ def compute_descriptions(
             user_description = user_description[:-1] + "…"
 
     return (user_description, user_long_description)
+
+
+def compute_tags(
+    default_tags: Iterable[str],
+    user_tags: str | None,
+) -> set[str]:
+    """Compute the set of tags to use as ZIM metadata.
+
+    Merges the default tags (set by the scraper) with the user-provided tags
+    (usually retrieved from the CLI arguments). Every tag is stripped of leading
+    and trailing whitespace; duplicates and empty values are dropped.
+
+    args:
+        default_tags: the default tags always set for a given scraper
+        user_tags:    the tags, separated by semi-colon, as given by user at CLI args
+
+    Returns a set of tags, ready to be passed to the creator
+    """
+
+    candidates = [*default_tags, *(user_tags or "").split(";")]
+    # strip first, then filter: a whitespace-only entry ("a; ;b") must not yield ""
+    stripped_tags = (tag.strip() for tag in candidates)
+    return {tag for tag in stripped_tags if tag}
diff --git a/src/zimscraperlib/zim/metadata.py b/src/zimscraperlib/zim/metadata.py
@@ -103,6 +103,17 @@ def validate_tags(name: str, value: Iterable[str] | str):
         or not all(isinstance(tag, str) for tag in value)
     ):
         raise ValueError(f"Invalid type(s) for {name}")
+    if (
+        name == "Tags"
+        and not isinstance(value, str)
+        and isinstance(value, Iterable)
+        and len(set(value)) != len(list(value))
+    ):
+        raise ValueError(f"Duplicate tags are not valid: {value}")
+    if name == "Tags" and isinstance(value, str):
+        values = value.split(";")
+        if len(set(values)) != len(list(values)):
+            raise ValueError(f"Duplicate tags are not valid: {value}")
 
 
 def validate_illustrations(name: str, value: bytes):

diff --git a/tests/download/test_download.py b/tests/download/test_download.py
@@ -167,8 +167,8 @@ def test_large_download_https(tmp_path, valid_https_url):
 @pytest.mark.parametrize(
     "url,video_id",
     [
-        ("https://vimeo.com/619427082", "619427082"),
-        ("https://vimeo.com/619427082", "619427082"),
+        ("https://tube.jeena.net/w/tyekuoPZqb7BtkyNPwVHJL", "tyekuoPZqb7BtkyNPwVHJL"),
+        ("https://tube.jeena.net/w/tyekuoPZqb7BtkyNPwVHJL", "tyekuoPZqb7BtkyNPwVHJL"),
     ],
 )
 def test_youtube_download_serial(url, video_id, tmp_path):
@@ -186,7 +186,7 @@ def test_youtube_download_serial(url, video_id, tmp_path):
 def test_youtube_download_nowait(tmp_path):
     with YoutubeDownloader(threads=1) as yt_downloader:
         future = yt_downloader.download(
-            "https://vimeo.com/619427082",
+            "https://tube.jeena.net/w/tyekuoPZqb7BtkyNPwVHJL",
             BestMp4.get_options(target_dir=tmp_path),
             wait=False,
         )
@@ -212,10 +212,11 @@ def test_youtube_download_error():
 def test_youtube_download_contextmanager(tmp_path):
     with YoutubeDownloader(threads=1) as yt_downloader:
         yt_downloader.download(
-            "https://vimeo.com/619427082", BestWebm.get_options(target_dir=tmp_path)
+            "https://tube.jeena.net/w/tyekuoPZqb7BtkyNPwVHJL",
+            BestWebm.get_options(target_dir=tmp_path),
         )
     assert yt_downloader.executor._shutdown
-    assert tmp_path.joinpath("video.mp4").exists()  # videmo doesn't offer webm
+    assert tmp_path.joinpath("video.mp4").exists()  # jeena doesn't offer webm
 
 
 @pytest.fixture

diff --git a/tests/inputs/test_inputs.py b/tests/inputs/test_inputs.py
@@ -16,7 +16,11 @@
     MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH as MAX_LONG_DESC_LENGTH,
 )
 from zimscraperlib.constants import NAME as PROJECT_NAME
-from zimscraperlib.inputs import compute_descriptions, handle_user_provided_file
+from zimscraperlib.inputs import (
+    compute_descriptions,
+    compute_tags,
+    handle_user_provided_file,
+)
 
 
 def test_with_none():
@@ -296,3 +300,28 @@ def test_description(
 
     assert description == expected_description
     assert long_description == expected_long_description
+
+
+@pytest.mark.parametrize(
+    "default_tags, user_tags, expected_tags",
+    [
+        pytest.param(
+            {"tag1", "tag2"}, "tag3;tag4", {"tag1", "tag2", "tag3", "tag4"}, id="case1"
+        ),
+        # whitespace is stripped, duplicate "tag2" collapses, ":" and "," are kept
+        pytest.param(
+            {" tag1", "  tag2 "},
+            " ta:g,4;tag2 ",
+            {"tag1", "tag2", "ta:g,4"},
+            id="case2",
+        ),
+    ],
+)
+def test_compute_tags(
+    default_tags: set[str],
+    user_tags: str,
+    expected_tags: set[str],
+):
+    """Check that compute_tags returns the expected set for each input pair."""
+    computed_tags = compute_tags(default_tags, user_tags)
+    assert computed_tags == expected_tags
diff --git a/tests/zim/test_metadata.py b/tests/zim/test_metadata.py
@@ -32,3 +32,32 @@ def test_validate_language_valid(name: str, value: Iterable[str] | str):
 def test_validate_language_invalid(name: str, value: Iterable[str] | str):
     with pytest.raises(ValueError, match=re.escape("is not ISO-639-3")):
         metadata.validate_language(name, value)
+
+
+@pytest.mark.parametrize(
+    "tags, is_valid",
+    [
+        pytest.param("", True, id="empty_string"),
+        pytest.param("tag1", True, id="one_tag"),
+        pytest.param("taaaag1", True, id="many_letters"),
+        pytest.param("tag1;tag2", True, id="semi_colon_distinct_1"),
+        pytest.param("tag2;tag2", False, id="semi_colon_identical"),
+        pytest.param("tag,1;tug,1", True, id="semi_colon_distinct_2"),
+        # we cannot say that a comma ought to be a tags separator
+        pytest.param("tag1,tag2", True, id="comma"),
+        pytest.param({"tag1"}, True, id="one_tag_in_set"),
+        pytest.param({"tag1", "tag2"}, True, id="two_tags_in_set"),
+        pytest.param(1, False, id="one_int"),
+        pytest.param(None, False, id="none_value"),
+        pytest.param(["tag1", "tag2"], True, id="two_distinct"),
+        pytest.param(["tag1", "tag1"], False, id="two_identical"),
+        pytest.param(["tag1", 1], False, id="int_in_list"),
+    ],
+)
+def test_validate_tags(tags, is_valid):
+    """Check that valid tags pass silently and invalid ones raise ValueError."""
+    if is_valid:
+        metadata.validate_tags("Tags", tags)
+    else:
+        with pytest.raises(ValueError):
+            metadata.validate_tags("Tags", tags)