Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions src/zimscraperlib/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pathlib
import shutil
import tempfile
from collections.abc import Iterable

from zimscraperlib import logger
from zimscraperlib.constants import DEFAULT_USER_AGENT
Expand Down Expand Up @@ -111,3 +112,26 @@ def compute_descriptions(
user_description = user_description[:-1] + "…"

return (user_description, user_long_description)


def compute_tags(
default_tags: Iterable[str],
user_tags: str | None,
) -> set[str]:
"""Computes a list of tags string compliant with ZIM standard.

Based on default tags (set by the scraper) and user provided tags (usually retrived
from the CLI arguments), the function computes a tag string to be used as metadata
which is compliant with the ZIM standard. It removes duplicates and empty values,
and strip leading and trailing whitespaces.

args:
default_tags: the list of default tags always set for a given scraper
user_tags: the tags, separated by semi-colon, as given by user at CLI args

Returns a set of tags, ready to be passed to the creator
"""

return {
tag.strip() for tag in list(default_tags) + (user_tags or "").split(";") if tag
}
11 changes: 11 additions & 0 deletions src/zimscraperlib/zim/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,17 @@ def validate_tags(name: str, value: Iterable[str] | str):
or not all(isinstance(tag, str) for tag in value)
):
raise ValueError(f"Invalid type(s) for {name}")
if (
name == "Tags"
and not isinstance(value, str)
and isinstance(value, Iterable)
and len(set(value)) != len(list(value))
):
raise ValueError(f"Duplicate tags are not valid: {value}")
if name == "Tags" and isinstance(value, str):
values = value.split(";")
if len(set(values)) != len(list(values)):
raise ValueError(f"Duplicate tags are not valid: {value}")


def validate_illustrations(name: str, value: bytes):
Expand Down
11 changes: 6 additions & 5 deletions tests/download/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,8 @@ def test_large_download_https(tmp_path, valid_https_url):
@pytest.mark.parametrize(
"url,video_id",
[
("https://vimeo.com/619427082", "619427082"),
("https://vimeo.com/619427082", "619427082"),
("https://tube.jeena.net/w/tyekuoPZqb7BtkyNPwVHJL", "tyekuoPZqb7BtkyNPwVHJL"),
("https://tube.jeena.net/w/tyekuoPZqb7BtkyNPwVHJL", "tyekuoPZqb7BtkyNPwVHJL"),
],
)
def test_youtube_download_serial(url, video_id, tmp_path):
Expand All @@ -186,7 +186,7 @@ def test_youtube_download_serial(url, video_id, tmp_path):
def test_youtube_download_nowait(tmp_path):
with YoutubeDownloader(threads=1) as yt_downloader:
future = yt_downloader.download(
"https://vimeo.com/619427082",
"https://tube.jeena.net/w/tyekuoPZqb7BtkyNPwVHJL",
BestMp4.get_options(target_dir=tmp_path),
wait=False,
)
Expand All @@ -212,10 +212,11 @@ def test_youtube_download_error():
def test_youtube_download_contextmanager(tmp_path):
with YoutubeDownloader(threads=1) as yt_downloader:
yt_downloader.download(
"https://vimeo.com/619427082", BestWebm.get_options(target_dir=tmp_path)
"https://tube.jeena.net/w/tyekuoPZqb7BtkyNPwVHJL",
BestWebm.get_options(target_dir=tmp_path),
)
assert yt_downloader.executor._shutdown
assert tmp_path.joinpath("video.mp4").exists() # videmo doesn't offer webm
assert tmp_path.joinpath("video.mp4").exists() # jeena doesn't offer webm


@pytest.fixture
Expand Down
31 changes: 30 additions & 1 deletion tests/inputs/test_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@
MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH as MAX_LONG_DESC_LENGTH,
)
from zimscraperlib.constants import NAME as PROJECT_NAME
from zimscraperlib.inputs import compute_descriptions, handle_user_provided_file
from zimscraperlib.inputs import (
compute_descriptions,
compute_tags,
handle_user_provided_file,
)


def test_with_none():
Expand Down Expand Up @@ -296,3 +300,28 @@ def test_description(

assert description == expected_description
assert long_description == expected_long_description


@pytest.mark.parametrize(
    "default_tags, user_tags, expected_tags",
    [
        # plain, already-clean tags from both sources are simply merged
        pytest.param(
            {"tag1", "tag2"},
            "tag3;tag4",
            {"tag1", "tag2", "tag3", "tag4"},
            id="case1",
        ),
        # surrounding whitespace is stripped, duplicates collapse, and only ";"
        # acts as a separator in the user string (":" and "," are kept verbatim)
        pytest.param(
            {" tag1", " tag2 "},
            " ta:g,4;tag2 ",
            {"tag1", "tag2", "ta:g,4"},
            id="case2",
        ),
    ],
)
def test_compute_tags(
    default_tags: set[str],
    user_tags: str,
    expected_tags: set[str],
):
    """Check that compute_tags merges default and user tags into the expected set."""
    computed_tags = compute_tags(default_tags, user_tags)
    assert computed_tags == expected_tags
29 changes: 29 additions & 0 deletions tests/zim/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,32 @@ def test_validate_language_valid(name: str, value: Iterable[str] | str):
def test_validate_language_invalid(name: str, value: Iterable[str] | str):
with pytest.raises(ValueError, match=re.escape("is not ISO-639-3")):
metadata.validate_language(name, value)


@pytest.mark.parametrize(
    "tags, is_valid",
    [
        pytest.param("", True, id="empty_string"),
        # this id used to be a second "empty_string", which was both a duplicate
        # and a wrong description of the value under test
        pytest.param("tag1", True, id="one_tag_string"),
        pytest.param("taaaag1", True, id="many_letters"),
        pytest.param("tag1;tag2", True, id="semi_colon_distinct_1"),
        pytest.param("tag2;tag2", False, id="semi_colon_identical"),
        pytest.param("tag,1;tug,1", True, id="semi_colon_distinct_2"),
        pytest.param(
            "tag1,tag2", True, id="comma"
        ),  # we cannot say that this ought to be a tags separator
        pytest.param({"tag1"}, True, id="one_tag_in_set"),
        pytest.param({"tag1", "tag2"}, True, id="two_tags_in_set"),
        pytest.param(1, False, id="one_int"),
        pytest.param(None, False, id="none_value"),
        pytest.param(["tag1", "tag2"], True, id="two_distinct"),
        pytest.param(["tag1", "tag1"], False, id="two_identical"),
        pytest.param(["tag1", 1], False, id="int_in_list"),
    ],
)
def test_validate_tags(tags, is_valid):
    """Check that validate_tags accepts valid "Tags" values and rejects others.

    Valid values must pass without raising; invalid ones must raise ValueError.
    """
    if is_valid:
        metadata.validate_tags("Tags", tags)
    else:
        with pytest.raises(ValueError):
            metadata.validate_tags("Tags", tags)