Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add support for custom number of retries and user-agent in save_large_file (#278)
- Enhance save_large_file log level (#279)
- Extend image optimization to support in-memory streams(BytesIO/bytes) and dst_format param (#289)
- Automatically add \_ftindex:<yes/no> tag at creator start based on indexing configuration (#295)
- **BEHAVIOR CHANGE**: it is no longer possible to add "Tags" metadata with the Creator.add_metadata method after the creator has started; this has not been the recommended approach since 5.x anyway (one should prefer Creator.config_metadata)

### Fixed

Expand Down
17 changes: 17 additions & 0 deletions src/zimscraperlib/zim/creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
LanguageMetadata,
MetadataBase,
StandardMetadataList,
TagsMetadata,
)

DUPLICATE_EXC_STR = re.compile(
Expand Down Expand Up @@ -114,6 +115,7 @@ def __init__(
super().__init__(filename=filename)
self._metadata: dict[str, AnyMetadata] = {}
self.__indexing_configured = False
self.__indexing_value: bool = False
self.can_finish = True

self.set_mainpath(main_path)
Expand Down Expand Up @@ -142,6 +144,7 @@ def config_indexing(
raise ValueError("Not a valid ISO-639-3 language code")
super().config_indexing(indexing, language)
self.__indexing_configured = True
self.__indexing_value = indexing
Comment thread
benoit74 marked this conversation as resolved.
return self

def _log_metadata(self):
Expand Down Expand Up @@ -223,6 +226,20 @@ def start(self):
) and not self.__indexing_configured:
self.config_indexing(True, language)

ftindex_tag = f"_ftindex:{'yes' if self.__indexing_value else 'no'}"
tags_metadata = self._metadata.get(TagsMetadata.meta_name)
if isinstance(tags_metadata, TagsMetadata):
if not any(
re.sub(r"\s+", "", part).startswith("_ftindex:")
Comment thread
benoit74 marked this conversation as resolved.
for tag in tags_metadata.value
for part in tag.split(";")
):
tags_metadata.value.append(ftindex_tag)
logger.debug(f"Metadata: Tags has been altered with '{ftindex_tag}'")
else:
self._metadata[TagsMetadata.meta_name] = TagsMetadata([ftindex_tag])
logger.debug(f"Metadata: Tags has been set with '{ftindex_tag}'")

super().__enter__()

for metadata in self._metadata.values():
Expand Down
2 changes: 1 addition & 1 deletion tests/zim/test_dedup.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def add_items(creator_or_deduplicator: Any):
for zim_path in [fpath_with_dedup, fpath_without_dedup]:
reader = Archive(zim_path)

assert reader.all_entry_count == 24
assert reader.all_entry_count == 25

for html_path in [
"welcome1",
Expand Down
64 changes: 62 additions & 2 deletions tests/zim/test_zim_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def test_zim_creator(
assert reader.get_text_metadata(
"Language"
) == DEFAULT_DEV_ZIM_METADATA.Language.libzim_value.decode("UTF-8")
assert reader.get_text_metadata("Tags") == tags
assert reader.get_text_metadata("Tags") == f"{tags};_ftindex:yes"
assert reader.main_entry.get_item().path == f"{main_path}"
# make sure we have our image
assert reader.get_item("images/yahoo.png")
Expand Down Expand Up @@ -950,7 +950,7 @@ def test_metadata_extras_missing_prefix(tmp_path: pathlib.Path):
DEFAULT_DEV_ZIM_METADATA.Title.libzim_value.decode() + "Foo",
id="simple_str",
),
pytest.param("Tags", TagsMetadata(["tag1", "tag2"]), "tag1;tag2", id="tags"),
pytest.param("Source", SourceMetadata("asource"), "asource", id="source"),
],
)
def test_add_metadata(
Expand All @@ -973,3 +973,63 @@ def test_config_indexing(tmp_path: pathlib.Path):
assert Creator(tmp_path / "_.zim", "").config_indexing(True, "bam")
assert Creator(tmp_path / "_.zim", "").config_indexing(False, "bam")
assert Creator(tmp_path / "_.zim", "").config_indexing(False)


@pytest.mark.parametrize(
    "indexing, expected_tag",
    [
        pytest.param(True, "_ftindex:yes", id="explicit_yes"),
        pytest.param(False, "_ftindex:no", id="explicit_no"),
    ],
)
def test_start_ftindex_tag_from_explicit_config_indexing(
    tmp_path: pathlib.Path, *, indexing: bool, expected_tag: str
):
    """Explicit config_indexing() yields exactly one matching _ftindex tag."""
    zim_file = tmp_path / "test.zim"

    # Build the creator step by step; each config_* call returns the object
    # that is then used as the context manager.
    creator = Creator(zim_file, "")
    creator = creator.config_dev_metadata()
    creator = creator.config_indexing(indexing, "fra")
    with creator:
        pass

    written_tags = Archive(zim_file).get_text_metadata("Tags").split(";")
    ftindex_tags = [tag for tag in written_tags if tag.startswith("_ftindex:")]

    assert expected_tag in written_tags
    assert len(ftindex_tags) == 1


def test_start_ftindex_yes_when_auto_configured_via_language(tmp_path: pathlib.Path):
    """Without explicit indexing config, the written Tags contain _ftindex:yes."""
    # DEFAULT_DEV_ZIM_METADATA carries Language=fra and no Tags. The language
    # makes start() call config_indexing(True) on its own, so the tag written
    # to the ZIM is expected to be _ftindex:yes.
    zim_file = tmp_path / "test.zim"

    creator = Creator(zim_file, "")
    with creator.config_dev_metadata():
        pass

    stored_tags = Archive(zim_file).get_text_metadata("Tags")
    assert "_ftindex:yes" in stored_tags.split(";")


def test_start_ftindex_appended_to_existing_tags(tmp_path: pathlib.Path):
    """User-supplied tags are kept and _ftindex:yes is present alongside them."""
    zim_file = tmp_path / "test.zim"

    creator = Creator(zim_file, "")
    with creator.config_dev_metadata(TagsMetadata(["mytag", "_pictures:no"])):
        pass

    written_tags = Archive(zim_file).get_text_metadata("Tags").split(";")

    # The automatic tag first, then each tag the caller configured.
    for wanted in ("_ftindex:yes", "mytag", "_pictures:no"):
        assert wanted in written_tags


def test_start_ftindex_not_duplicated_when_already_set(tmp_path: pathlib.Path):
    """A caller-provided _ftindex:no tag is kept as-is and not overridden."""
    zim_file = tmp_path / "test.zim"

    creator = Creator(zim_file, "")
    with creator.config_dev_metadata(TagsMetadata(["_ftindex:no"])):
        pass

    written_tags = Archive(zim_file).get_text_metadata("Tags").split(";")

    # The explicit _ftindex:no must survive exactly once, even though indexing
    # gets auto-configured to True; no _ftindex:yes may be added next to it.
    explicit_no = [tag for tag in written_tags if tag == "_ftindex:no"]
    assert len(explicit_no) == 1
    assert "_ftindex:yes" not in written_tags


def test_start_ftindex_not_duplicated_when_set_with_spaces(tmp_path: pathlib.Path):
    """An _ftindex tag written with inner spaces still blocks a second one."""
    # The input " _ftindex : no " contains spaces; once cleaned it is stored
    # as "_ftindex : no". Detection has to see through those spaces so that
    # no additional _ftindex tag gets appended.
    zim_file = tmp_path / "test.zim"

    creator = Creator(zim_file, "")
    with creator.config_dev_metadata(TagsMetadata([" _ftindex : no "])):
        pass

    written_tags = Archive(zim_file).get_text_metadata("Tags").split(";")
    ftindex_like = [
        tag
        for tag in written_tags
        if tag.replace(" ", "").startswith("_ftindex:")
    ]
    assert len(ftindex_like) == 1
Loading