From 4557137ad0cb5779d680de752a590fc8f05bd1c6 Mon Sep 17 00:00:00 2001 From: JohannesBOT Date: Wed, 10 Jun 2026 13:09:01 +0200 Subject: [PATCH] feat(updater): add KavitaVolumeCoverUpdater for back-filling null volumes Introduce a new background service that periodically re-checks chapters whose volume could not be resolved at move time. - Add KavitaVolumeCoverUpdater.py to resolve null volumes via MangaDex, update ComicInfo.xml in-archive, and swap in MangaBaka volume covers - Wire updater into main.py entry point with UPDATER_ENABLED env flag - Add UPDATER_ENABLED env var to docker-compose.prod.yml - Update CronSchedule.py to schedule updater runs --- docker-compose.prod.yml | 8 + main.py | 26 ++ src/ComicInfoBuilder.py | 22 +- src/CronSchedule.py | 159 ++++++++++ src/KavitaVolumeCoverUpdater.py | 536 ++++++++++++++++++++++++++++++++ 5 files changed, 746 insertions(+), 5 deletions(-) create mode 100644 src/CronSchedule.py create mode 100644 src/KavitaVolumeCoverUpdater.py diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index a61aebf..b97f467 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -11,6 +11,14 @@ services: DELETE_SOURCE: "${DELETE_SOURCE:-true}" MATCH_PATH: "${MATCH_PATH:-/config/matches.json}" WEB_PORT: "${WEB_PORT:-8080}" + # Volume/cover back-fill updater + UPDATER_ENABLED: "${UPDATER_ENABLED:-true}" + # Cron expression: "0 19 * * 1,4" = 19:00 every Monday and Thursday + # (local time, see TZ) + UPDATER_SCHEDULE: "${UPDATER_SCHEDULE:-0 19 * * 1,4}" + UPDATER_LOG: "${UPDATER_LOG:-/config/volume_updater.log}" + # Timezone for the cron schedule — without this 19:00 means 19:00 UTC + TZ: "${TZ:-Europe/Berlin}" ports: - "${WEB_PORT:-8080}:${WEB_PORT:-8080}" volumes: diff --git a/main.py b/main.py index 87c782e..6a6a0ca 100644 --- a/main.py +++ b/main.py @@ -27,6 +27,11 @@ Environment variables MATCH_PATH default /config/matches.json WEB_PORT default 8080 (Flask web UI for matches.json) WEB_HOST default 0.0.0.0 + 
UPDATER_ENABLED default true (volume/cover back-fill cron) + UPDATER_SCHEDULE cron expression for the updater scans, + default "0 19 * * 1,4" = 19:00 every Mon + Thu + (local time — set TZ inside the container!) + UPDATER_LOG default /config/volume_updater.log """ from __future__ import annotations @@ -43,6 +48,7 @@ from src.SuwayomiMover import SuwayomiMover # noqa: E402 from src.SuwayomiFolderWatcher import SuwayomiFolderWatcher # noqa: E402 from src.MatchesCache import MatchesCache # noqa: E402 from src.MatchesWebApp import MatchesWebApp # noqa: E402 +from src.KavitaVolumeCoverUpdater import KavitaVolumeCoverUpdater # noqa: E402 def _env_str(name: str, default: "str | None" = None, @@ -85,6 +91,9 @@ def main() -> int: match_path = _env_str("MATCH_PATH", "/config/matches.json") web_host = _env_str("WEB_HOST", "0.0.0.0") or "0.0.0.0" web_port = _env_int("WEB_PORT", 8080) + updater_enabled = _env_bool("UPDATER_ENABLED", True) + updater_schedule = _env_str("UPDATER_SCHEDULE", "0 19 * * 1,4") + updater_log = _env_str("UPDATER_LOG", "/config/volume_updater.log") print(f"[main] suwayomi = {suwayomi_path}", flush=True) print(f"[main] kavita = {kavita_path}", flush=True) @@ -112,6 +121,23 @@ def main() -> int: web_app = MatchesWebApp(matches_cache, mover=mover, host=web_host, port=web_port) web_app.start() + if updater_enabled: + try: + updater = KavitaVolumeCoverUpdater( + kavita_path, + matches_cache=matches_cache, + language=language, + request_timeout=request_timeout, + log_path=updater_log, + schedule=updater_schedule, + ) + updater.start() + except ValueError as exc: + # Invalid cron expression — keep the service up, just without + # the updater, and make the config error obvious in the logs. 
+ print(f"[main] UPDATER_SCHEDULE invalid ({exc}); " + f"volume/cover updater DISABLED", flush=True) + # def shutdown(signum, _frame): # print(f"[main] received signal {signum}", flush=True) # watcher.stop() diff --git a/src/ComicInfoBuilder.py b/src/ComicInfoBuilder.py index b93e969..c073f32 100644 --- a/src/ComicInfoBuilder.py +++ b/src/ComicInfoBuilder.py @@ -1030,12 +1030,14 @@ class ComicInfoBuilder: return unique @staticmethod - def _read_existing_comicinfo(folder: Path) -> dict: - xml_path = folder / "ComicInfo.xml" - if not xml_path.is_file(): - return {} + def read_comicinfo_fields(xml_source) -> dict: + """ + Parses ComicInfo.xml content (bytes or str) and returns the fields + relevant as supplementary Suwayomi data. Returns {} on parse errors. + Reusable for XML read directly from a CBZ archive (no extraction). + """ try: - root = ET.parse(xml_path).getroot() + root = ET.fromstring(xml_source) except ET.ParseError: return {} @@ -1049,6 +1051,16 @@ class ComicInfoBuilder: data[tag] = child.text.strip() return data + @staticmethod + def _read_existing_comicinfo(folder: Path) -> dict: + xml_path = folder / "ComicInfo.xml" + if not xml_path.is_file(): + return {} + try: + return ComicInfoBuilder.read_comicinfo_fields(xml_path.read_bytes()) + except OSError: + return {} + @staticmethod def _image_dimensions(path: Path): if not _HAS_PIL: diff --git a/src/CronSchedule.py b/src/CronSchedule.py new file mode 100644 index 0000000..f30a86c --- /dev/null +++ b/src/CronSchedule.py @@ -0,0 +1,159 @@ +""" +cron_schedule.py +================ + +Minimal cron-expression parser — no external dependency. 
+ +Supports the classic 5-field syntax:: + + ┌──────── minute (0-59) + │ ┌────── hour (0-23) + │ │ ┌──── day of month (1-31) + │ │ │ ┌── month (1-12 or jan-dec) + │ │ │ │ ┌ day of week (0-7 or sun-sat; 0 and 7 = Sunday) + │ │ │ │ │ + 0 19 * * 1,4 -> 19:00 every Monday and Thursday + +Field syntax: ``*``, single values, ranges (``a-b``), steps (``*/n``, +``a-b/n``) and comma lists. Month / weekday names (``jan``, ``mon``, …) +are accepted case-insensitively. + +As in Vixie cron, when *both* day-of-month and day-of-week are restricted +the job runs when **either** matches. + +Times are evaluated against the local system clock (``datetime.now()``) — +in Docker set the ``TZ`` environment variable so "19:00" means local time. +""" + +from __future__ import annotations + +from datetime import datetime, timedelta + + +_MONTH_NAMES = {"jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6, + "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12} +_DAY_NAMES = {"sun": 0, "mon": 1, "tue": 2, "wed": 3, "thu": 4, + "fri": 5, "sat": 6} + + +def _parse_value(token: str, lo: int, hi: int, + names: "dict[str, int] | None") -> int: + token = token.strip().lower() + if names and token in names: + return names[token] + try: + value = int(token) + except ValueError: + raise ValueError(f"invalid cron value {token!r}") from None + if not (lo <= value <= hi): + raise ValueError(f"cron value {value} out of range {lo}-{hi}") + return value + + +def _parse_field(field: str, lo: int, hi: int, + names: "dict[str, int] | None" = None) -> "set[int]": + """Parses one cron field into the set of matching integer values.""" + result: set[int] = set() + for part in field.split(","): + part = part.strip() + if not part: + raise ValueError(f"empty element in cron field {field!r}") + + step = 1 + if "/" in part: + part, step_text = part.split("/", 1) + try: + step = int(step_text) + except ValueError: + raise ValueError(f"invalid cron step {step_text!r}") from None + if step < 1: + raise 
ValueError(f"cron step must be >= 1, got {step}") + + if part == "*": + start, end = lo, hi + elif "-" in part: + a, b = part.split("-", 1) + start = _parse_value(a, lo, hi, names) + end = _parse_value(b, lo, hi, names) + if end < start: + raise ValueError(f"inverted cron range {part!r}") + else: + start = end = _parse_value(part, lo, hi, names) + + result.update(range(start, end + 1, step)) + return result + + +class CronSchedule: + """ + Parsed 5-field cron expression with ``next_after()`` evaluation. + + Usage:: + + cron = CronSchedule("0 19 * * mon,thu") + run_at = cron.next_after(datetime.now()) + """ + + def __init__(self, expression: str): + self.expression = expression.strip() + fields = self.expression.split() + if len(fields) != 5: + raise ValueError( + f"cron expression needs 5 fields " + f"(minute hour dom month dow), got {len(fields)}: " + f"{expression!r}") + + minute, hour, dom, month, dow = fields + self._minutes = _parse_field(minute, 0, 59) + self._hours = _parse_field(hour, 0, 23) + self._dom = _parse_field(dom, 1, 31) + self._months = _parse_field(month, 1, 12, _MONTH_NAMES) + dow_values = _parse_field(dow, 0, 7, _DAY_NAMES) + # 7 is an alias for Sunday (= 0) + self._dow = {0 if v == 7 else v for v in dow_values} + + # Vixie-cron rule: dom/dow are OR-combined when both are restricted. 
+ self._dom_restricted = dom != "*" + self._dow_restricted = dow != "*" + + def __repr__(self) -> str: + return f"CronSchedule({self.expression!r})" + + # ------------------------------------------------------------------ + def _day_matches(self, day: "datetime.date") -> bool: + if day.month not in self._months: + return False + dom_ok = day.day in self._dom + # Python: Monday=0 … Sunday=6 -> cron: Sunday=0 … Saturday=6 + dow_ok = ((day.weekday() + 1) % 7) in self._dow + if self._dom_restricted and self._dow_restricted: + return dom_ok or dow_ok + if self._dom_restricted: + return dom_ok + if self._dow_restricted: + return dow_ok + return True + + def next_after(self, dt: datetime) -> datetime: + """ + Returns the first matching time strictly after ``dt`` + (second/microsecond precision is dropped). + """ + cand = (dt + timedelta(minutes=1)).replace(second=0, microsecond=0) + hours = sorted(self._hours) + minutes = sorted(self._minutes) + + # Walk day by day (covers rare dom/month combos like Feb 29). + for _ in range(366 * 5): + if self._day_matches(cand.date()): + for h in hours: + if h < cand.hour: + continue + for m in minutes: + if h == cand.hour and m < cand.minute: + continue + return cand.replace(hour=h, minute=m) + cand = (cand + timedelta(days=1)).replace(hour=0, minute=0) + + raise ValueError( + f"cron {self.expression!r}: no occurrence within 5 years") diff --git a/src/KavitaVolumeCoverUpdater.py b/src/KavitaVolumeCoverUpdater.py new file mode 100644 index 0000000..10828bf --- /dev/null +++ b/src/KavitaVolumeCoverUpdater.py @@ -0,0 +1,536 @@ +""" +kavita_volume_cover_updater.py +============================== + +Periodically re-checks chapters already moved to the Kavita library whose +volume could not be resolved at move time (``"volume": null`` in the +series' ``chapter_index.json``). + +When MangaDex has since assigned the chapter to a volume, the updater: + + 1. writes the volume into ``chapter_index.json``, + 2. 
updates ``<Volume>`` inside the chapter's ComicInfo.xml (in-archive), + 3. downloads the MangaBaka volume cover and swaps it in for the + placeholder ``000.<ext>`` series cover, and + 4. refreshes the *first* chapter's ComicInfo.xml with full metadata — + Kavita can be configured to take series metadata from the lowest + chapter, so it must reflect the latest state. + +Host-IO policy +-------------- +* Per series only ``chapter_index.json`` is read (no archive is opened to + discover its contents). +* Series without null-volume chapters are skipped before any API call. +* An archive is read+rewritten exactly once per update (single pass, + written to a ``.tmp`` file, then atomically replaced). + +Every updated chapter is appended to a log file (one line per update). + +Reused components +----------------- +* ``SuwayomiMover`` — chapter index helpers, dirname sanitizer +* ``ComicInfoBuilder`` — metadata fetch (matches-cache ID lookup), + chapter→volume resolution, XML build +* ``MangaBakaWorksResolver`` — volume covers (/images with /works fallback) +* ``MangaDexVolumeResolver`` — chapter→volume aggregate (shared cache) +* ``MangaBakaRateLimit`` — process-wide API throttle + +Dependencies +------------ + requests -> pip install requests + Pillow -> pip install pillow (optional, page-0 dimensions) +""" + +from __future__ import annotations + +import io +import threading +import xml.etree.ElementTree as ET +import zipfile +from datetime import datetime +from pathlib import Path + +import requests + +from ComicInfoBuilder import (ComicInfoBuilder, _guess_extension, _IMAGE_EXTS) +from MangadexVolumeResolver import MangaDexVolumeResolver +from MangaBakaWorksResolver import MangaBakaWorksResolver +from MALResolver import MALResolver +from AniListResolver import AniListResolver +from MatchesCache import MatchesCache +from SuwayomiMover import (_load_chapter_index, _save_chapter_index, + _sanitize_dirname, _normalise_volume_value) +from MangaBakaRateLimit import apply_to_session as
_apply_mangabaka_rate_limit +from CronSchedule import CronSchedule + +try: + from PIL import Image + _HAS_PIL = True +except ImportError: + _HAS_PIL = False + + +def _now() -> str: + return datetime.now().isoformat(timespec="seconds") + + +def _image_dims_from_bytes(data: bytes) -> tuple: + """Returns (width, height) of an image byte blob, or (None, None).""" + if not _HAS_PIL: + return (None, None) + try: + with Image.open(io.BytesIO(data)) as im: + return im.size + except Exception: + return (None, None) + + +def _chapter_sort_value(num: str) -> float: + try: + return float(num) + except (TypeError, ValueError): + return float("inf") + + +def _update_page0_attrs(pages_el: "ET.Element", cover_bytes: bytes) -> None: + """Refreshes size/dimension attributes of the FrontCover page entry.""" + for page in pages_el: + if page.get("Image") == "0": + page.set("ImageSize", str(len(cover_bytes))) + width, height = _image_dims_from_bytes(cover_bytes) + if width and height: + page.set("ImageWidth", str(width)) + page.set("ImageHeight", str(height)) + return + + +def _serialize_tree(root: "ET.Element") -> str: + tree = ET.ElementTree(root) + try: + ET.indent(tree, space=" ") + except AttributeError: + pass + return ('\n' + + ET.tostring(root, encoding="unicode")) + + +class KavitaVolumeCoverUpdater: + """ + Scans the Kavita library for chapters whose volume was unknown at move + time and back-fills volume + volume cover once MangaDex / MangaBaka + provide the data. Runs periodically on a background thread. + + Parameters + ---------- + kavita_path : Root of the Kavita library (series folders inside). + matches_cache : MatchesCache — provides the MangaBaka series ID per + series (mandatory; folders without a match are skipped). + language : ComicInfo language (passed to ComicInfoBuilder). + request_timeout : HTTP timeout in seconds. + log_path : File that receives one line per updated chapter. 
+ Default: <kavita_path>/volume_updater.log + schedule : Cron expression (5 fields) defining when scans run, + e.g. "0 19 * * 1,4" = 19:00 every Monday and + Thursday. Evaluated in local time — set the TZ env + var inside Docker. Default: "0 19 * * 1,4". + """ + + def __init__(self, + kavita_path, + *, + matches_cache: MatchesCache, + language: str = "en", + request_timeout: int = 30, + api_base_url: str = "https://api.mangabaka.dev/v1", + log_path=None, + schedule: str = "0 19 * * 1,4"): + self._dst = Path(kavita_path) + self._matches_cache = matches_cache + self._language = language + self._timeout = request_timeout + self._api_base_url = api_base_url.rstrip("/") + self._log_path = (Path(log_path) if log_path + else self._dst / "volume_updater.log") + self._cron = CronSchedule(schedule) + + session = requests.Session() + session.headers.setdefault("User-Agent", "KavitaVolumeCoverUpdater/1.0") + _apply_mangabaka_rate_limit(session) + self._session = session + + self._mal = MALResolver(request_timeout=request_timeout) + self._al = AniListResolver(request_timeout=request_timeout) + self._vol_resolver = MangaDexVolumeResolver( + request_timeout=request_timeout, session=session) + self._works_resolver = MangaBakaWorksResolver( + api_base_url=api_base_url, + request_timeout=request_timeout, session=session) + + self._stop = threading.Event() + self._thread: "threading.Thread | None" = None + + # ------------------------------------------------------------------ + # Cron API (mirrors SuwayomiFolderWatcher) + # ------------------------------------------------------------------ + def start(self) -> None: + """Starts the periodic scan thread.
Non-blocking.""" + if self._thread is not None and self._thread.is_alive(): + return + self._stop.clear() + self._thread = threading.Thread( + target=self._loop, name="KavitaVolumeCoverUpdater", daemon=True) + self._thread.start() + print(f"[{_now()}] [updater] scanning {self._dst} " + f"on cron '{self._cron.expression}'", flush=True) + + def stop(self) -> None: + """Stops the scan thread (current scan finishes its series first).""" + self._stop.set() + if self._thread is not None: + self._thread.join(timeout=10) + + def wait(self) -> None: + """Blocks the calling thread until stop() is invoked.""" + self._stop.wait() + + def _loop(self) -> None: + while not self._stop.is_set(): + next_run = self._cron.next_after(datetime.now()) + wait = max(0.0, (next_run - datetime.now()).total_seconds()) + print(f"[{_now()}] [updater] next scheduled scan: " + f"{next_run.isoformat(timespec='minutes')}", flush=True) + if self._stop.wait(wait): + break + + try: + summary = self.update_all() + print(f"[{_now()}] [updater] scan done: " + f"{summary['series_updated']} series / " + f"{summary['chapters_updated']} chapters updated", + flush=True) + except Exception as exc: + print(f"[{_now()}] [updater] scan ERROR: {exc}", flush=True) + + # ------------------------------------------------------------------ + # Public scan API + # ------------------------------------------------------------------ + def update_all(self) -> dict: + """ + Scans every series folder under the Kavita root once. + Returns {"series_scanned": n, "series_updated": n, "chapters_updated": n}. 
+ """ + summary = {"series_scanned": 0, "series_updated": 0, + "chapters_updated": 0} + if not self._dst.is_dir(): + print(f"[updater] kavita path missing: {self._dst}", flush=True) + return summary + + for series_dir in sorted(self._dst.iterdir()): + if self._stop.is_set(): + break + if not series_dir.is_dir(): + continue + summary["series_scanned"] += 1 + try: + updated = self.update_series(series_dir) + except Exception as exc: + print(f"[updater] {series_dir.name}: ERROR {exc}", flush=True) + continue + if updated: + summary["series_updated"] += 1 + summary["chapters_updated"] += updated + return summary + + def update_series(self, series_dir: Path) -> int: + """ + Updates one series folder. Returns the number of updated chapters. + + Only chapters listed in chapter_index.json with ``"volume": null`` + are candidates; everything else costs no further host reads. + """ + index = _load_chapter_index(series_dir) + chapters: dict = index["chapter"] + if not chapters: + return 0 + + missing = [num for num, e in chapters.items() + if isinstance(e, dict) and e.get("volume") is None] + if not missing: + return 0 + + match_key, match = self._find_match_for_folder(series_dir.name) + if not match or not match.get("mangabakaId"): + print(f"[updater] {series_dir.name}: no matches.json entry — skip", + flush=True) + return 0 + + # Builder resolves metadata via the cached MangaBaka ID and gives us + # the exact same chapter→volume logic the mover uses. + builder = ComicInfoBuilder( + match_key, chapter=missing[0], + api_base_url=self._api_base_url, + language=self._language, + request_timeout=self._timeout, + session=self._session, + volume_resolver=self._vol_resolver, + works_resolver=self._works_resolver, + mal_resolver=self._mal, + al_resolver=self._al, + matches_cache=self._matches_cache, + ) + md = builder.fetch_metadata() + series_id = str(md.get("id") or "") + + # Resolve volumes for all null-volume chapters first (API only). 
+ updates: dict[str, dict] = {} # num -> {"volume": str, "cover": tuple|None} + for num in sorted(missing, key=_chapter_sort_value): + builder.chapter = num + try: + volume = builder._determine_volume() + except Exception: + volume = None + if not volume: + continue + updates[num] = {"volume": volume, + "cover": self._fetch_cover(series_id, volume)} + + if not updates: + return 0 + + first = min(chapters, key=_chapter_sort_value) + updated = 0 + + for num, up in updates.items(): + entry = chapters[num] + cbz = series_dir / (entry.get("archiveName") or "") + if not entry.get("archiveName") or not cbz.is_file(): + print(f"[updater] {series_dir.name} ch.{num}: archive missing " + f"({entry.get('archiveName')!r}) — skip", flush=True) + continue + # The first chapter gets a full metadata rebuild (Kavita reads + # series metadata from it); other chapters only a volume edit. + ok, cover_swapped = self._apply_update( + cbz, builder, num, + volume=up["volume"], cover=up["cover"], + full_rebuild=(num == first)) + if not ok: + continue + entry["volume"] = _normalise_volume_value(up["volume"]) + updated += 1 + self._log(f"{series_dir.name} | chapter {num} -> volume " + f"{up['volume']} | cover " + f"{'replaced' if cover_swapped else 'kept'} | {cbz.name}") + + # Refresh the first chapter's metadata when any other chapter changed + # (skip when it was already fully rebuilt in the loop above). 
+ if updated and first not in updates: + first_entry = chapters.get(first) or {} + cbz = series_dir / (first_entry.get("archiveName") or "") + if first_entry.get("archiveName") and cbz.is_file(): + ok, _ = self._apply_update( + cbz, builder, first, + volume=None, cover=None, full_rebuild=True) + if ok: + self._log(f"{series_dir.name} | chapter {first} | " + f"first-chapter metadata refreshed | {cbz.name}") + + if updated: + _save_chapter_index(series_dir, index) + return updated + + # ------------------------------------------------------------------ + # Matching Kavita folder -> matches.json entry + # ------------------------------------------------------------------ + def _find_match_for_folder(self, folder_name: str) -> tuple: + """ + Maps a Kavita series folder back to its matches.json entry. + + The folder was created as ``_sanitize_dirname(mangabaka_title)``, so + the comparison sanitizes each entry's mangabakaName the same way. + Falls back to the folderTitle (Suwayomi name) for robustness. + Returns (match_key, entry) or (None, None). + """ + target = folder_name.strip().casefold() + matches = self._matches_cache.all()["matches"] + for key, entry in matches.items(): + name = entry.get("mangabakaName") or "" + if name and _sanitize_dirname(name).strip().casefold() == target: + return key, entry + for key, entry in matches.items(): + folder = entry.get("folderTitle") or key + if _sanitize_dirname(folder).strip().casefold() == target: + return key, entry + return None, None + + # ------------------------------------------------------------------ + # Cover download + # ------------------------------------------------------------------ + def _fetch_cover(self, series_id: str, volume) -> "tuple[str, bytes] | None": + """ + Downloads the MangaBaka volume cover. + Returns ("000<ext>", bytes) or None when no cover is available.
+ """ + try: + url = self._works_resolver.get_cover_for_volume(series_id, volume) + except Exception: + url = None + if not url: + return None + try: + resp = self._session.get(url, timeout=self._timeout) + resp.raise_for_status() + except requests.RequestException: + return None + ext = _guess_extension(url, resp.headers.get("Content-Type", "")) + return (f"000{ext}", resp.content) + + # ------------------------------------------------------------------ + # Archive update (single read + single write per archive) + # ------------------------------------------------------------------ + def _apply_update(self, cbz_path: Path, builder: ComicInfoBuilder, + chapter_num: str, *, + volume, cover, full_rebuild: bool) -> tuple: + """ + Rewrites one CBZ archive with an updated ComicInfo.xml and (when + provided and a placeholder exists) a new cover image. + + Returns (ok, cover_swapped). + """ + try: + with zipfile.ZipFile(cbz_path, "r") as zin: + try: + old_xml = zin.read("ComicInfo.xml") + except KeyError: + old_xml = None + + if full_rebuild or old_xml is None: + new_xml = self._build_full_xml( + builder, chapter_num, old_xml, cover) + else: + new_xml = self._edit_volume_xml(old_xml, volume, cover) + if new_xml is None: # parse error -> full rebuild + new_xml = self._build_full_xml( + builder, chapter_num, None, cover) + + infos = zin.infolist() + # Cover is only ever *replaced*: inserting one would shift + # every image index in the existing XML. 
+ has_placeholder = any( + Path(i.filename).stem == "000" + and Path(i.filename).suffix.lower() in _IMAGE_EXTS + for i in infos) + swap_cover = cover is not None and has_placeholder + + tmp = cbz_path.with_suffix(cbz_path.suffix + ".tmp") + wrote_xml = False + with zipfile.ZipFile(tmp, "w", zipfile.ZIP_STORED) as zout: + for info in infos: + p = Path(info.filename) + if (swap_cover and p.stem == "000" + and p.suffix.lower() in _IMAGE_EXTS): + zout.writestr(cover[0], cover[1]) + elif info.filename == "ComicInfo.xml": + zout.writestr("ComicInfo.xml", new_xml) + wrote_xml = True + else: + zout.writestr(info, zin.read(info.filename)) + if not wrote_xml: + zout.writestr("ComicInfo.xml", new_xml) + tmp.replace(cbz_path) + return True, swap_cover + except Exception as exc: + print(f"[updater] {cbz_path.name}: update failed: {exc}", + flush=True) + return False, False + + # ------------------------------------------------------------------ + # XML builders + # ------------------------------------------------------------------ + def _edit_volume_xml(self, old_xml: bytes, volume, + cover) -> "str | None": + """ + Sets <Volume> in an existing ComicInfo.xml and refreshes the + FrontCover page attributes when the cover gets replaced. + Returns None when the XML is unparseable. + """ + try: + root = ET.fromstring(old_xml) + except ET.ParseError: + return None + + el = root.find("Volume") + if el is None: + el = ET.SubElement(root, "Volume") + el.text = str(volume) + + if cover is not None: + pages = root.find("Pages") + if pages is not None: + _update_page0_attrs(pages, cover[1]) + + return _serialize_tree(root) + + def _build_full_xml(self, builder: ComicInfoBuilder, chapter_num: str, + old_xml: "bytes | None", cover) -> str: + """ + Rebuilds the complete ComicInfo.xml via ComicInfoBuilder (fresh + MangaBaka/MAL metadata). Suwayomi-derived fields and the <Pages> + section are carried over from the previous XML.
+ """ + builder.chapter = chapter_num # also clears builder page state + builder._suwayomi_data = ( + ComicInfoBuilder.read_comicinfo_fields(old_xml) if old_xml else {}) + root = builder._build_tree().getroot() + + if old_xml: + try: + old_root = ET.fromstring(old_xml) + except ET.ParseError: + old_root = None + if old_root is not None: + pages = old_root.find("Pages") + if pages is not None and cover is not None: + _update_page0_attrs(pages, cover[1]) + page_count = old_root.find("PageCount") + if page_count is not None: + root.append(page_count) + if pages is not None: + root.append(pages) + + return _serialize_tree(root) + + # ------------------------------------------------------------------ + # Logging + # ------------------------------------------------------------------ + def _log(self, msg: str) -> None: + line = f"[{_now()}] {msg}" + print(f"[updater] {msg}", flush=True) + try: + self._log_path.parent.mkdir(parents=True, exist_ok=True) + with self._log_path.open("a", encoding="utf-8") as f: + f.write(line + "\n") + except OSError as exc: + print(f"[updater] cannot write log file {self._log_path}: {exc}", + flush=True) + + +# -------------------------------------------------------------------------- +# Usage example +# -------------------------------------------------------------------------- +if __name__ == "__main__": + # Local (no-Docker) smoke test. Adjust paths to your environment. + KAVITA_PATH = r"\\192.168.2.2\root\ServerData\Kavita\test" + MATCHES_PATH = Path(__file__).resolve().parent.parent / "matches.json" + + updater = KavitaVolumeCoverUpdater( + KAVITA_PATH, + matches_cache=MatchesCache(MATCHES_PATH), + ) + + # One-shot scan (no cron thread): + summary = updater.update_all() + print(f"\n[updater] {summary}") + + # Or run on the cron schedule (default: 19:00 every Mon + Thu): + # updater.start() + # updater.wait()