From 4557137ad0cb5779d680de752a590fc8f05bd1c6 Mon Sep 17 00:00:00 2001
From: JohannesBOT <maxb12032005@gmail.com>
Date: Wed, 10 Jun 2026 13:09:01 +0200
Subject: [PATCH] feat(updater): add KavitaVolumeCoverUpdater for back-filling
 null volumes

Introduce a new background service that periodically re-checks chapters
whose volume could not be resolved at move time.

- Add KavitaVolumeCoverUpdater.py to resolve null volumes via MangaDex,
  update ComicInfo.xml in-archive, and swap in MangaBaka volume covers
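- Wire updater into main.py entry point (UPDATER_ENABLED, UPDATER_SCHEDULE
  and UPDATER_LOG env vars)
- Add UPDATER_ENABLED, UPDATER_SCHEDULE, UPDATER_LOG and TZ env vars to
  docker-compose.prod.yml
- Add CronSchedule.py, a dependency-free 5-field cron parser that
  schedules the updater runs
- Extract ComicInfoBuilder.read_comicinfo_fields() so ComicInfo.xml can be
  parsed from bytes read directly out of a CBZ archive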
---
 docker-compose.prod.yml         |   8 +
 main.py                         |  26 ++
 src/ComicInfoBuilder.py         |  22 +-
 src/CronSchedule.py             | 159 ++++++++++
 src/KavitaVolumeCoverUpdater.py | 536 ++++++++++++++++++++++++++++++++
 5 files changed, 746 insertions(+), 5 deletions(-)
 create mode 100644 src/CronSchedule.py
 create mode 100644 src/KavitaVolumeCoverUpdater.py

diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index a61aebf..b97f467 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -11,6 +11,14 @@ services:
       DELETE_SOURCE:  "${DELETE_SOURCE:-true}"
       MATCH_PATH:     "${MATCH_PATH:-/config/matches.json}"
       WEB_PORT:       "${WEB_PORT:-8080}"
+      # Volume/cover back-fill updater
+      UPDATER_ENABLED:  "${UPDATER_ENABLED:-true}"
+      # Cron expression: "0 19 * * 1,4" = 19:00 every Monday and Thursday
+      # (local time, see TZ)
+      UPDATER_SCHEDULE: "${UPDATER_SCHEDULE:-0 19 * * 1,4}"
+      UPDATER_LOG:      "${UPDATER_LOG:-/config/volume_updater.log}"
+      # Timezone for the cron schedule — without this 19:00 means 19:00 UTC
+      TZ:               "${TZ:-Europe/Berlin}"
     ports:
       - "${WEB_PORT:-8080}:${WEB_PORT:-8080}"
     volumes:
diff --git a/main.py b/main.py
index 87c782e..6a6a0ca 100644
--- a/main.py
+++ b/main.py
@@ -27,6 +27,11 @@ Environment variables
     MATCH_PATH          default /config/matches.json
     WEB_PORT            default 8080  (Flask web UI for matches.json)
     WEB_HOST            default 0.0.0.0
+    UPDATER_ENABLED     default true  (volume/cover back-fill cron)
+    UPDATER_SCHEDULE    cron expression for the updater scans,
+                        default "0 19 * * 1,4" = 19:00 every Mon + Thu
+                        (local time — set TZ inside the container!)
+    UPDATER_LOG         default /config/volume_updater.log
 """
 
 from __future__ import annotations
@@ -43,6 +48,7 @@ from src.SuwayomiMover import SuwayomiMover                       # noqa: E402
 from src.SuwayomiFolderWatcher import SuwayomiFolderWatcher       # noqa: E402
 from src.MatchesCache import MatchesCache                          # noqa: E402
 from src.MatchesWebApp import MatchesWebApp                        # noqa: E402
+from src.KavitaVolumeCoverUpdater import KavitaVolumeCoverUpdater  # noqa: E402
 
 
 def _env_str(name: str, default: "str | None" = None,
@@ -85,6 +91,9 @@ def main() -> int:
     match_path      = _env_str("MATCH_PATH", "/config/matches.json")
     web_host        = _env_str("WEB_HOST", "0.0.0.0") or "0.0.0.0"
     web_port        = _env_int("WEB_PORT", 8080)
+    updater_enabled  = _env_bool("UPDATER_ENABLED", True)
+    updater_schedule = _env_str("UPDATER_SCHEDULE", "0 19 * * 1,4")
+    updater_log      = _env_str("UPDATER_LOG", "/config/volume_updater.log")
 
     print(f"[main] suwayomi  = {suwayomi_path}",  flush=True)
     print(f"[main] kavita    = {kavita_path}",    flush=True)
@@ -112,6 +121,23 @@ def main() -> int:
     web_app = MatchesWebApp(matches_cache, mover=mover, host=web_host, port=web_port)
     web_app.start()
 
+    if updater_enabled:
+        try:
+            updater = KavitaVolumeCoverUpdater(
+                kavita_path,
+                matches_cache=matches_cache,
+                language=language,
+                request_timeout=request_timeout,
+                log_path=updater_log,
+                schedule=updater_schedule,
+            )
+            updater.start()
+        except ValueError as exc:
+            # Invalid cron expression — keep the service up, just without
+            # the updater, and make the config error obvious in the logs.
+            print(f"[main] UPDATER_SCHEDULE invalid ({exc}); "
+                  f"volume/cover updater DISABLED", flush=True)
+
     # def shutdown(signum, _frame):
     #     print(f"[main] received signal {signum}", flush=True)
     #     watcher.stop()
diff --git a/src/ComicInfoBuilder.py b/src/ComicInfoBuilder.py
index b93e969..c073f32 100644
--- a/src/ComicInfoBuilder.py
+++ b/src/ComicInfoBuilder.py
@@ -1030,12 +1030,14 @@ class ComicInfoBuilder:
         return unique
 
     @staticmethod
-    def _read_existing_comicinfo(folder: Path) -> dict:
-        xml_path = folder / "ComicInfo.xml"
-        if not xml_path.is_file():
-            return {}
+    def read_comicinfo_fields(xml_source) -> dict:
+        """
+        Parses ComicInfo.xml content (bytes or str) and returns the fields
+        relevant as supplementary Suwayomi data.  Returns {} on parse errors.
+        Reusable for XML read directly from a CBZ archive (no extraction).
+        """
         try:
-            root = ET.parse(xml_path).getroot()
+            root = ET.fromstring(xml_source)
         except ET.ParseError:
             return {}
 
@@ -1049,6 +1051,16 @@ class ComicInfoBuilder:
                 data[tag] = child.text.strip()
         return data
 
+    @staticmethod
+    def _read_existing_comicinfo(folder: Path) -> dict:
+        xml_path = folder / "ComicInfo.xml"
+        if not xml_path.is_file():
+            return {}
+        try:
+            return ComicInfoBuilder.read_comicinfo_fields(xml_path.read_bytes())
+        except OSError:
+            return {}
+
     @staticmethod
     def _image_dimensions(path: Path):
         if not _HAS_PIL:
diff --git a/src/CronSchedule.py b/src/CronSchedule.py
new file mode 100644
index 0000000..f30a86c
--- /dev/null
+++ b/src/CronSchedule.py
@@ -0,0 +1,159 @@
+"""
+cron_schedule.py
+================
+
+Minimal cron-expression parser — no external dependency.
+
+Supports the classic 5-field syntax::
+
+    ┌──────── minute        (0-59)
+    │ ┌────── hour          (0-23)
+    │ │ ┌──── day of month  (1-31)
+    │ │ │ ┌── month         (1-12 or jan-dec)
+    │ │ │ │ ┌ day of week   (0-7 or sun-sat; 0 and 7 = Sunday)
+    │ │ │ │ │
+    0 19 * * 1,4    ->  19:00 every Monday and Thursday
+
+Field syntax: ``*``, single values, ranges (``a-b``), steps (``*/n``,
+``a-b/n``) and comma lists.  Month / weekday names (``jan``, ``mon``, …)
+are accepted case-insensitively.
+
+When *both* day-of-month and day-of-week are restricted (anything other
+than a bare ``*``) the job runs when **either** matches, as in Vixie cron.
+
+Times are evaluated against the local system clock (``datetime.now()``) —
+in Docker set the ``TZ`` environment variable so "19:00" means local time.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime, timedelta
+
+
+_MONTH_NAMES = {"jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
+                "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12}
+_DAY_NAMES   = {"sun": 0, "mon": 1, "tue": 2, "wed": 3, "thu": 4,
+                "fri": 5, "sat": 6}
+
+
+def _parse_value(token: str, lo: int, hi: int,
+                 names: "dict[str, int] | None") -> int:
+    token = token.strip().lower()
+    if names and token in names:
+        return names[token]
+    try:
+        value = int(token)
+    except ValueError:
+        raise ValueError(f"invalid cron value {token!r}") from None
+    if not (lo <= value <= hi):
+        raise ValueError(f"cron value {value} out of range {lo}-{hi}")
+    return value
+
+
+def _parse_field(field: str, lo: int, hi: int,
+                 names: "dict[str, int] | None" = None) -> "set[int]":
+    """Parses one cron field into the set of matching integer values."""
+    result: set[int] = set()
+    for part in field.split(","):
+        part = part.strip()
+        if not part:
+            raise ValueError(f"empty element in cron field {field!r}")
+
+        step = 1
+        if "/" in part:
+            part, step_text = part.split("/", 1)
+            try:
+                step = int(step_text)
+            except ValueError:
+                raise ValueError(f"invalid cron step {step_text!r}") from None
+            if step < 1:
+                raise ValueError(f"cron step must be >= 1, got {step}")
+
+        if part == "*":
+            start, end = lo, hi
+        elif "-" in part:
+            a, b = part.split("-", 1)
+            start = _parse_value(a, lo, hi, names)
+            end   = _parse_value(b, lo, hi, names)
+            if end < start:
+                raise ValueError(f"inverted cron range {part!r}")
+        else:
+            start = end = _parse_value(part, lo, hi, names)
+
+        result.update(range(start, end + 1, step))
+    return result
+
+
+class CronSchedule:
+    """
+    Parsed 5-field cron expression with ``next_after()`` evaluation.
+
+    Usage::
+
+        cron = CronSchedule("0 19 * * mon,thu")
+        run_at = cron.next_after(datetime.now())
+    """
+
+    def __init__(self, expression: str):
+        self.expression = expression.strip()
+        fields = self.expression.split()
+        if len(fields) != 5:
+            raise ValueError(
+                f"cron expression needs 5 fields "
+                f"(minute hour dom month dow), got {len(fields)}: "
+                f"{expression!r}")
+
+        minute, hour, dom, month, dow = fields
+        self._minutes = _parse_field(minute, 0, 59)
+        self._hours   = _parse_field(hour,   0, 23)
+        self._dom     = _parse_field(dom,    1, 31)
+        self._months  = _parse_field(month,  1, 12, _MONTH_NAMES)
+        dow_values    = _parse_field(dow,    0, 7,  _DAY_NAMES)
+        # 7 is an alias for Sunday (= 0)
+        self._dow = {0 if v == 7 else v for v in dow_values}
+
+        # Vixie-cron rule: dom/dow are OR-combined when both are restricted.
+        self._dom_restricted = dom != "*"
+        self._dow_restricted = dow != "*"
+
+    def __repr__(self) -> str:
+        return f"CronSchedule({self.expression!r})"
+
+    # ------------------------------------------------------------------
+    def _day_matches(self, day: "datetime.date") -> bool:
+        if day.month not in self._months:
+            return False
+        dom_ok = day.day in self._dom
+        # Python: Monday=0 … Sunday=6  ->  cron: Sunday=0 … Saturday=6
+        dow_ok = ((day.weekday() + 1) % 7) in self._dow
+        if self._dom_restricted and self._dow_restricted:
+            return dom_ok or dow_ok
+        if self._dom_restricted:
+            return dom_ok
+        if self._dow_restricted:
+            return dow_ok
+        return True
+
+    def next_after(self, dt: datetime) -> datetime:
+        """
+        Returns the first matching time strictly after ``dt``
+        (second/microsecond precision is dropped).
+        """
+        cand = (dt + timedelta(minutes=1)).replace(second=0, microsecond=0)
+        hours   = sorted(self._hours)
+        minutes = sorted(self._minutes)
+
+        # Walk day by day (covers rare dom/month combos like Feb 29).
+        for _ in range(366 * 5):
+            if self._day_matches(cand.date()):
+                for h in hours:
+                    if h < cand.hour:
+                        continue
+                    for m in minutes:
+                        if h == cand.hour and m < cand.minute:
+                            continue
+                        return cand.replace(hour=h, minute=m)
+            cand = (cand + timedelta(days=1)).replace(hour=0, minute=0)
+
+        raise ValueError(
+            f"cron {self.expression!r}: no occurrence within 5 years")
diff --git a/src/KavitaVolumeCoverUpdater.py b/src/KavitaVolumeCoverUpdater.py
new file mode 100644
index 0000000..10828bf
--- /dev/null
+++ b/src/KavitaVolumeCoverUpdater.py
@@ -0,0 +1,536 @@
+"""
+kavita_volume_cover_updater.py
+==============================
+
+Periodically re-checks chapters already moved to the Kavita library whose
+volume could not be resolved at move time (``"volume": null`` in the
+series' ``chapter_index.json``).
+
+When MangaDex has since assigned the chapter to a volume, the updater:
+
+  1. writes the volume into ``chapter_index.json``,
+  2. updates ``<Volume>`` inside the chapter's ComicInfo.xml (in-archive),
+  3. downloads the MangaBaka volume cover and swaps it in for the
+     placeholder ``000.<ext>`` series cover, and
+  4. refreshes the *first* chapter's ComicInfo.xml with full metadata —
+     Kavita can be configured to take series metadata from the lowest
+     chapter, so it must reflect the latest state.
+
+Host-IO policy
+--------------
+* Per series only ``chapter_index.json`` is read (no archive is opened to
+  discover its contents).
+* Series without null-volume chapters are skipped before any API call.
+* An archive is read+rewritten exactly once per update (single pass,
+  written to a ``.tmp`` file, then atomically replaced).
+
+Every updated chapter is appended to a log file (one line per update).
+
+Reused components
+-----------------
+* ``SuwayomiMover``            — chapter index helpers, dirname sanitizer
+* ``ComicInfoBuilder``         — metadata fetch (matches-cache ID lookup),
+                                 chapter→volume resolution, XML build
+* ``MangaBakaWorksResolver``   — volume covers (/images with /works fallback)
+* ``MangaDexVolumeResolver``   — chapter→volume aggregate (shared cache)
+* ``MangaBakaRateLimit``       — process-wide API throttle
+
+Dependencies
+------------
+    requests    ->  pip install requests
+    Pillow      ->  pip install pillow   (optional, page-0 dimensions)
+"""
+
+from __future__ import annotations
+
+import io
+import threading
+import xml.etree.ElementTree as ET
+import zipfile
+from datetime import datetime
+from pathlib import Path
+
+import requests
+
+from ComicInfoBuilder import (ComicInfoBuilder, _guess_extension, _IMAGE_EXTS)
+from MangadexVolumeResolver import MangaDexVolumeResolver
+from MangaBakaWorksResolver import MangaBakaWorksResolver
+from MALResolver import MALResolver
+from AniListResolver import AniListResolver
+from MatchesCache import MatchesCache
+from SuwayomiMover import (_load_chapter_index, _save_chapter_index,
+                           _sanitize_dirname, _normalise_volume_value)
+from MangaBakaRateLimit import apply_to_session as _apply_mangabaka_rate_limit
+from CronSchedule import CronSchedule
+
+try:
+    from PIL import Image
+    _HAS_PIL = True
+except ImportError:
+    _HAS_PIL = False
+
+
+def _now() -> str:
+    return datetime.now().isoformat(timespec="seconds")
+
+
+def _image_dims_from_bytes(data: bytes) -> tuple:
+    """Returns (width, height) of an image byte blob, or (None, None)."""
+    if not _HAS_PIL:
+        return (None, None)
+    try:
+        with Image.open(io.BytesIO(data)) as im:
+            return im.size
+    except Exception:
+        return (None, None)
+
+
+def _chapter_sort_value(num: str) -> float:
+    try:
+        return float(num)
+    except (TypeError, ValueError):
+        return float("inf")
+
+
+def _update_page0_attrs(pages_el: "ET.Element", cover_bytes: bytes) -> None:
+    """Refreshes size/dimension attributes of the FrontCover page entry."""
+    for page in pages_el:
+        if page.get("Image") == "0":
+            page.set("ImageSize", str(len(cover_bytes)))
+            width, height = _image_dims_from_bytes(cover_bytes)
+            if width and height:
+                page.set("ImageWidth", str(width))
+                page.set("ImageHeight", str(height))
+            return
+
+
+def _serialize_tree(root: "ET.Element") -> str:
+    tree = ET.ElementTree(root)
+    try:
+        ET.indent(tree, space="  ")
+    except AttributeError:
+        pass
+    return ('<?xml version="1.0" encoding="UTF-8"?>\n'
+            + ET.tostring(root, encoding="unicode"))
+
+
+class KavitaVolumeCoverUpdater:
+    """
+    Scans the Kavita library for chapters whose volume was unknown at move
+    time and back-fills volume + volume cover once MangaDex / MangaBaka
+    provide the data.  Runs periodically on a background thread.
+
+    Parameters
+    ----------
+    kavita_path      : Root of the Kavita library (series folders inside).
+    matches_cache    : MatchesCache — provides the MangaBaka series ID per
+                       series (mandatory; folders without a match are skipped).
+    language         : ComicInfo language (passed to ComicInfoBuilder).
+    request_timeout  : HTTP timeout in seconds.
+    log_path         : File that receives one line per updated chapter.
+                       Default: <kavita_path>/volume_updater.log
+    schedule         : Cron expression (5 fields) defining when scans run,
+                       e.g. "0 19 * * 1,4" = 19:00 every Monday and
+                       Thursday.  Evaluated in local time — set the TZ env
+                       var inside Docker.  Default: "0 19 * * 1,4".
+    """
+
+    def __init__(self,
+                 kavita_path,
+                 *,
+                 matches_cache: MatchesCache,
+                 language: str = "en",
+                 request_timeout: int = 30,
+                 api_base_url: str = "https://api.mangabaka.dev/v1",
+                 log_path=None,
+                 schedule: str = "0 19 * * 1,4"):
+        self._dst = Path(kavita_path)
+        self._matches_cache = matches_cache
+        self._language = language
+        self._timeout = request_timeout
+        self._api_base_url = api_base_url.rstrip("/")
+        self._log_path = Path(log_path or self._dst / "volume_updater.log")
+        self._cron = CronSchedule(schedule)
+        # Fail fast (ValueError) if the schedule never fires, e.g. Feb 30.
+        self._cron.next_after(datetime.now())
+
+        self._session = session = requests.Session()
+        # Session pre-sets User-Agent, so setdefault() would be a no-op.
+        session.headers["User-Agent"] = "KavitaVolumeCoverUpdater/1.0"
+        _apply_mangabaka_rate_limit(session)
+
+        self._mal = MALResolver(request_timeout=request_timeout)
+        self._al  = AniListResolver(request_timeout=request_timeout)
+        self._vol_resolver = MangaDexVolumeResolver(
+            request_timeout=request_timeout, session=session)
+        self._works_resolver = MangaBakaWorksResolver(
+            api_base_url=self._api_base_url,
+            request_timeout=request_timeout, session=session)
+        self._stop = threading.Event()
+        self._thread: "threading.Thread | None" = None
+
+    # ------------------------------------------------------------------
+    # Cron API (mirrors SuwayomiFolderWatcher)
+    # ------------------------------------------------------------------
+    def start(self) -> None:
+        """Starts the periodic scan thread.  Non-blocking."""
+        if self._thread is not None and self._thread.is_alive():
+            return
+        self._stop.clear()
+        self._thread = threading.Thread(
+            target=self._loop, name="KavitaVolumeCoverUpdater", daemon=True)
+        self._thread.start()
+        print(f"[{_now()}] [updater] scanning {self._dst} "
+              f"on cron '{self._cron.expression}'", flush=True)
+
+    def stop(self) -> None:
+        """Stops the scan thread (current scan finishes its series first)."""
+        self._stop.set()
+        if self._thread is not None:
+            self._thread.join(timeout=10)
+
+    def wait(self) -> None:
+        """Blocks the calling thread until stop() is invoked."""
+        self._stop.wait()
+
+    def _loop(self) -> None:
+        while not self._stop.is_set():
+            next_run = self._cron.next_after(datetime.now())
+            wait = max(0.0, (next_run - datetime.now()).total_seconds())
+            print(f"[{_now()}] [updater] next scheduled scan: "
+                  f"{next_run.isoformat(timespec='minutes')}", flush=True)
+            if self._stop.wait(wait):
+                break
+
+            try:
+                summary = self.update_all()
+                print(f"[{_now()}] [updater] scan done: "
+                      f"{summary['series_updated']} series / "
+                      f"{summary['chapters_updated']} chapters updated",
+                      flush=True)
+            except Exception as exc:
+                print(f"[{_now()}] [updater] scan ERROR: {exc}", flush=True)
+
+    # ------------------------------------------------------------------
+    # Public scan API
+    # ------------------------------------------------------------------
+    def update_all(self) -> dict:
+        """
+        Scans every series folder under the Kavita root once.
+        Returns {"series_scanned": n, "series_updated": n, "chapters_updated": n}.
+        """
+        summary = {"series_scanned": 0, "series_updated": 0,
+                   "chapters_updated": 0}
+        if not self._dst.is_dir():
+            print(f"[updater] kavita path missing: {self._dst}", flush=True)
+            return summary
+
+        for series_dir in sorted(self._dst.iterdir()):
+            if self._stop.is_set():
+                break
+            if not series_dir.is_dir():
+                continue
+            summary["series_scanned"] += 1
+            try:
+                updated = self.update_series(series_dir)
+            except Exception as exc:
+                print(f"[updater] {series_dir.name}: ERROR {exc}", flush=True)
+                continue
+            if updated:
+                summary["series_updated"] += 1
+                summary["chapters_updated"] += updated
+        return summary
+
+    def update_series(self, series_dir: Path) -> int:
+        """
+        Updates one series folder.  Returns the number of updated chapters.
+
+        Only chapters listed in chapter_index.json with ``"volume": null``
+        are candidates; everything else costs no further host reads.
+        """
+        index = _load_chapter_index(series_dir)
+        chapters: dict = index["chapter"]
+        if not chapters:
+            return 0
+
+        missing = [num for num, e in chapters.items()
+                   if isinstance(e, dict) and e.get("volume") is None]
+        if not missing:
+            return 0
+
+        match_key, match = self._find_match_for_folder(series_dir.name)
+        if not match or not match.get("mangabakaId"):
+            print(f"[updater] {series_dir.name}: no matches.json entry — skip",
+                  flush=True)
+            return 0
+
+        # Builder resolves metadata via the cached MangaBaka ID and gives us
+        # the exact same chapter→volume logic the mover uses.
+        builder = ComicInfoBuilder(
+            match_key, chapter=missing[0],
+            api_base_url=self._api_base_url,
+            language=self._language,
+            request_timeout=self._timeout,
+            session=self._session,
+            volume_resolver=self._vol_resolver,
+            works_resolver=self._works_resolver,
+            mal_resolver=self._mal,
+            al_resolver=self._al,
+            matches_cache=self._matches_cache,
+        )
+        md = builder.fetch_metadata()
+        series_id = str(md.get("id") or "")
+
+        # Resolve volumes for all null-volume chapters first (API only).
+        updates: dict[str, dict] = {}   # num -> {"volume": str, "cover": tuple|None}
+        for num in sorted(missing, key=_chapter_sort_value):
+            builder.chapter = num
+            try:
+                volume = builder._determine_volume()
+            except Exception:
+                volume = None
+            if not volume:
+                continue
+            updates[num] = {"volume": volume,
+                            "cover": self._fetch_cover(series_id, volume)}
+
+        if not updates:
+            return 0
+
+        first = min(chapters, key=_chapter_sort_value)
+        updated = 0
+
+        for num, up in updates.items():
+            entry = chapters[num]
+            cbz = series_dir / (entry.get("archiveName") or "")
+            if not entry.get("archiveName") or not cbz.is_file():
+                print(f"[updater] {series_dir.name} ch.{num}: archive missing "
+                      f"({entry.get('archiveName')!r}) — skip", flush=True)
+                continue
+            # The first chapter gets a full metadata rebuild (Kavita reads
+            # series metadata from it); other chapters only a volume edit.
+            ok, cover_swapped = self._apply_update(
+                cbz, builder, num,
+                volume=up["volume"], cover=up["cover"],
+                full_rebuild=(num == first))
+            if not ok:
+                continue
+            entry["volume"] = _normalise_volume_value(up["volume"])
+            updated += 1
+            self._log(f"{series_dir.name} | chapter {num} -> volume "
+                      f"{up['volume']} | cover "
+                      f"{'replaced' if cover_swapped else 'kept'} | {cbz.name}")
+
+        # Refresh the first chapter's metadata when any other chapter changed
+        # (skip when it was already fully rebuilt in the loop above).
+        if updated and first not in updates:
+            first_entry = chapters.get(first) or {}
+            cbz = series_dir / (first_entry.get("archiveName") or "")
+            if first_entry.get("archiveName") and cbz.is_file():
+                ok, _ = self._apply_update(
+                    cbz, builder, first,
+                    volume=None, cover=None, full_rebuild=True)
+                if ok:
+                    self._log(f"{series_dir.name} | chapter {first} | "
+                              f"first-chapter metadata refreshed | {cbz.name}")
+
+        if updated:
+            _save_chapter_index(series_dir, index)
+        return updated
+
+    # ------------------------------------------------------------------
+    # Matching Kavita folder -> matches.json entry
+    # ------------------------------------------------------------------
+    def _find_match_for_folder(self, folder_name: str) -> tuple:
+        """
+        Maps a Kavita series folder back to its matches.json entry.
+
+        The folder was created as ``_sanitize_dirname(mangabaka_title)``, so
+        the comparison sanitizes each entry's mangabakaName the same way.
+        Falls back to the folderTitle (Suwayomi name) for robustness.
+        Returns (match_key, entry) or (None, None).
+        """
+        target = folder_name.strip().casefold()
+        matches = self._matches_cache.all()["matches"]
+        for key, entry in matches.items():
+            name = entry.get("mangabakaName") or ""
+            if name and _sanitize_dirname(name).strip().casefold() == target:
+                return key, entry
+        for key, entry in matches.items():
+            folder = entry.get("folderTitle") or key
+            if _sanitize_dirname(folder).strip().casefold() == target:
+                return key, entry
+        return None, None
+
+    # ------------------------------------------------------------------
+    # Cover download
+    # ------------------------------------------------------------------
+    def _fetch_cover(self, series_id: str, volume) -> "tuple[str, bytes] | None":
+        """
+        Downloads the MangaBaka volume cover.
+        Returns ("000<ext>", bytes) or None when no cover is available.
+        """
+        try:
+            url = self._works_resolver.get_cover_for_volume(series_id, volume)
+        except Exception:
+            url = None
+        if not url:
+            return None
+        try:
+            resp = self._session.get(url, timeout=self._timeout)
+            resp.raise_for_status()
+        except requests.RequestException:
+            return None
+        ext = _guess_extension(url, resp.headers.get("Content-Type", ""))
+        return (f"000{ext}", resp.content)
+
+    # ------------------------------------------------------------------
+    # Archive update (single read + single write per archive)
+    # ------------------------------------------------------------------
+    def _apply_update(self, cbz_path: Path, builder: ComicInfoBuilder,
+                      chapter_num: str, *,
+                      volume, cover, full_rebuild: bool) -> tuple:
+        """
+        Rewrites one CBZ archive with an updated ComicInfo.xml and (when
+        provided and a placeholder exists) a new cover image.  The result is
+        staged in ``<archive>.tmp``, which is removed again on failure.
+        Returns (ok, cover_swapped).
+        """
+        tmp = cbz_path.with_suffix(cbz_path.suffix + ".tmp")
+        try:
+            with zipfile.ZipFile(cbz_path, "r") as zin:
+                infos = zin.infolist()
+                # Cover is only ever *replaced*: inserting one would shift
+                # every <Pages> image index in the existing XML.
+                placeholders = {i.filename for i in infos
+                                if Path(i.filename).stem == "000" and
+                                Path(i.filename).suffix.lower() in _IMAGE_EXTS}
+                swap_cover = cover is not None and bool(placeholders)
+                cover = cover if swap_cover else None  # keep old page-0 attrs
+
+                try:
+                    old_xml = zin.read("ComicInfo.xml")
+                except KeyError:
+                    old_xml = None
+                if full_rebuild or old_xml is None:
+                    new_xml = self._build_full_xml(
+                        builder, chapter_num, old_xml, cover)
+                else:
+                    new_xml = self._edit_volume_xml(old_xml, volume, cover)
+                    if new_xml is None:           # parse error -> full rebuild
+                        new_xml = self._build_full_xml(
+                            builder, chapter_num, None, cover)
+                wrote_xml = False
+                with zipfile.ZipFile(tmp, "w", zipfile.ZIP_STORED) as zout:
+                    for info in infos:
+                        if swap_cover and info.filename in placeholders:
+                            zout.writestr(cover[0], cover[1])
+                        elif info.filename == "ComicInfo.xml":
+                            zout.writestr("ComicInfo.xml", new_xml)
+                            wrote_xml = True
+                        else:
+                            zout.writestr(info, zin.read(info.filename))
+                    if not wrote_xml:
+                        zout.writestr("ComicInfo.xml", new_xml)
+            tmp.replace(cbz_path)
+            return True, swap_cover
+        except Exception as exc:
+            print(f"[updater] {cbz_path.name}: update failed: {exc}",
+                  flush=True)
+            try:
+                tmp.unlink()
+            except OSError:
+                pass
+            return False, False
+
+    # ------------------------------------------------------------------
+    # XML builders
+    # ------------------------------------------------------------------
+    def _edit_volume_xml(self, old_xml: bytes, volume,
+                         cover) -> "str | None":
+        """
+        Sets <Volume> in an existing ComicInfo.xml and refreshes the
+        FrontCover page attributes when the cover gets replaced.
+        Returns None when the XML is unparseable.
+        """
+        try:
+            root = ET.fromstring(old_xml)
+        except ET.ParseError:
+            return None
+
+        el = root.find("Volume")
+        if el is None:
+            el = ET.SubElement(root, "Volume")
+        el.text = str(volume)
+
+        if cover is not None:
+            pages = root.find("Pages")
+            if pages is not None:
+                _update_page0_attrs(pages, cover[1])
+
+        return _serialize_tree(root)
+
+    def _build_full_xml(self, builder: ComicInfoBuilder, chapter_num: str,
+                        old_xml: "bytes | None", cover) -> str:
+        """
+        Rebuilds the complete ComicInfo.xml via ComicInfoBuilder (fresh
+        MangaBaka/MAL metadata).  Suwayomi-derived fields and the <Pages>
+        section are carried over from the previous XML.
+        """
+        builder.chapter = chapter_num   # also clears builder page state
+        builder._suwayomi_data = (
+            ComicInfoBuilder.read_comicinfo_fields(old_xml) if old_xml else {})
+        root = builder._build_tree().getroot()
+
+        if old_xml:
+            try:
+                old_root = ET.fromstring(old_xml)
+            except ET.ParseError:
+                old_root = None
+            if old_root is not None:
+                pages = old_root.find("Pages")
+                if pages is not None and cover is not None:
+                    _update_page0_attrs(pages, cover[1])
+                page_count = old_root.find("PageCount")
+                if page_count is not None:
+                    root.append(page_count)
+                if pages is not None:
+                    root.append(pages)
+
+        return _serialize_tree(root)
+
+    # ------------------------------------------------------------------
+    # Logging
+    # ------------------------------------------------------------------
+    def _log(self, msg: str) -> None:
+        line = f"[{_now()}] {msg}"
+        print(f"[updater] {msg}", flush=True)
+        try:
+            self._log_path.parent.mkdir(parents=True, exist_ok=True)
+            with self._log_path.open("a", encoding="utf-8") as f:
+                f.write(line + "\n")
+        except OSError as exc:
+            print(f"[updater] cannot write log file {self._log_path}: {exc}",
+                  flush=True)
+
+
+# --------------------------------------------------------------------------
+# Usage example
+# --------------------------------------------------------------------------
+if __name__ == "__main__":
+    # Local (no-Docker) smoke test.  Adjust paths to your environment.
+    KAVITA_PATH  = r"\\192.168.2.2\root\ServerData\Kavita\test"
+    MATCHES_PATH = Path(__file__).resolve().parent.parent / "matches.json"
+
+    updater = KavitaVolumeCoverUpdater(
+        KAVITA_PATH,
+        matches_cache=MatchesCache(MATCHES_PATH),
+    )
+
+    # One-shot scan (no cron thread):
+    summary = updater.update_all()
+    print(f"\n[updater] {summary}")
+
+    # Or run on the cron schedule (default: 19:00 every Mon + Thu):
+    # updater.start()
+    # updater.wait()