Performance Improvements

2026-06-16 11:37:47 +02:00
parent b6d7f2d0af
commit a59cff3951
5 changed files with 102 additions and 32 deletions
@@ -44,7 +44,6 @@ from __future__ import annotations
 import os
 import sys
 from pathlib import Path
-
 try:
    from dotenv import load_dotenv
    load_dotenv()
@@ -107,7 +106,7 @@ def main() -> int:
    web_host        = _env_str("WEB_HOST", "0.0.0.0") or "0.0.0.0"
    web_port        = _env_int("WEB_PORT", 8080)
    updater_enabled  = _env_bool("UPDATER_ENABLED", True)
-    updater_schedule = _env_str("UPDATER_SCHEDULE", "0 19 * * 1,4")
+    updater_schedule = _env_str("UPDATER_SCHEDULE", "0 10 * * 1,4")
    updater_log      = _env_str("UPDATER_LOG", "/config/volume_updater.log")
    cover_cache_path = _env_str("COVER_CACHE_PATH", "") or None
    perf_path        = _env_str("PERF_PATH", "/config/perf_stats.json") or None
@@ -151,9 +151,10 @@ class MangaBakaWorksResolver:
        Returns volume-level works for a series, filtered to those that have
        a usable cover image.

-        Non-empty results are cached per series; empty results are not, so
-        works added on MangaBaka later become visible without restarting
-        the (long-running) process.
+        Results are cached per series — including empty results, so a series
+        without works is not re-paginated for every chapter of a move run.
+        The periodic cover updater calls clear_cache() before each scan, so
+        works added on MangaBaka later are still picked up there.
        """
        if not series_id:
            return []
@@ -165,8 +166,7 @@ class MangaBakaWorksResolver:

        # Discard works that carry no usable cover
        works_with_cover = [w for w in all_works if w.get("images")]
-        if works_with_cover:
-            self._cache[series_id] = works_with_cover
+        self._cache[series_id] = works_with_cover
        return works_with_cover

    def get_work_for_volume(self, series_id: str, volume) -> "dict | None":
@@ -228,10 +228,10 @@ class MangaBakaWorksResolver:
            if url:
                result[norm] = url

-        # Empty results are not cached — covers added on MangaBaka later
-        # become visible without restarting the long-running process.
-        if result:
-            self._images_cache[series_id] = result
+        # Cache even an empty result so a series without volume images is not
+        # re-paginated for every chapter.  The periodic cover updater clears
+        # this cache before each scan, so newly added images are still found.
+        self._images_cache[series_id] = result
        return result

    def get_cover_for_volume_from_images(self, series_id: str,
@@ -72,6 +72,10 @@ def _no_measure():
    yield


+# Sentinel for a memo slot that is "not computed yet" (None is a valid result).
+_UNSET = object()
+
+
 # --------------------------------------------------------------------------
 # Constants
 # --------------------------------------------------------------------------
@@ -235,6 +239,12 @@ class ComicInfoBuilder:
        self._pages: list[dict] = []
        self._cover_path: "Path | None" = None
        self._suwayomi_data: dict = {}
+        # Per-chapter memo for _determine_volume; without it the volume is resolved
+        # up to 3x per chapter (cover download, explicit volume step, XML build).
+        self._volume_memo = _UNSET
+        # Per-series cache for full series fetches by id (parent series for
+        # SeriesGroup, merged-series redirects) — reused across all chapters.
+        self._series_by_id_cache: dict[str, dict] = {}

    # ----- Repr -----------------------------------------------------------
    def __repr__(self) -> str:
@@ -274,6 +284,7 @@ class ComicInfoBuilder:
        self._pages = []
        self._cover_path = None
        self._suwayomi_data = {}
+        self._volume_memo = _UNSET

    def _measure(self, name: str):
        """Times a named step on the attached recorder; no-op when unset."""
@@ -437,12 +448,20 @@ class ComicInfoBuilder:
        return series

    def _fetch_series_by_id(self, series_id) -> dict:
+        # Cached per builder (i.e. per series): SeriesGroup resolution calls
+        # this for the parent on every chapter — without the cache that is
+        # one MangaBaka request per chapter for the same parent id.
+        key = str(series_id)
+        cached = self._series_by_id_cache.get(key)
+        if cached is not None:
+            return cached
        url = f"{self.api_base_url}/series/{series_id}"
        resp = self._session.get(url, timeout=self.request_timeout)
        resp.raise_for_status()
        data = resp.json().get("data")
        if not data:
            raise RuntimeError(f"Series with ID {series_id} not found.")
+        self._series_by_id_cache[key] = data
        return data

    # ======================================================================
@@ -578,6 +597,18 @@ class ComicInfoBuilder:
    # Volume determination
    # ======================================================================
    def _determine_volume(self) -> "str | None":
+        """
+        Resolves the volume for the current chapter, memoized per chapter.
+
+        The result is reused across the three call sites per chapter (cover
+        download, explicit volume step, XML build); the memo is cleared
+        whenever the chapter or manga title changes (see _clear_results).
+        """
+        if self._volume_memo is _UNSET:
+            self._volume_memo = self._resolve_volume()
+        return self._volume_memo
+
+    def _resolve_volume(self) -> "str | None":
        """
        Resolves the volume for the current chapter via MangaDex.
        Falls back to estimation when the chapter is absent from MangaDex.
@@ -93,6 +93,9 @@ class MangaDexVolumeResolver:
        self._cache: dict[str, dict] = {}
        # Cache: manga_id -> {relation_type: [title, ...]}
        self._relations_cache: dict[str, dict] = {}
+        # Cache: normalised title (stripped, lower-cased) -> manga_id (or None);
+        # avoids repeating the MangaDex search for every chapter of a series.
+        self._id_cache: dict[str, "str | None"] = {}

    # ----------------------------------------------------------------------
    # Locate the manga ID
@@ -105,15 +108,25 @@ class MangaDexVolumeResolver:
        if not title or not title.strip():
            return None

-        resp = self._session.get(
-            f"{self.base_url}/manga",
-            params={"title": title, "limit": 5,
-                    "contentRating[]": ["safe", "suggestive",
-                                        "erotica", "pornographic"]},
-            timeout=self.request_timeout)
-        resp.raise_for_status()
-        results = resp.json().get("data") or []
+        key = title.strip().lower()
+        if key in self._id_cache:
+            return self._id_cache[key]
+
+        try:
+            resp = self._session.get(
+                f"{self.base_url}/manga",
+                params={"title": title, "limit": 5,
+                        "contentRating[]": ["safe", "suggestive",
+                                            "erotica", "pornographic"]},
+                timeout=self.request_timeout)
+            resp.raise_for_status()
+            results = resp.json().get("data") or []
+        except requests.RequestException:
+            # Don't cache transient failures — allow a retry next time.
+            return None
+
        if not results:
+            self._id_cache[key] = None
            return None

        def score(entry) -> float:
@@ -130,7 +143,9 @@ class MangaDexVolumeResolver:
            return best

        results.sort(key=score, reverse=True)
-        return results[0].get("id")
+        manga_id = results[0].get("id")
+        self._id_cache[key] = manga_id
+        return manga_id

    # ----------------------------------------------------------------------
    # Main function: retrieve and return volume / chapter data
@@ -63,6 +63,7 @@ from __future__ import annotations
 import json
 import threading
 import time
+import uuid
 from contextlib import contextmanager
 from pathlib import Path

@@ -143,6 +144,9 @@ class SeriesRecorder(_StepTimer):
            "steps":        self.steps,
            "chapters":     self._chapters,
        })
+        # Persist the run's progress after every series so a long run is
+        # observable live and survives a crash mid-run.
+        self._run.flush()


 class RunRecorder:
@@ -154,15 +158,15 @@ class RunRecorder:
        self._series: list[dict] = []
        self._started = time.time()
        self._t0 = time.monotonic()
+        # Stable identity so incremental flushes update the same run entry
+        # instead of inserting a duplicate on every series.
+        self._run_id = uuid.uuid4().hex

    def begin_series(self, title: str) -> SeriesRecorder:
        return SeriesRecorder(self, title, enabled=self._enabled)

-    def finish(self) -> dict | None:
-        """Aggregates the run and persists it.  Returns the run dict."""
-        if not self._enabled:
-            return None
-
+    def _snapshot(self) -> dict:
+        """Aggregates the run's current state into a serialisable dict."""
        step_totals: dict[str, float] = {}
        series_step_totals: dict[str, float] = {}
        chapter_count = 0
@@ -176,7 +180,8 @@ class RunRecorder:
                    step_totals[step] = round(
                        step_totals.get(step, 0.0) + secs, 4)

-        run = {
+        return {
+            "runId":             self._run_id,
            "startedAt":         round(self._started),
            "finishedAt":        round(time.time()),
            "totalSeconds":      round(time.monotonic() - self._t0, 4),
@@ -186,9 +191,19 @@ class RunRecorder:
            "seriesStepTotals":  series_step_totals,
            "series":            self._series,
        }
-        self._stats._append_run(run)
+
+    def flush(self) -> dict | None:
+        """Upserts the current run state to disk by runId; returns it (None if disabled)."""
+        if not self._enabled:
+            return None
+        run = self._snapshot()
+        self._stats._upsert_run(run)
        return run

+    def finish(self) -> dict | None:
+        """Persists the final run state.  Returns the run dict (None if disabled)."""
+        return self.flush()
+

 class PerfStats:
    """
@@ -227,14 +242,24 @@ class PerfStats:
            return {"runs": []}
        return data

-    def _append_run(self, run: dict) -> None:
+    def _upsert_run(self, run: dict) -> None:
+        """
+        Inserts a new run (newest first) or replaces the existing entry with
+        the same runId — so incremental flushes during a run update one entry
+        rather than appending a duplicate after every series.
+        """
        if not self._path:
            return
        with self._lock:
-            data = self.all()
-            runs = data["runs"]
-            runs.insert(0, run)             # newest first
-            del runs[_MAX_RUNS:]            # cap history
+            runs = self.all()["runs"]
+            run_id = run.get("runId")
+            for i, existing in enumerate(runs):
+                if existing.get("runId") == run_id:
+                    runs[i] = run
+                    break
+            else:
+                runs.insert(0, run)         # newest first
+                del runs[_MAX_RUNS:]        # cap history
            self._path.parent.mkdir(parents=True, exist_ok=True)
            tmp = self._path.with_suffix(self._path.suffix + ".tmp")
            with tmp.open("w", encoding="utf-8") as f: