Performance Improvements

This commit is contained in:
2026-06-16 11:37:47 +02:00
parent b6d7f2d0af
commit a59cff3951
5 changed files with 102 additions and 32 deletions
+1 -2
View File
@@ -44,7 +44,6 @@ from __future__ import annotations
import os import os
import sys import sys
from pathlib import Path from pathlib import Path
try: try:
from dotenv import load_dotenv from dotenv import load_dotenv
load_dotenv() load_dotenv()
@@ -107,7 +106,7 @@ def main() -> int:
web_host = _env_str("WEB_HOST", "0.0.0.0") or "0.0.0.0" web_host = _env_str("WEB_HOST", "0.0.0.0") or "0.0.0.0"
web_port = _env_int("WEB_PORT", 8080) web_port = _env_int("WEB_PORT", 8080)
updater_enabled = _env_bool("UPDATER_ENABLED", True) updater_enabled = _env_bool("UPDATER_ENABLED", True)
updater_schedule = _env_str("UPDATER_SCHEDULE", "0 19 * * 1,4") updater_schedule = _env_str("UPDATER_SCHEDULE", "0 10 * * 1,4")
updater_log = _env_str("UPDATER_LOG", "/config/volume_updater.log") updater_log = _env_str("UPDATER_LOG", "/config/volume_updater.log")
cover_cache_path = _env_str("COVER_CACHE_PATH", "") or None cover_cache_path = _env_str("COVER_CACHE_PATH", "") or None
perf_path = _env_str("PERF_PATH", "/config/perf_stats.json") or None perf_path = _env_str("PERF_PATH", "/config/perf_stats.json") or None
+9 -9
View File
@@ -151,9 +151,10 @@ class MangaBakaWorksResolver:
Returns volume-level works for a series, filtered to those that have Returns volume-level works for a series, filtered to those that have
a usable cover image. a usable cover image.
Non-empty results are cached per series; empty results are not, so Results are cached per series — including empty results, so a series
works added on MangaBaka later become visible without restarting without works is not re-paginated for every chapter of a move run.
the (long-running) process. The periodic cover updater calls clear_cache() before each scan, so
works added on MangaBaka later are still picked up there.
""" """
if not series_id: if not series_id:
return [] return []
@@ -165,8 +166,7 @@ class MangaBakaWorksResolver:
# Discard works that carry no usable cover # Discard works that carry no usable cover
works_with_cover = [w for w in all_works if w.get("images")] works_with_cover = [w for w in all_works if w.get("images")]
if works_with_cover: self._cache[series_id] = works_with_cover
self._cache[series_id] = works_with_cover
return works_with_cover return works_with_cover
def get_work_for_volume(self, series_id: str, volume) -> "dict | None": def get_work_for_volume(self, series_id: str, volume) -> "dict | None":
@@ -228,10 +228,10 @@ class MangaBakaWorksResolver:
if url: if url:
result[norm] = url result[norm] = url
# Empty results are not cached — covers added on MangaBaka later # Cache even an empty result so a series without volume images is not
# become visible without restarting the long-running process. # re-paginated for every chapter. The periodic cover updater clears
if result: # this cache before each scan, so newly added images are still found.
self._images_cache[series_id] = result self._images_cache[series_id] = result
return result return result
def get_cover_for_volume_from_images(self, series_id: str, def get_cover_for_volume_from_images(self, series_id: str,
+31
View File
@@ -72,6 +72,10 @@ def _no_measure():
yield yield
# Sentinel marking a per-chapter memo slot as "not computed yet".
_UNSET = object()
# -------------------------------------------------------------------------- # --------------------------------------------------------------------------
# Constants # Constants
# -------------------------------------------------------------------------- # --------------------------------------------------------------------------
@@ -235,6 +239,12 @@ class ComicInfoBuilder:
self._pages: list[dict] = [] self._pages: list[dict] = []
self._cover_path: "Path | None" = None self._cover_path: "Path | None" = None
self._suwayomi_data: dict = {} self._suwayomi_data: dict = {}
# Per-chapter memo for _determine_volume (resolved up to 3x/chapter
# otherwise: cover download, explicit volume step, XML build).
self._volume_memo = _UNSET
# Per-series cache for full series fetches by id (parent series for
# SeriesGroup, merged-series redirects) — reused across all chapters.
self._series_by_id_cache: dict[str, dict] = {}
# ----- Repr ----------------------------------------------------------- # ----- Repr -----------------------------------------------------------
def __repr__(self) -> str: def __repr__(self) -> str:
@@ -274,6 +284,7 @@ class ComicInfoBuilder:
self._pages = [] self._pages = []
self._cover_path = None self._cover_path = None
self._suwayomi_data = {} self._suwayomi_data = {}
self._volume_memo = _UNSET
def _measure(self, name: str): def _measure(self, name: str):
"""Times a named step on the attached recorder; no-op when unset.""" """Times a named step on the attached recorder; no-op when unset."""
@@ -437,12 +448,20 @@ class ComicInfoBuilder:
return series return series
def _fetch_series_by_id(self, series_id) -> dict: def _fetch_series_by_id(self, series_id) -> dict:
# Cached per builder (i.e. per series): SeriesGroup resolution calls
# this for the parent on every chapter — without the cache that is
# one MangaBaka request per chapter for the same parent id.
key = str(series_id)
cached = self._series_by_id_cache.get(key)
if cached is not None:
return cached
url = f"{self.api_base_url}/series/{series_id}" url = f"{self.api_base_url}/series/{series_id}"
resp = self._session.get(url, timeout=self.request_timeout) resp = self._session.get(url, timeout=self.request_timeout)
resp.raise_for_status() resp.raise_for_status()
data = resp.json().get("data") data = resp.json().get("data")
if not data: if not data:
raise RuntimeError(f"Series with ID {series_id} not found.") raise RuntimeError(f"Series with ID {series_id} not found.")
self._series_by_id_cache[key] = data
return data return data
# ====================================================================== # ======================================================================
@@ -578,6 +597,18 @@ class ComicInfoBuilder:
# Volume determination # Volume determination
# ====================================================================== # ======================================================================
def _determine_volume(self) -> "str | None": def _determine_volume(self) -> "str | None":
"""
Resolves the volume for the current chapter, memoized per chapter.
The result is reused across the three call sites per chapter (cover
download, explicit volume step, XML build); the memo is cleared
whenever the chapter or manga title changes (see _clear_results).
"""
if self._volume_memo is _UNSET:
self._volume_memo = self._resolve_volume()
return self._volume_memo
def _resolve_volume(self) -> "str | None":
""" """
Resolves the volume for the current chapter via MangaDex. Resolves the volume for the current chapter via MangaDex.
Falls back to estimation when the chapter is absent from MangaDex. Falls back to estimation when the chapter is absent from MangaDex.
+24 -9
View File
@@ -93,6 +93,9 @@ class MangaDexVolumeResolver:
self._cache: dict[str, dict] = {} self._cache: dict[str, dict] = {}
# Cache: manga_id -> {relation_type: [title, ...]} # Cache: manga_id -> {relation_type: [title, ...]}
self._relations_cache: dict[str, dict] = {} self._relations_cache: dict[str, dict] = {}
# Cache: title_lower -> manga_id (or None) — avoids repeating the
# MangaDex search for every chapter of the same series.
self._id_cache: dict[str, "str | None"] = {}
# ---------------------------------------------------------------------- # ----------------------------------------------------------------------
# Locate the manga ID # Locate the manga ID
@@ -105,15 +108,25 @@ class MangaDexVolumeResolver:
if not title or not title.strip(): if not title or not title.strip():
return None return None
resp = self._session.get( key = title.strip().lower()
f"{self.base_url}/manga", if key in self._id_cache:
params={"title": title, "limit": 5, return self._id_cache[key]
"contentRating[]": ["safe", "suggestive",
"erotica", "pornographic"]}, try:
timeout=self.request_timeout) resp = self._session.get(
resp.raise_for_status() f"{self.base_url}/manga",
results = resp.json().get("data") or [] params={"title": title, "limit": 5,
"contentRating[]": ["safe", "suggestive",
"erotica", "pornographic"]},
timeout=self.request_timeout)
resp.raise_for_status()
results = resp.json().get("data") or []
except requests.RequestException:
# Don't cache transient failures — allow a retry next time.
return None
if not results: if not results:
self._id_cache[key] = None
return None return None
def score(entry) -> float: def score(entry) -> float:
@@ -130,7 +143,9 @@ class MangaDexVolumeResolver:
return best return best
results.sort(key=score, reverse=True) results.sort(key=score, reverse=True)
return results[0].get("id") manga_id = results[0].get("id")
self._id_cache[key] = manga_id
return manga_id
# ---------------------------------------------------------------------- # ----------------------------------------------------------------------
# Main function: retrieve and return volume / chapter data # Main function: retrieve and return volume / chapter data
+37 -12
View File
@@ -63,6 +63,7 @@ from __future__ import annotations
import json import json
import threading import threading
import time import time
import uuid
from contextlib import contextmanager from contextlib import contextmanager
from pathlib import Path from pathlib import Path
@@ -143,6 +144,9 @@ class SeriesRecorder(_StepTimer):
"steps": self.steps, "steps": self.steps,
"chapters": self._chapters, "chapters": self._chapters,
}) })
# Persist the run's progress after every series so a long run is
# observable live and survives a crash mid-run.
self._run.flush()
class RunRecorder: class RunRecorder:
@@ -154,15 +158,15 @@ class RunRecorder:
self._series: list[dict] = [] self._series: list[dict] = []
self._started = time.time() self._started = time.time()
self._t0 = time.monotonic() self._t0 = time.monotonic()
# Stable identity so incremental flushes update the same run entry
# instead of inserting a duplicate on every series.
self._run_id = uuid.uuid4().hex
def begin_series(self, title: str) -> SeriesRecorder: def begin_series(self, title: str) -> SeriesRecorder:
return SeriesRecorder(self, title, enabled=self._enabled) return SeriesRecorder(self, title, enabled=self._enabled)
def finish(self) -> dict | None: def _snapshot(self) -> dict:
"""Aggregates the run and persists it. Returns the run dict.""" """Aggregates the run's current state into a serialisable dict."""
if not self._enabled:
return None
step_totals: dict[str, float] = {} step_totals: dict[str, float] = {}
series_step_totals: dict[str, float] = {} series_step_totals: dict[str, float] = {}
chapter_count = 0 chapter_count = 0
@@ -176,7 +180,8 @@ class RunRecorder:
step_totals[step] = round( step_totals[step] = round(
step_totals.get(step, 0.0) + secs, 4) step_totals.get(step, 0.0) + secs, 4)
run = { return {
"runId": self._run_id,
"startedAt": round(self._started), "startedAt": round(self._started),
"finishedAt": round(time.time()), "finishedAt": round(time.time()),
"totalSeconds": round(time.monotonic() - self._t0, 4), "totalSeconds": round(time.monotonic() - self._t0, 4),
@@ -186,9 +191,19 @@ class RunRecorder:
"seriesStepTotals": series_step_totals, "seriesStepTotals": series_step_totals,
"series": self._series, "series": self._series,
} }
self._stats._append_run(run)
def flush(self) -> dict | None:
"""Writes the run's current state to disk (upsert by runId)."""
if not self._enabled:
return None
run = self._snapshot()
self._stats._upsert_run(run)
return run return run
def finish(self) -> dict | None:
"""Persists the final run state. Returns the run dict."""
return self.flush()
class PerfStats: class PerfStats:
""" """
@@ -227,14 +242,24 @@ class PerfStats:
return {"runs": []} return {"runs": []}
return data return data
def _append_run(self, run: dict) -> None: def _upsert_run(self, run: dict) -> None:
"""
Inserts a new run (newest first) or replaces the existing entry with
the same runId — so incremental flushes during a run update one entry
rather than appending a duplicate after every series.
"""
if not self._path: if not self._path:
return return
with self._lock: with self._lock:
data = self.all() runs = self.all()["runs"]
runs = data["runs"] run_id = run.get("runId")
runs.insert(0, run) # newest first for i, existing in enumerate(runs):
del runs[_MAX_RUNS:] # cap history if existing.get("runId") == run_id:
runs[i] = run
break
else:
runs.insert(0, run) # newest first
del runs[_MAX_RUNS:] # cap history
self._path.parent.mkdir(parents=True, exist_ok=True) self._path.parent.mkdir(parents=True, exist_ok=True)
tmp = self._path.with_suffix(self._path.suffix + ".tmp") tmp = self._path.with_suffix(self._path.suffix + ".tmp")
with tmp.open("w", encoding="utf-8") as f: with tmp.open("w", encoding="utf-8") as f: