From b0692a652784aa5f188bae28ef7574fdfd59fff9 Mon Sep 17 00:00:00 2001 From: JohannesBOT Date: Mon, 15 Jun 2026 11:23:20 +0200 Subject: [PATCH 1/4] time measurement --- .env.example | 1 + docker-compose.prod.yml | 2 + main_manga.py | 9 ++- src/manga/ComicInfoBuilder.py | 56 ++++++++++---- src/manga/MatchesWebApp.py | 142 ++++++++++++++++++++++++++++++++++ src/manga/SuwayomiMover.py | 71 ++++++++++++----- 6 files changed, 244 insertions(+), 37 deletions(-) diff --git a/.env.example b/.env.example index e3b464f..13eb8fb 100644 --- a/.env.example +++ b/.env.example @@ -14,6 +14,7 @@ DELETE_SOURCE=true UPDATER_ENABLED=true UPDATER_SCHEDULE=0 19 * * 1,4 COVER_CACHE_PATH=/config/covers +PERF_PATH=/config/perf_stats.json # Light-novel container (kavita-lightnovel-metadata-fetcher) HOST_LN_CONFIG_PATH=/path/to/ln-config diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index bde59bb..4d03f31 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -21,6 +21,8 @@ services: UPDATER_LOG: "/config/volume_updater.log" # Persistent cover cache (empty = temp dir, deleted on container stop) COVER_CACHE_PATH: "${COVER_CACHE_PATH:-/config/covers}" + # Per-step move timing stats (viewable at /perf); empty disables it + PERF_PATH: "${PERF_PATH:-/config/perf_stats.json}" # Timezone for the cron schedule — without this 19:00 means 19:00 UTC TZ: "${TZ:-Europe/Berlin}" ports: diff --git a/main_manga.py b/main_manga.py index a055700..6210494 100644 --- a/main_manga.py +++ b/main_manga.py @@ -35,6 +35,8 @@ Environment variables UPDATER_LOG default /config/volume_updater.log COVER_CACHE_PATH directory for the persistent cover cache; empty (default) = temporary cache, deleted on exit + PERF_PATH JSON file for per-step move timing stats; + empty disables profiling. Default /config/perf_stats.json """ from __future__ import annotations @@ -61,6 +63,7 @@ from SuwayomiFolderWatcher import SuwayomiFolderWatcher # noqa: E402,F401 from MatchesCache import MatchesCache # noqa: E402 from MatchesWebApp import MatchesWebApp # noqa: E402 from KavitaVolumeCoverUpdater import KavitaVolumeCoverUpdater # noqa: E402 +from PerfStats import PerfStats # noqa: E402 def _env_str(name: str, default: "str | None" = None, @@ -107,6 +110,7 @@ def main() -> int: updater_schedule = _env_str("UPDATER_SCHEDULE", "0 19 * * 1,4") updater_log = _env_str("UPDATER_LOG", "/config/volume_updater.log") cover_cache_path = _env_str("COVER_CACHE_PATH", "") or None + perf_path = _env_str("PERF_PATH", "/config/perf_stats.json") or None print(f"[main] suwayomi = {suwayomi_path}", flush=True) print(f"[main] kavita = {kavita_path}", flush=True) @@ -118,6 +122,7 @@ def main() -> int: print(f"[main] web = {web_host}:{web_port}", flush=True) matches_cache = MatchesCache(match_path) + perf_stats = PerfStats(perf_path) mover = SuwayomiMover( suwayomi_path, kavita_path, @@ -128,11 +133,13 @@ def main() -> int: delete_source=delete_source, matches_cache=matches_cache, cover_cache_dir=cover_cache_path, + perf_stats=perf_stats, ) # watcher = SuwayomiFolderWatcher(suwayomi_path, mover, settle_seconds=settle_seconds) - web_app = MatchesWebApp(matches_cache, mover=mover, host=web_host, port=web_port) + web_app = MatchesWebApp(matches_cache, mover=mover, perf_stats=perf_stats, + host=web_host, port=web_port) web_app.start() if updater_enabled: diff --git a/src/manga/ComicInfoBuilder.py b/src/manga/ComicInfoBuilder.py index d337f36..c6f9abf 100644 --- a/src/manga/ComicInfoBuilder.py +++ b/src/manga/ComicInfoBuilder.py @@ -40,6 +40,7 @@ from __future__ import annotations import re import sys import xml.etree.ElementTree as ET +from contextlib import contextmanager from pathlib import Path import requests @@ -65,6 +66,12 @@ except ImportError: _HAS_PIL = False +@contextmanager +def _no_measure(): + """No-op stand-in for a perf recorder's measure() context manager.""" + yield + + # -------------------------------------------------------------------------- # Constants # -------------------------------------------------------------------------- @@ -218,6 +225,12 @@ class ComicInfoBuilder: self._matches_cache = matches_cache self._cover_cache = cover_cache or _default_cover_cache() + # Optional performance recorder (duck-typed: any object with a + # .measure(name) context manager). The mover sets this per chapter; + # when None, _measure() is a no-op so the builder stays decoupled + # from PerfStats and works standalone (e.g. the cover updater). + self.perf = None + self._metadata: "dict | None" = None self._pages: list[dict] = [] self._cover_path: "Path | None" = None @@ -262,6 +275,12 @@ class ComicInfoBuilder: self._cover_path = None self._suwayomi_data = {} + def _measure(self, name: str): + """Times a named step on the attached recorder; no-op when unset.""" + if self.perf is not None: + return self.perf.measure(name) + return _no_measure() + # ====================================================================== # Public XML functions # ====================================================================== @@ -305,11 +324,13 @@ class ComicInfoBuilder: if not folder.is_dir(): raise NotADirectoryError(f"Folder not found: {folder}") - self._suwayomi_data = self._read_existing_comicinfo(folder) + with self._measure("read_comicinfo"): + self._suwayomi_data = self._read_existing_comicinfo(folder) self._cover_path = None if download_cover: - self._cover_path = self._download_cover(folder, cover_filename) + with self._measure("cover"): + self._cover_path = self._download_cover(folder, cover_filename) cover_resolved = self._cover_path.resolve() if self._cover_path else None story_images: list[Path] = [] @@ -329,20 +350,23 @@ class ComicInfoBuilder: ordered.extend((img, "Story") for img in story_images) self._pages = [] - for index, (img_path, page_type) in enumerate(ordered): - width, height = self._image_dimensions(img_path) - try: - size = img_path.stat().st_size - except OSError: - size = None - self._pages.append({ - "image": index, - "type": page_type, - "width": width, - "height": height, - "size": size, - "double": bool(width and height and width > height), - }) + # Probing every page for its pixel dimensions reads each file — on a + # network share this is often the dominant per-chapter cost. + with self._measure("image_dimensions"): + for index, (img_path, page_type) in enumerate(ordered): + width, height = self._image_dimensions(img_path) + try: + size = img_path.stat().st_size + except OSError: + size = None + self._pages.append({ + "image": index, + "type": page_type, + "width": width, + "height": height, + "size": size, + "double": bool(width and height and width > height), + }) return { "page_count": len(self._pages), diff --git a/src/manga/MatchesWebApp.py b/src/manga/MatchesWebApp.py index b4734c3..a90639c 100644 --- a/src/manga/MatchesWebApp.py +++ b/src/manga/MatchesWebApp.py @@ -71,6 +71,7 @@ _INDEX_HTML = """ + Performance ▸ @@ -357,6 +358,135 @@ load(); """ +_PERF_HTML = """ + + + + Move performance + + + +

Move performance ◂ back to matches

+
+ + + +
+ +
+ + + + +""" + + class MatchesWebApp: """ Flask app exposing the MatchesCache. `mover` is required when you want @@ -367,10 +497,12 @@ class MatchesWebApp: def __init__(self, cache: MatchesCache, *, mover=None, + perf_stats=None, host: str = "0.0.0.0", port: int = 8080): self._cache = cache self._mover = mover + self._perf = perf_stats self._host = host self._port = port self._build_lock = threading.Lock() @@ -498,3 +630,13 @@ class MatchesWebApp: finally: self._move_lock.release() return jsonify({"results": results}) + + @app.get("/perf") + def perf_page() -> Response: + return Response(_PERF_HTML, mimetype="text/html; charset=utf-8") + + @app.get("/api/perf") + def api_perf(): + if self._perf is None: + return jsonify({"runs": []}) + return jsonify(self._perf.all()) diff --git a/src/manga/SuwayomiMover.py b/src/manga/SuwayomiMover.py index 51ae101..1c35f73 100644 --- a/src/manga/SuwayomiMover.py +++ b/src/manga/SuwayomiMover.py @@ -69,6 +69,7 @@ from KavitaPersonUpdater import KavitaPersonUpdater from MatchesCache import MatchesCache from MangaBakaRateLimit import apply_to_session as _apply_mangabaka_rate_limit from CoverCache import CoverCache, _IMAGE_EXTS +from PerfStats import PerfStats _CHAPTER_RE = re.compile(r'[Cc]hapter\s+(\d+(?:\.\d+)?)') @@ -313,6 +314,8 @@ class SuwayomiMover: delete_source : Remove the source chapter folder after successful pack. cover_cache_dir : Directory for the persistent cover cache. None -> temporary cache, deleted at process exit. + perf_stats : Optional PerfStats instance for per-step timing. None + (default) disables profiling. """ def __init__(self, @@ -326,7 +329,8 @@ class SuwayomiMover: delete_source: bool = True, matches_cache: "MatchesCache | None" = None, api_base_url: str = "https://api.mangabaka.dev/v1", - cover_cache_dir=None): + cover_cache_dir=None, + perf_stats: "PerfStats | None" = None): self._src = Path(suwayomi_path) self._dst = Path(kavita_path) self._language = language @@ -334,6 +338,7 @@ class SuwayomiMover: self._delete_source = delete_source self._matches_cache = matches_cache self._api_base_url = api_base_url.rstrip("/") + self._perf = perf_stats or PerfStats(None) # Shared HTTP session and resolvers — reused across all series/chapters # to maximise cache hits and minimise API round-trips. @@ -376,15 +381,19 @@ class SuwayomiMover: dict from _process_series_dir. """ results: dict = {} - for source_dir in sorted(self._src.iterdir()): - if not source_dir.is_dir(): - continue - for manga_dir in sorted(source_dir.iterdir()): - if not manga_dir.is_dir(): + run = self._perf.begin_run() + try: + for source_dir in sorted(self._src.iterdir()): + if not source_dir.is_dir(): continue - title = manga_dir.name - print(f"[SuwayomiMover] {title}") - results[title] = self._process_series_dir(manga_dir) + for manga_dir in sorted(source_dir.iterdir()): + if not manga_dir.is_dir(): + continue + title = manga_dir.name + print(f"[SuwayomiMover] {title}") + results[title] = self._process_series_dir(manga_dir, run) + finally: + run.finish() return results def process_series(self, manga_title: str) -> dict: @@ -400,7 +409,11 @@ class SuwayomiMover: continue candidate = source_dir / manga_title if candidate.is_dir(): - return self._process_series_dir(candidate) + run = self._perf.begin_run() + try: + return self._process_series_dir(candidate, run) + finally: + run.finish() raise FileNotFoundError( f"No Suwayomi directory found for '{manga_title}' under {self._src}") @@ -487,8 +500,9 @@ class SuwayomiMover: # ------------------------------------------------------------------ # Internal: series # ------------------------------------------------------------------ - def _process_series_dir(self, manga_dir: Path) -> dict: + def _process_series_dir(self, manga_dir: Path, run=None) -> dict: manga_title = manga_dir.name + series_rec = (run or self._perf.begin_run()).begin_series(manga_title) chapter_dirs = sorted( (d for d in manga_dir.iterdir() if d.is_dir()), @@ -539,7 +553,8 @@ class SuwayomiMover: md: "dict | None" = None mangabaka_title = manga_title try: - md = builder.fetch_metadata() + with series_rec.measure("fetch_metadata"): + md = builder.fetch_metadata() mangabaka_title = md.get("title") or manga_title except Exception as exc: print(f" [warn] metadata fetch failed: {exc}") @@ -571,7 +586,7 @@ class SuwayomiMover: chapter_results: list[dict] = [] for chapter_dir, _fields, chapter_num in pending: result = self._process_chapter( - builder, chapter_num, chapter_dir, dest_series) + builder, chapter_num, chapter_dir, dest_series, series_rec) chapter_results.append(result) status = "ok" if result["ok"] else f"ERROR: {result.get('error')}" print(f" Chapter {chapter_num}: {status}") @@ -592,14 +607,16 @@ class SuwayomiMover: al_id = ComicInfoBuilder._al_id_from_source(md) if md else None if mal_id or al_id: try: - person_result = self._person_updater.update_for_manga( - mal_id, al_manga_id=al_id) + with series_rec.measure("person_sync"): + person_result = self._person_updater.update_for_manga( + mal_id, al_manga_id=al_id) print(f" Persons: chars={person_result['characters'].get('updated')} " f"staff={person_result['staff'].get('updated')}") except Exception as exc: person_result = {"error": str(exc)} print(f" Persons: ERROR {exc}") + series_rec.finish() return {"chapters": chapter_results, "persons": person_result} # ------------------------------------------------------------------ @@ -609,7 +626,8 @@ class SuwayomiMover: builder: ComicInfoBuilder, chapter_num: str, chapter_dir: Path, - dest_series: Path) -> dict: + dest_series: Path, + series_rec=None) -> dict: """ Generates ComicInfo.xml for one chapter, packs it to CBZ, and optionally removes the source folder. @@ -619,6 +637,11 @@ class SuwayomiMover: element correctly points to the front cover). """ cbz_path = dest_series / f"{chapter_dir.name}.cbz" + chap_rec = (series_rec or self._perf.begin_run().begin_series("") + ).begin_chapter(chapter_num) + # add_pages_from_folder records its own sub-steps on this recorder. + builder.perf = chap_rec + ok = False try: builder.chapter = chapter_num builder.add_pages_from_folder(chapter_dir, cover_filename="000") @@ -626,18 +649,26 @@ class SuwayomiMover: # by add_pages_from_folder, so it's effectively free. Used by # the chapter index in the Kavita destination folder. try: - volume = builder._determine_volume() + with chap_rec.measure("volume"): + volume = builder._determine_volume() except Exception: volume = None - builder.save_xml(chapter_dir) - _pack_to_cbz(chapter_dir, cbz_path) + with chap_rec.measure("save_xml"): + builder.save_xml(chapter_dir) + with chap_rec.measure("pack_cbz"): + _pack_to_cbz(chapter_dir, cbz_path) if self._delete_source: - shutil.rmtree(chapter_dir) + with chap_rec.measure("delete_source"): + shutil.rmtree(chapter_dir) + ok = True return {"chapter": chapter_num, "cbz": str(cbz_path), "ok": True, "volume": volume} except Exception as exc: return {"chapter": chapter_num, "cbz": str(cbz_path), "ok": False, "error": str(exc)} + finally: + builder.perf = None + chap_rec.finish(ok=ok) # -------------------------------------------------------------------------- From b6d7f2d0afda4022cf353b28a33afed6a18f9f3b Mon Sep 17 00:00:00 2001 From: JohannesBOT Date: Mon, 15 Jun 2026 11:23:37 +0200 Subject: [PATCH 2/4] time measurement --- src/manga/PerfStats.py | 242 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 242 insertions(+) create mode 100644 src/manga/PerfStats.py diff --git a/src/manga/PerfStats.py b/src/manga/PerfStats.py new file mode 100644 index 0000000..714a46f --- /dev/null +++ b/src/manga/PerfStats.py @@ -0,0 +1,242 @@ +""" +perf_stats.py +============= + +Lightweight performance profiler for the Suwayomi -> Kavita move pipeline. + +It records, per move run, how long each step of every chapter takes plus +per-series and per-run totals, so a slowdown can be traced to the step +responsible (cover download, image-dimension probing, CBZ packing, …). + +Data model (one entry per run, newest first):: + + { + "runs": [ + { + "startedAt": 1700000000, # unix seconds + "finishedAt": 1700000123, + "totalSeconds": 123.4, # wall clock of the whole run + "seriesCount": 2, + "chapterCount": 31, + "stepTotals": { # summed over ALL chapters + "cover": 41.2, "image_dimensions": 55.8, "pack_cbz": 18.1, ... + }, + "seriesStepTotals": { # summed over ALL series + "fetch_metadata": 2.4, "person_sync": 9.7 + }, + "series": [ + { + "title": "Call of the Night", + "totalSeconds": 60.2, + "chapterCount": 20, + "steps": {"fetch_metadata": 1.2, "person_sync": 3.4}, + "chapters": [ + {"chapter": "1", "ok": true, "totalSeconds": 11.5, + "steps": {"cover": 1.8, "image_dimensions": 4.2, ...}} + ] + } + ] + } + ] + } + +Usage from the mover:: + + perf = PerfStats(path) # path=None -> disabled (no-op) + run = perf.begin_run() + series = run.begin_series("Title") + with series.measure("fetch_metadata"): + ... + chap = series.begin_chapter("1") + with chap.measure("pack_cbz"): + ... + chap.finish(ok=True) + series.finish() + run.finish() # persists the run to disk + +When ``path`` is None every recorder is a no-op and nothing is written, +so the profiler can be left permanently wired in with negligible cost. +""" + +from __future__ import annotations + +import json +import threading +import time +from contextlib import contextmanager +from pathlib import Path + + +# Keep the JSON small: only the most recent runs are retained on disk. +_MAX_RUNS = 30 + + +class _StepTimer: + """ + Base recorder: accumulates ``{step_name: seconds}`` and tracks its own + wall-clock lifetime. ``enabled=False`` turns every method into a no-op. + """ + + def __init__(self, enabled: bool = True): + self.steps: dict[str, float] = {} + self._enabled = enabled + self._t0 = time.monotonic() + + @contextmanager + def measure(self, name: str): + """Context manager timing a named step (accumulates on repeat use).""" + if not self._enabled: + yield + return + start = time.monotonic() + try: + yield + finally: + self.steps[name] = round( + self.steps.get(name, 0.0) + (time.monotonic() - start), 4) + + def elapsed(self) -> float: + return round(time.monotonic() - self._t0, 4) + + +class ChapterRecorder(_StepTimer): + """Per-chapter step timer.""" + + def __init__(self, series: "SeriesRecorder", chapter: str, + enabled: bool = True): + super().__init__(enabled) + self._series = series + self._chapter = chapter + self._ok = True + + def finish(self, *, ok: bool = True) -> None: + self._ok = ok + if not self._enabled: + return + self._series._chapters.append({ + "chapter": self._chapter, + "ok": ok, + "totalSeconds": self.elapsed(), + "steps": self.steps, + }) + + +class SeriesRecorder(_StepTimer): + """Per-series step timer; also collects its chapters.""" + + def __init__(self, run: "RunRecorder", title: str, enabled: bool = True): + super().__init__(enabled) + self._run = run + self._title = title + self._chapters: list[dict] = [] + + def begin_chapter(self, chapter: str) -> ChapterRecorder: + return ChapterRecorder(self, chapter, enabled=self._enabled) + + def finish(self) -> None: + if not self._enabled: + return + self._run._series.append({ + "title": self._title, + "totalSeconds": self.elapsed(), + "chapterCount": len(self._chapters), + "steps": self.steps, + "chapters": self._chapters, + }) + + +class RunRecorder: + """Top-level recorder for one full move run.""" + + def __init__(self, stats: "PerfStats", enabled: bool = True): + self._stats = stats + self._enabled = enabled + self._series: list[dict] = [] + self._started = time.time() + self._t0 = time.monotonic() + + def begin_series(self, title: str) -> SeriesRecorder: + return SeriesRecorder(self, title, enabled=self._enabled) + + def finish(self) -> dict | None: + """Aggregates the run and persists it. Returns the run dict.""" + if not self._enabled: + return None + + step_totals: dict[str, float] = {} + series_step_totals: dict[str, float] = {} + chapter_count = 0 + for s in self._series: + for step, secs in s["steps"].items(): + series_step_totals[step] = round( + series_step_totals.get(step, 0.0) + secs, 4) + for ch in s["chapters"]: + chapter_count += 1 + for step, secs in ch["steps"].items(): + step_totals[step] = round( + step_totals.get(step, 0.0) + secs, 4) + + run = { + "startedAt": round(self._started), + "finishedAt": round(time.time()), + "totalSeconds": round(time.monotonic() - self._t0, 4), + "seriesCount": len(self._series), + "chapterCount": chapter_count, + "stepTotals": step_totals, + "seriesStepTotals": series_step_totals, + "series": self._series, + } + self._stats._append_run(run) + return run + + +class PerfStats: + """ + Profiler facade + JSON persistence. + + Parameters + ---------- + path : Destination JSON file. None disables the profiler entirely + (every recorder becomes a no-op and nothing is written). + """ + + def __init__(self, path=None): + self._path = Path(path) if path else None + self._lock = threading.Lock() + + @property + def enabled(self) -> bool: + return self._path is not None + + def begin_run(self) -> RunRecorder: + return RunRecorder(self, enabled=self.enabled) + + # ------------------------------------------------------------------ + # Read / write + # ------------------------------------------------------------------ + def all(self) -> dict: + """Returns the persisted runs ({"runs": [...]}); newest first.""" + if not self._path or not self._path.is_file(): + return {"runs": []} + try: + with self._path.open("r", encoding="utf-8") as f: + data = json.load(f) + except (OSError, json.JSONDecodeError): + return {"runs": []} + if not isinstance(data, dict) or not isinstance(data.get("runs"), list): + return {"runs": []} + return data + + def _append_run(self, run: dict) -> None: + if not self._path: + return + with self._lock: + data = self.all() + runs = data["runs"] + runs.insert(0, run) # newest first + del runs[_MAX_RUNS:] # cap history + self._path.parent.mkdir(parents=True, exist_ok=True) + tmp = self._path.with_suffix(self._path.suffix + ".tmp") + with tmp.open("w", encoding="utf-8") as f: + json.dump({"runs": runs}, f, ensure_ascii=False, indent=2) + tmp.replace(self._path) From a59cff395199a00bbd253344a30195897cf0f1a8 Mon Sep 17 00:00:00 2001 From: JohannesBOT Date: Tue, 16 Jun 2026 11:37:47 +0200 Subject: [PATCH 3/4] Performance Improvements --- main_manga.py | 3 +- src/MangaBakaWorksResolver.py | 18 +++++------ src/manga/ComicInfoBuilder.py | 31 ++++++++++++++++++ src/manga/MangadexVolumeResolver.py | 33 +++++++++++++------ src/manga/PerfStats.py | 49 ++++++++++++++++++++++------- 5 files changed, 102 insertions(+), 32 deletions(-) diff --git a/main_manga.py b/main_manga.py index 6210494..606accf 100644 --- a/main_manga.py +++ b/main_manga.py @@ -44,7 +44,6 @@ from __future__ import annotations import os import sys from pathlib import Path - try: from dotenv import load_dotenv load_dotenv() @@ -107,7 +106,7 @@ def main() -> int: web_host = _env_str("WEB_HOST", "0.0.0.0") or "0.0.0.0" web_port = _env_int("WEB_PORT", 8080) updater_enabled = _env_bool("UPDATER_ENABLED", True) - updater_schedule = _env_str("UPDATER_SCHEDULE", "0 19 * * 1,4") + updater_schedule = _env_str("UPDATER_SCHEDULE", "0 10 * * 1,4") updater_log = _env_str("UPDATER_LOG", "/config/volume_updater.log") cover_cache_path = _env_str("COVER_CACHE_PATH", "") or None perf_path = _env_str("PERF_PATH", "/config/perf_stats.json") or None diff --git a/src/MangaBakaWorksResolver.py b/src/MangaBakaWorksResolver.py index 5415071..fa3fbc5 100644 --- a/src/MangaBakaWorksResolver.py +++ b/src/MangaBakaWorksResolver.py @@ -151,9 +151,10 @@ class MangaBakaWorksResolver: Returns volume-level works for a series, filtered to those that have a usable cover image. - Non-empty results are cached per series; empty results are not, so - works added on MangaBaka later become visible without restarting - the (long-running) process. + Results are cached per series — including empty results, so a series + without works is not re-paginated for every chapter of a move run. + The periodic cover updater calls clear_cache() before each scan, so + works added on MangaBaka later are still picked up there. """ if not series_id: return [] @@ -165,8 +166,7 @@ class MangaBakaWorksResolver: # Discard works that carry no usable cover works_with_cover = [w for w in all_works if w.get("images")] - if works_with_cover: - self._cache[series_id] = works_with_cover + self._cache[series_id] = works_with_cover return works_with_cover def get_work_for_volume(self, series_id: str, volume) -> "dict | None": @@ -228,10 +228,10 @@ class MangaBakaWorksResolver: if url: result[norm] = url - # Empty results are not cached — covers added on MangaBaka later - # become visible without restarting the long-running process. - if result: - self._images_cache[series_id] = result + # Cache even an empty result so a series without volume images is not + # re-paginated for every chapter. The periodic cover updater clears + # this cache before each scan, so newly added images are still found. + self._images_cache[series_id] = result return result def get_cover_for_volume_from_images(self, series_id: str, diff --git a/src/manga/ComicInfoBuilder.py b/src/manga/ComicInfoBuilder.py index c6f9abf..50d09a0 100644 --- a/src/manga/ComicInfoBuilder.py +++ b/src/manga/ComicInfoBuilder.py @@ -72,6 +72,10 @@ def _no_measure(): yield +# Sentinel marking a per-chapter memo slot as "not computed yet". +_UNSET = object() + + # -------------------------------------------------------------------------- # Constants # -------------------------------------------------------------------------- @@ -235,6 +239,12 @@ class ComicInfoBuilder: self._pages: list[dict] = [] self._cover_path: "Path | None" = None self._suwayomi_data: dict = {} + # Per-chapter memo for _determine_volume (resolved up to 3x/chapter + # otherwise: cover download, explicit volume step, XML build). + self._volume_memo = _UNSET + # Per-series cache for full series fetches by id (parent series for + # SeriesGroup, merged-series redirects) — reused across all chapters. + self._series_by_id_cache: dict[str, dict] = {} # ----- Repr ----------------------------------------------------------- def __repr__(self) -> str: @@ -274,6 +284,7 @@ class ComicInfoBuilder: self._pages = [] self._cover_path = None self._suwayomi_data = {} + self._volume_memo = _UNSET def _measure(self, name: str): """Times a named step on the attached recorder; no-op when unset.""" @@ -437,12 +448,20 @@ class ComicInfoBuilder: return series def _fetch_series_by_id(self, series_id) -> dict: + # Cached per builder (i.e. per series): SeriesGroup resolution calls + # this for the parent on every chapter — without the cache that is + # one MangaBaka request per chapter for the same parent id. + key = str(series_id) + cached = self._series_by_id_cache.get(key) + if cached is not None: + return cached url = f"{self.api_base_url}/series/{series_id}" resp = self._session.get(url, timeout=self.request_timeout) resp.raise_for_status() data = resp.json().get("data") if not data: raise RuntimeError(f"Series with ID {series_id} not found.") + self._series_by_id_cache[key] = data return data # ====================================================================== @@ -578,6 +597,18 @@ class ComicInfoBuilder: # Volume determination # ====================================================================== def _determine_volume(self) -> "str | None": + """ + Resolves the volume for the current chapter, memoized per chapter. + + The result is reused across the three call sites per chapter (cover + download, explicit volume step, XML build); the memo is cleared + whenever the chapter or manga title changes (see _clear_results). + """ + if self._volume_memo is _UNSET: + self._volume_memo = self._resolve_volume() + return self._volume_memo + + def _resolve_volume(self) -> "str | None": """ Resolves the volume for the current chapter via MangaDex. Falls back to estimation when the chapter is absent from MangaDex. diff --git a/src/manga/MangadexVolumeResolver.py b/src/manga/MangadexVolumeResolver.py index 1c7c544..b121233 100644 --- a/src/manga/MangadexVolumeResolver.py +++ b/src/manga/MangadexVolumeResolver.py @@ -93,6 +93,9 @@ class MangaDexVolumeResolver: self._cache: dict[str, dict] = {} # Cache: manga_id -> {relation_type: [title, ...]} self._relations_cache: dict[str, dict] = {} + # Cache: title_lower -> manga_id (or None) — avoids repeating the + # MangaDex search for every chapter of the same series. + self._id_cache: dict[str, "str | None"] = {} # ---------------------------------------------------------------------- # Locate the manga ID @@ -105,15 +108,25 @@ class MangaDexVolumeResolver: if not title or not title.strip(): return None - resp = self._session.get( - f"{self.base_url}/manga", - params={"title": title, "limit": 5, - "contentRating[]": ["safe", "suggestive", - "erotica", "pornographic"]}, - timeout=self.request_timeout) - resp.raise_for_status() - results = resp.json().get("data") or [] + key = title.strip().lower() + if key in self._id_cache: + return self._id_cache[key] + + try: + resp = self._session.get( + f"{self.base_url}/manga", + params={"title": title, "limit": 5, + "contentRating[]": ["safe", "suggestive", + "erotica", "pornographic"]}, + timeout=self.request_timeout) + resp.raise_for_status() + results = resp.json().get("data") or [] + except requests.RequestException: + # Don't cache transient failures — allow a retry next time. + return None + if not results: + self._id_cache[key] = None return None def score(entry) -> float: @@ -130,7 +143,9 @@ class MangaDexVolumeResolver: return best results.sort(key=score, reverse=True) - return results[0].get("id") + manga_id = results[0].get("id") + self._id_cache[key] = manga_id + return manga_id # ---------------------------------------------------------------------- # Main function: retrieve and return volume / chapter data diff --git a/src/manga/PerfStats.py b/src/manga/PerfStats.py index 714a46f..8b70d33 100644 --- a/src/manga/PerfStats.py +++ b/src/manga/PerfStats.py @@ -63,6 +63,7 @@ from __future__ import annotations import json import threading import time +import uuid from contextlib import contextmanager from pathlib import Path @@ -143,6 +144,9 @@ class SeriesRecorder(_StepTimer): "steps": self.steps, "chapters": self._chapters, }) + # Persist the run's progress after every series so a long run is + # observable live and survives a crash mid-run. + self._run.flush() class RunRecorder: @@ -154,15 +158,15 @@ class RunRecorder: self._series: list[dict] = [] self._started = time.time() self._t0 = time.monotonic() + # Stable identity so incremental flushes update the same run entry + # instead of inserting a duplicate on every series. + self._run_id = uuid.uuid4().hex def begin_series(self, title: str) -> SeriesRecorder: return SeriesRecorder(self, title, enabled=self._enabled) - def finish(self) -> dict | None: - """Aggregates the run and persists it. Returns the run dict.""" - if not self._enabled: - return None - + def _snapshot(self) -> dict: + """Aggregates the run's current state into a serialisable dict.""" step_totals: dict[str, float] = {} series_step_totals: dict[str, float] = {} chapter_count = 0 @@ -176,7 +180,8 @@ class RunRecorder: step_totals[step] = round( step_totals.get(step, 0.0) + secs, 4) - run = { + return { + "runId": self._run_id, "startedAt": round(self._started), "finishedAt": round(time.time()), "totalSeconds": round(time.monotonic() - self._t0, 4), @@ -186,9 +191,19 @@ class RunRecorder: "seriesStepTotals": series_step_totals, "series": self._series, } - self._stats._append_run(run) + + def flush(self) -> dict | None: + """Writes the run's current state to disk (upsert by runId).""" + if not self._enabled: + return None + run = self._snapshot() + self._stats._upsert_run(run) return run + def finish(self) -> dict | None: + """Persists the final run state. Returns the run dict.""" + return self.flush() + class PerfStats: """ @@ -227,14 +242,24 @@ class PerfStats: return {"runs": []} return data - def _append_run(self, run: dict) -> None: + def _upsert_run(self, run: dict) -> None: + """ + Inserts a new run (newest first) or replaces the existing entry with + the same runId — so incremental flushes during a run update one entry + rather than appending a duplicate after every series. + """ if not self._path: return with self._lock: - data = self.all() - runs = data["runs"] - runs.insert(0, run) # newest first - del runs[_MAX_RUNS:] # cap history + runs = self.all()["runs"] + run_id = run.get("runId") + for i, existing in enumerate(runs): + if existing.get("runId") == run_id: + runs[i] = run + break + else: + runs.insert(0, run) # newest first + del runs[_MAX_RUNS:] # cap history self._path.parent.mkdir(parents=True, exist_ok=True) tmp = self._path.with_suffix(self._path.suffix + ".tmp") with tmp.open("w", encoding="utf-8") as f: From 6ca1a245a370acbfc270d163309651c9e378f80f Mon Sep 17 00:00:00 2001 From: JohannesBOT Date: Tue, 16 Jun 2026 18:46:17 +0200 Subject: [PATCH 4/4] Person Updater overhaul --- .env.example | 7 +- docker-compose.prod.yml | 17 +- main_ln.py | 33 ++ main_manga.py | 78 +++-- src/CronRunner.py | 87 +++++ src/KavitaClient.py | 25 ++ src/KavitaPersonUpdater.py | 478 ++++++++------------------ src/PerfStats.py | 254 ++++++++++++++ src/PerfWebPage.py | 160 +++++++++ src/TextUtils.py | 27 ++ src/ln/LightNovelOrchestrator.py | 23 +- src/ln/MatchesWebApp.py | 51 +++ src/manga/KavitaVolumeCoverUpdater.py | 146 +++----- src/manga/MatchesWebApp.py | 193 +++-------- src/manga/PerfStats.py | 267 -------------- src/manga/SuwayomiMover.py | 58 +--- 16 files changed, 984 insertions(+), 920 deletions(-) create mode 100644 src/CronRunner.py create mode 100644 src/PerfStats.py create mode 100644 src/PerfWebPage.py delete mode 100644 src/manga/PerfStats.py diff --git a/.env.example b/.env.example index 13eb8fb..a33a046 100644 --- a/.env.example +++ b/.env.example @@ -11,13 +11,18 @@ HOST_MANGA_CONFIG_PATH=/path/to/manga-config MANGA_WEB_PORT=8080 SETTLE_SECONDS=600 DELETE_SOURCE=true +# Periodic updaters (volume/cover + global person sync) run together on +# this cron. Sundays 10:00. Person updater also covers LN libraries. UPDATER_ENABLED=true -UPDATER_SCHEDULE=0 19 * * 1,4 +UPDATER_SCHEDULE=0 10 * * 0 COVER_CACHE_PATH=/config/covers PERF_PATH=/config/perf_stats.json +VOLUME_PERF_PATH=/config/volume_perf_stats.json +PERSON_PERF_PATH=/config/person_perf_stats.json # Light-novel container (kavita-lightnovel-metadata-fetcher) HOST_LN_CONFIG_PATH=/path/to/ln-config LN_WEB_PORT=8081 LN_LIBRARY_IDS=3,5 +LN_UPDATER_ENABLED=true diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 4d03f31..1e8d272 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -13,17 +13,18 @@ services: SETTLE_SECONDS: "${SETTLE_SECONDS:-600}" DELETE_SOURCE: "${DELETE_SOURCE:-true}" MATCH_PATH: "/config/matches.json" - # Volume/cover back-fill updater + # Periodic updaters (volume/cover back-fill + global person sync) run + # together on this cron. "0 10 * * 0" = Sundays 10:00 (local time, see TZ) UPDATER_ENABLED: "${UPDATER_ENABLED:-true}" - # Cron expression: "0 19 * * 1,4" = 19:00 every Monday and Thursday - # (local time, see TZ) - UPDATER_SCHEDULE: "${UPDATER_SCHEDULE:-0 19 * * 1,4}" + UPDATER_SCHEDULE: "${UPDATER_SCHEDULE:-0 10 * * 0}" UPDATER_LOG: "/config/volume_updater.log" # Persistent cover cache (empty = temp dir, deleted on container stop) COVER_CACHE_PATH: "${COVER_CACHE_PATH:-/config/covers}" - # Per-step move timing stats (viewable at /perf); empty disables it + # Per-step timing stats (viewable at /perf, /perf/volume, /perf/person) PERF_PATH: "${PERF_PATH:-/config/perf_stats.json}" - # Timezone for the cron schedule — without this 19:00 means 19:00 UTC + VOLUME_PERF_PATH: "${VOLUME_PERF_PATH:-/config/volume_perf_stats.json}" + PERSON_PERF_PATH: "${PERSON_PERF_PATH:-/config/person_perf_stats.json}" + # Timezone for the cron schedule — without this 10:00 means 10:00 UTC TZ: "${TZ:-Europe/Berlin}" ports: - "${MANGA_WEB_PORT:-8080}:8080" @@ -45,6 +46,10 @@ services: LIBRARY_IDS: "${LN_LIBRARY_IDS}" LANGUAGE: "${LANGUAGE:-en}" MATCH_PATH: "/config/matches.json" + # Global person sync on cron (same default cadence as the manga side) + UPDATER_ENABLED: "${LN_UPDATER_ENABLED:-true}" + UPDATER_SCHEDULE: "${UPDATER_SCHEDULE:-0 10 * * 0}" + PERSON_PERF_PATH: "${PERSON_PERF_PATH:-/config/person_perf_stats.json}" TZ: "${TZ:-Europe/Berlin}" ports: - "${LN_WEB_PORT:-8081}:8080" diff --git a/main_ln.py b/main_ln.py index 792a6af..f86fa45 100644 --- a/main_ln.py +++ b/main_ln.py @@ -27,6 +27,12 @@ Environment variables MATCH_PATH default /config/matches.json WEB_PORT default 8080 WEB_HOST default 0.0.0.0 + UPDATER_ENABLED default true (run the person updater on cron) + UPDATER_SCHEDULE cron expression for the person updater, + default "0 10 * * 0" = Sundays 10:00 + (local time — set TZ inside the container!) + PERSON_PERF_PATH JSON file for person updater timing. + Default /config/person_perf_stats.json """ from __future__ import annotations @@ -51,6 +57,15 @@ sys.path.insert(0, str(_BASE / "src" / "ln")) from MatchesCache import MatchesCache # noqa: E402 from LightNovelOrchestrator import LightNovelOrchestrator # noqa: E402 from MatchesWebApp import MatchesWebApp # noqa: E402 +from PerfStats import PerfStats # noqa: E402 +from CronRunner import CronRunner # noqa: E402 + + +def _env_bool(name: str, default: bool) -> bool: + raw = os.environ.get(name) + if raw is None: + return default + return raw.strip().lower() in ("1", "true", "yes", "y", "on") def _env_str(name: str, default: "str | None" = None, @@ -98,6 +113,10 @@ def main() -> int: web_host = _env_str("WEB_HOST", "0.0.0.0") or "0.0.0.0" web_port = _env_int("WEB_PORT", 8080) library_ids = _env_int_list("LIBRARY_IDS") + updater_enabled = _env_bool("UPDATER_ENABLED", True) + updater_schedule = _env_str("UPDATER_SCHEDULE", "0 10 * * 0") + person_perf_path = _env_str("PERSON_PERF_PATH", + "/config/person_perf_stats.json") or None print(f"[main] kavita url = {kavita_url}", flush=True) print(f"[main] language = {language}", flush=True) @@ -107,6 +126,7 @@ def main() -> int: print(f"[main] web = {web_host}:{web_port}", flush=True) cache = MatchesCache(match_path) + person_perf = PerfStats(person_perf_path) orchestrator = LightNovelOrchestrator( kavita_url=kavita_url, kavita_api_key=kavita_api_key, @@ -118,9 +138,22 @@ def main() -> int: app = MatchesWebApp( cache, orchestrator=orchestrator, default_library_ids=library_ids, + person_perf=person_perf, host=web_host, port=web_port, ) app.start() + + if updater_enabled: + try: + CronRunner( + updater_schedule, + lambda: orchestrator.sync_persons(trigger="cron", + perf=person_perf), + name="person-updater").start() + except ValueError as exc: + print(f"[main] UPDATER_SCHEDULE invalid ({exc}); " + f"scheduled person sync DISABLED", flush=True) + app.wait() return 0 diff --git a/main_manga.py b/main_manga.py index 606accf..b8dc0d0 100644 --- a/main_manga.py +++ b/main_manga.py @@ -28,15 +28,19 @@ Environment variables MATCH_PATH default /config/matches.json WEB_PORT default 8080 (Flask web UI for matches.json) WEB_HOST default 0.0.0.0 - UPDATER_ENABLED default true (volume/cover back-fill cron) - UPDATER_SCHEDULE cron expression for the updater scans, - default "0 19 * * 1,4" = 19:00 every Mon + Thu + UPDATER_ENABLED default true (run volume/cover + person updaters on cron) + UPDATER_SCHEDULE cron expression for the periodic updaters, + default "0 10 * * 0" = Sundays 10:00 (local time — set TZ inside the container!) UPDATER_LOG default /config/volume_updater.log COVER_CACHE_PATH directory for the persistent cover cache; empty (default) = temporary cache, deleted on exit - PERF_PATH JSON file for per-step move timing stats; - empty disables profiling. Default /config/perf_stats.json + PERF_PATH JSON file for per-step move timing stats. + Default /config/perf_stats.json (empty disables it) + VOLUME_PERF_PATH JSON file for volume/cover updater timing. + Default /config/volume_perf_stats.json + PERSON_PERF_PATH JSON file for person updater timing. + Default /config/person_perf_stats.json """ from __future__ import annotations @@ -62,7 +66,10 @@ from SuwayomiFolderWatcher import SuwayomiFolderWatcher # noqa: E402,F401 from MatchesCache import MatchesCache # noqa: E402 from MatchesWebApp import MatchesWebApp # noqa: E402 from KavitaVolumeCoverUpdater import KavitaVolumeCoverUpdater # noqa: E402 +from KavitaClient import KavitaClient # noqa: E402 +from KavitaPersonUpdater import KavitaPersonUpdater # noqa: E402 from PerfStats import PerfStats # noqa: E402 +from CronRunner import CronRunner # noqa: E402 def _env_str(name: str, default: "str | None" = None, @@ -106,10 +113,14 @@ def main() -> int: web_host = _env_str("WEB_HOST", "0.0.0.0") or "0.0.0.0" web_port = _env_int("WEB_PORT", 8080) updater_enabled = _env_bool("UPDATER_ENABLED", True) - updater_schedule = _env_str("UPDATER_SCHEDULE", "0 10 * * 1,4") + updater_schedule = _env_str("UPDATER_SCHEDULE", "0 10 * * 0") updater_log = _env_str("UPDATER_LOG", "/config/volume_updater.log") cover_cache_path = _env_str("COVER_CACHE_PATH", "") or None perf_path = _env_str("PERF_PATH", "/config/perf_stats.json") or None + volume_perf_path = _env_str("VOLUME_PERF_PATH", + "/config/volume_perf_stats.json") or None + person_perf_path = _env_str("PERSON_PERF_PATH", + "/config/person_perf_stats.json") or None print(f"[main] suwayomi = {suwayomi_path}", flush=True) print(f"[main] kavita = {kavita_path}", flush=True) @@ -121,43 +132,62 @@ def main() -> int: print(f"[main] web = {web_host}:{web_port}", flush=True) matches_cache = MatchesCache(match_path) - perf_stats = PerfStats(perf_path) + perf_move = PerfStats(perf_path) + perf_volume = PerfStats(volume_perf_path) + perf_person = PerfStats(person_perf_path) mover = SuwayomiMover( suwayomi_path, kavita_path, - kavita_base_url=kavita_url, - kavita_api_key=kavita_api_key, language=language, request_timeout=request_timeout, delete_source=delete_source, matches_cache=matches_cache, cover_cache_dir=cover_cache_path, - perf_stats=perf_stats, + perf_stats=perf_move, ) + # Standalone, global, id-based person updater (manga + LN libraries). + person_updater = None + if kavita_api_key: + kavita_client = KavitaClient(kavita_url, kavita_api_key, + request_timeout=request_timeout) + person_updater = KavitaPersonUpdater(kavita_client) + # watcher = SuwayomiFolderWatcher(suwayomi_path, mover, settle_seconds=settle_seconds) - web_app = MatchesWebApp(matches_cache, mover=mover, perf_stats=perf_stats, - host=web_host, port=web_port) + web_app = MatchesWebApp( + matches_cache, mover=mover, + person_updater=person_updater, person_trigger="web", + perf_stats={"move": perf_move, "volume": perf_volume, + "person": perf_person}, + host=web_host, port=web_port) web_app.start() if updater_enabled: + updater = KavitaVolumeCoverUpdater( + kavita_path, + matches_cache=matches_cache, + language=language, + request_timeout=request_timeout, + log_path=updater_log, + cover_cache_dir=cover_cache_path, + perf_stats=perf_volume, + ) + + def _scheduled_job(): + updater.update_all() + if person_updater is not None: + person_updater.update_all_persons(trigger="cron", + perf=perf_person) + try: - updater = KavitaVolumeCoverUpdater( - kavita_path, - matches_cache=matches_cache, - language=language, - request_timeout=request_timeout, - log_path=updater_log, - schedule=updater_schedule, - cover_cache_dir=cover_cache_path, - ) - updater.start() + CronRunner(updater_schedule, _scheduled_job, + name="updaters").start() except ValueError as exc: # Invalid cron expression — keep the service up, just without - # the updater, and make the config error obvious in the logs. + # the scheduled updaters, and surface the config error. print(f"[main] UPDATER_SCHEDULE invalid ({exc}); " - f"volume/cover updater DISABLED", flush=True) + f"scheduled updaters DISABLED", flush=True) # watcher.start() # watcher.wait() # blocks until stop() is called via a signal diff --git a/src/CronRunner.py b/src/CronRunner.py new file mode 100644 index 0000000..e8e2a50 --- /dev/null +++ b/src/CronRunner.py @@ -0,0 +1,87 @@ +""" +cron_runner.py +============== + +Runs a single callable on a cron schedule on a background thread. + +Decouples *what* runs from *when*: both the manga container (volume/cover +updater + person updater) and the LN container (person updater) schedule +their work through this one helper, using a shared ``CronSchedule`` for the +``next_after`` arithmetic. + +Usage:: + + runner = CronRunner("0 10 * * 0", job=my_callable) # Sundays 10:00 + runner.start() + ... + runner.stop() + +When the schedule string is invalid, the CronSchedule constructor raises +ValueError — the caller decides whether to disable the runner or fall back. +The schedule is evaluated in local time (set TZ inside the container). +""" + +from __future__ import annotations + +import threading +from datetime import datetime + +from CronSchedule import CronSchedule + + +def _now() -> str: + return datetime.now().isoformat(timespec="seconds") + + +class CronRunner: + """ + Fires ``job()`` whenever the cron ``schedule`` elapses. + + Parameters + ---------- + schedule : 5-field cron expression (see CronSchedule). + job : Zero-arg callable invoked on each scheduled tick. Exceptions + are caught and logged so a failing run does not kill the loop. + name : Thread name (for logs). + """ + + def __init__(self, schedule: str, job, *, name: str = "CronRunner"): + self._cron = CronSchedule(schedule) + self._job = job + self._name = name + self._stop = threading.Event() + self._thread: "threading.Thread | None" = None + + def start(self) -> None: + """Starts the scheduling thread. Non-blocking.""" + if self._thread is not None and self._thread.is_alive(): + return + self._stop.clear() + self._thread = threading.Thread( + target=self._loop, name=self._name, daemon=True) + self._thread.start() + print(f"[{_now()}] [{self._name}] scheduled on " + f"cron '{self._cron.expression}'", flush=True) + + def stop(self) -> None: + """Signals the loop to stop (a job already running finishes first).""" + self._stop.set() + if self._thread is not None: + self._thread.join(timeout=10) + + def wait(self) -> None: + """Blocks the calling thread until stop() is invoked.""" + self._stop.wait() + + def _loop(self) -> None: + while not self._stop.is_set(): + next_run = self._cron.next_after(datetime.now()) + wait = max(0.0, (next_run - datetime.now()).total_seconds()) + print(f"[{_now()}] [{self._name}] next run: " + f"{next_run.isoformat(timespec='minutes')}", flush=True) + if self._stop.wait(wait): + break + try: + self._job() + except Exception as exc: + print(f"[{_now()}] [{self._name}] job ERROR: {exc}", flush=True) diff --git a/src/KavitaClient.py b/src/KavitaClient.py index 6a980f6..400e87f 100644 --- a/src/KavitaClient.py +++ b/src/KavitaClient.py @@ -204,6 +204,31 @@ class KavitaClient: r.raise_for_status() return r.json() or [] + def list_all_persons(self, *, page_size: int = 200) -> list[dict]: + """ + Returns every PersonDto in the instance. + + Pages through POST /api/Person/all (the browse endpoint) with an + empty filter until an empty page is returned — same paging pattern + as list_series_in_library. + """ + results: list[dict] = [] + page = 1 + while True: + r = self._session.post( + f"{self._base}/api/Person/all", + params={"PageNumber": page, "PageSize": page_size}, + json={}, timeout=self._timeout) + r.raise_for_status() + chunk = r.json() or [] + if not chunk: + break + results.extend(chunk) + if len(chunk) < page_size: + break + page += 1 + return results + def update_person(self, payload: dict) -> None: """Writes a person record (malId, aniListId, description, …).""" r = self._session.post(f"{self._base}/api/Person/update", diff --git a/src/KavitaPersonUpdater.py b/src/KavitaPersonUpdater.py index ae0036b..5d2e30c 100644 --- a/src/KavitaPersonUpdater.py +++ b/src/KavitaPersonUpdater.py @@ -2,407 +2,220 @@ kavita_person_updater.py ======================== -Synchronises Kavita person / character records with MyAnimeList data. +Synchronises Kavita character person-records with MyAnimeList / AniList data. -For every character and staff member that MAL knows about for a given manga -the updater: - 1. Searches Kavita for a matching Person record (by name similarity / - alias match, configurable threshold). - 2. Sets the MAL ID on the Kavita person if it is not yet linked. - 3. Uploads the MAL profile image when the cover is not locked and has - not been set in a previous sync run. - 4. Populates the description field when Kavita has none and MAL provides - an 'about' text (requires an extra Jikan request per character; only - performed when update_descriptions=True). +Global, id-based mode +--------------------- +Kavita person-records are created with a disambiguated name carrying the +tracker *character* id, e.g. ``Rem (MAL 118737)`` (manga: written into +ComicInfo ; light novels: written by the metadata builder). +``update_all_persons`` walks **every** person in the Kavita instance, reads +that id from the name, looks the character up on MAL / AniList by id, and +writes back: + + * the tracker id into the ``malId`` / ``aniListId`` field (when still empty), + * a description (when the record has none), + * the profile image (when not locked and not already set). + +Persons whose name carries no id (authors / staff, which are not +disambiguated) are skipped. A record already linked to a *different* +tracker id than its name says is reported as a conflict and left untouched. + +This mode is format-independent (it only does id lookups, never title +searches) so a single pass covers both the manga and light-novel libraries. All HTTP traffic to Kavita goes through the shared :class:`KavitaClient` -(`/api/Person/search`, `/api/Person/update`, `/api/Upload/person`). +(`/api/Person/all`, `/api/Person/update`, `/api/Upload/person`). Tested against Kavita 0.9.0.2. """ from __future__ import annotations -import datetime - import requests from KavitaClient import KavitaClient from MALResolver import MALResolver from AniListResolver import AniListResolver -from TextUtils import best_similarity, paragraphs_to_html, person_name_with_id +from PerfStats import PerfStats +from TextUtils import paragraphs_to_html, parse_person_tracker_id class KavitaPersonUpdater: """ - Syncs Kavita Person records with MyAnimeList data. + Syncs Kavita character person-records with MAL / AniList data, keyed by + the tracker id embedded in each person's name. Parameters ---------- - client : Shared KavitaClient (session, auth, cover uploads) - mal_resolver : Shared MALResolver singleton (created automatically if omitted) - al_resolver : Shared AniListResolver singleton (created automatically if omitted) - min_name_score : Minimum difflib similarity ratio (0–1) required to accept a - Kavita person as a match for a MAL name. Default 0.80. + client : Shared KavitaClient (session, auth, cover uploads). + mal_resolver : Shared MALResolver singleton (created if omitted). + al_resolver : Shared AniListResolver singleton (created if omitted). """ def __init__(self, client: KavitaClient, *, mal_resolver: "MALResolver | None" = None, - al_resolver: "AniListResolver | None" = None, - min_name_score: float = 0.80): + al_resolver: "AniListResolver | None" = None): self._client = client - self._min_score = min_name_score self._mal = mal_resolver or MALResolver() self._al = al_resolver or AniListResolver() - # Cache: normalised name -> list of PersonDto dicts (best matches first) - self._person_search_cache: dict[str, list[dict]] = {} - # ------------------------------------------------------------------ - # Public: combined update + # Public: global person sync # ------------------------------------------------------------------ - def update_for_manga(self, mal_manga_id: "int | None", *, - al_manga_id: "int | None" = None, - update_covers: bool = True, - update_descriptions: bool = True) -> dict: + def update_all_persons(self, *, + trigger: str = "cron", + perf: "PerfStats | None" = None, + update_covers: bool = True, + update_descriptions: bool = True) -> dict: """ - Runs a full update pass for both characters and staff of the manga. - MAL is tried first; AniList is used as fallback when MAL returns nothing. + Walks every Kavita person, syncing the ones whose name carries a + tracker character id. - Returns - ------- - { - "characters": {"updated": n, "skipped": n, "not_found": n}, - "staff": {"updated": n, "skipped": n, "not_found": n}, - } + Parameters + ---------- + trigger : Source that started this run ("cron" | "web" | "ln") — + recorded in the perf-stats run meta. + perf : Optional PerfStats for per-person step timing. + + Returns {"trigger", "updated", "skipped", "not_found", + "conflicts", "errors"}. """ - return { - "characters": self.update_characters( - mal_manga_id, al_manga_id=al_manga_id, - update_covers=update_covers, - update_descriptions=update_descriptions), - "staff": self.update_staff( - mal_manga_id, al_manga_id=al_manga_id, - update_covers=update_covers, - update_descriptions=update_descriptions), - } + perf = perf or PerfStats(None) + run = perf.begin_run(meta={"trigger": trigger}) + result: dict = {"trigger": trigger, "updated": 0, "skipped": 0, + "not_found": 0, "conflicts": 0, "errors": []} - # ------------------------------------------------------------------ - # Public: character update - # ------------------------------------------------------------------ - def update_characters(self, mal_manga_id: "int | None", *, - al_manga_id: "int | None" = None, - update_covers: bool = True, - update_descriptions: bool = True) -> dict: - """ - Updates Kavita persons that match MAL/AniList characters for the manga. - MAL is tried first; AniList is the fallback when MAL returns nothing. + try: + persons = self._client.list_all_persons() + except requests.RequestException as exc: + result["errors"].append(f"list persons failed: {exc}") + run.finish() + return result - Returns {"updated": n, "skipped": n, "not_found": n}. - """ - entries = self._mal.get_characters_detailed(mal_manga_id) if mal_manga_id else [] - resolver = self._mal - if not entries and al_manga_id: - entries = self._al.get_characters_detailed(al_manga_id) - resolver = self._al - return self._sync_entries(entries, "character", resolver, - update_covers=update_covers, - update_descriptions=update_descriptions) - - # ------------------------------------------------------------------ - # Public: staff update - # ------------------------------------------------------------------ - def update_staff(self, mal_manga_id: "int | None", *, - al_manga_id: "int | None" = None, - update_covers: bool = True, - update_descriptions: bool = True) -> dict: - """ - Updates Kavita persons that match MAL/AniList staff for the manga. - MAL is tried first; AniList is the fallback when MAL returns nothing. - - Returns {"updated": n, "skipped": n, "not_found": n}. - """ - entries = self._mal.get_staff_detailed(mal_manga_id) if mal_manga_id else [] - resolver = self._mal - if not entries and al_manga_id: - entries = self._al.get_staff_detailed(al_manga_id) - resolver = self._al - return self._sync_entries(entries, "staff", resolver, - update_covers=update_covers, - update_descriptions=update_descriptions) - - # ------------------------------------------------------------------ - # Public: cache management - # ------------------------------------------------------------------ - def clear_cache(self) -> None: - """Clears the Kavita person search cache.""" - self._person_search_cache.clear() - - # ------------------------------------------------------------------ - # Internal: main sync loop - # ------------------------------------------------------------------ - def _sync_entries(self, entries: list[dict], kind: str, resolver, *, - update_covers: bool, - update_descriptions: bool) -> dict: - result: dict = {"updated": 0, "skipped": 0, "not_found": 0, - "errors": []} - for entry in entries: - name = (entry.get("name") or "").strip() - raw_name = (entry.get("raw_name") or "").strip() - if not name and not raw_name: + for person in persons: + name = (person.get("name") or "").strip() + parsed = parse_person_tracker_id(name) + if not parsed: + result["skipped"] += 1 # author/staff or un-tagged continue - if kind == "character": - # Characters are stored under their disambiguated name - # ("Rem (MAL 118737)") — see person_name_with_id. The - # series metadata write creates the person under exactly - # this name, so only that form is searched. - search_names = [person_name_with_id( - name, mal_id=entry.get("mal_id"), - al_id=entry.get("al_id"))] - else: - # Staff: cleaned (XML-safe) name first; if Kavita stores - # the legacy comma form, retry with the raw MAL name. - search_names = [name] - if raw_name and raw_name != name: - search_names.append(raw_name) - - matches: list[dict] = [] - for search_name in search_names: - if not search_name: - continue - matches = self._find_kavita_person(search_name) - if matches: - break - - if not matches: - result["not_found"] += 1 - continue - - changed = self._apply_mal_data( - matches[0], entry, kind, resolver, - update_cover=update_covers, - update_desc=update_descriptions, - errors=result["errors"]) - result["updated" if changed else "skipped"] += 1 + source, tracker_id = parsed + item = run.begin_item(name) + ok = True + try: + category = self._apply_to_person( + person, source, tracker_id, item, + update_cover=update_covers, + update_desc=update_descriptions, + errors=result["errors"]) + result[category] += 1 + ok = category != "conflicts" + except Exception as exc: + result["errors"].append(f"{name}: {exc}") + ok = False + finally: + item.finish(ok=ok) + run.finish() + print(f"[persons] trigger={trigger} updated={result['updated']} " + f"skipped={result['skipped']} not_found={result['not_found']} " + f"conflicts={result['conflicts']} errors={len(result['errors'])}", + flush=True) return result # ------------------------------------------------------------------ - # Internal: Kavita person search + # Internal: apply tracker data to one person # ------------------------------------------------------------------ - def _find_kavita_person(self, name: str) -> list[dict]: + def _apply_to_person(self, person: dict, source: str, tracker_id: int, + item, *, update_cover: bool, update_desc: bool, + errors: list) -> str: """ - Searches Kavita for persons matching `name`. - - Checks both the main name and any stored aliases. - Returns persons sorted by similarity, filtered by min_name_score. - Results are cached per (normalised) query name. + Applies MAL/AniList character data to one Kavita person. + Returns the result category: "updated" | "skipped" | "not_found" + | "conflicts". """ - key = name.lower().strip() - if key in self._person_search_cache: - return self._person_search_cache[key] - - try: - persons = self._client.search_persons(name) - except requests.RequestException: - self._person_search_cache[key] = [] - return [] - - scored = [] - for p in persons: - candidates = [p.get("name")] + list(p.get("aliases") or []) - scored.append((best_similarity(key, candidates), p)) - scored.sort(key=lambda pair: pair[0], reverse=True) - filtered = [p for score, p in scored if score >= self._min_score] - self._person_search_cache[key] = filtered - return filtered - - # ------------------------------------------------------------------ - # Internal: apply MAL data to a single Kavita person - # ------------------------------------------------------------------ - def _apply_mal_data(self, person: dict, mal_entry: dict, kind: str, - resolver, *, - update_cover: bool, update_desc: bool, - errors: "list | None" = None) -> bool: - """ - Applies tracker data (MAL or AniList) to one Kavita person record. - - Fields updated - -------------- - - malId : set when the entry carries a MAL ID and it differs - - aniListId : set when the entry carries an AniList ID and it differs - - description: set when empty and the tracker provides a description - - cover image: uploaded when not locked and no prior sync cover exists - - Returns True if any change was made. Failures are appended to the - `errors` list (if provided) instead of being silently swallowed. - """ - person_id: "int | None" = person.get("id") + person_id = person.get("id") if not person_id: - return False + return "skipped" - person_name = person.get("name") or "" + resolver = self._mal if source == "mal" else self._al + id_field = "malId" if source == "mal" else "aniListId" + current = person.get(id_field) or 0 - # Tracker IDs — a MAL entry has mal_id set; an AniList entry has al_id. - mal_id: "int | None" = mal_entry.get("mal_id") - al_id: "int | None" = mal_entry.get("al_id") - entity_id = mal_id or al_id # used for resolver detail calls + # The name is authoritative; a record linked to a different id is a + # data conflict — never overwrite it. + if current and current != tracker_id: + errors.append( + f"conflict: '{person.get('name')}' (#{person_id}) has " + f"{id_field}={current} but name says {tracker_id} — skipped") + return "conflicts" - current_mal_id: int = person.get("malId") or 0 - current_al_id: int = person.get("aniListId") or 0 + with item.measure("detail_fetch"): + details = resolver.get_character_details(tracker_id) + if not details: + return "not_found" - # Collision guard: the Kavita person is already linked to a - # *different* tracker entity — same display name, different - # character/person. Never overwrite; first writer wins. - if ((mal_id and current_mal_id and current_mal_id != mal_id) - or (al_id and current_al_id and current_al_id != al_id)): - if errors is not None: - errors.append( - f"conflict: '{person_name}' (#{person_id}) is linked to " - f"malId={current_mal_id or '-'}/aniListId={current_al_id or '-'} " - f"but this entry has malId={mal_id or '-'}/aniListId={al_id or '-'} " - f"— skipped") - return False - - needs_mal_id = bool(mal_id and current_mal_id != mal_id) - needs_al_id = bool(al_id and current_al_id != al_id) - - # ------ Lazy description fetch ----------------------------------- - description: "str | None" = None + need_id = not current # write id when still missing + description = None if update_desc and not (person.get("description") or "").strip(): - if entity_id: - if kind == "character": - details = resolver.get_character_details(entity_id) - if details: - description = _build_character_description(details) or None - else: - details = resolver.get_person_details(entity_id) - if details: - description = _build_person_description(details) or None + description = _build_character_description(details) or None + need_desc = bool(description) - needs_desc = bool(description) - - # ------ Metadata update ------------------------------------------ changed = False - if needs_mal_id or needs_al_id or needs_desc: - payload: dict = { + if need_id or need_desc: + payload = { "id": person_id, - "name": person_name, - # MUST stay a boolean — the cover image itself is uploaded - # separately via POST /api/Upload/person (below). Putting a - # URL here makes Kavita reject the whole payload with HTTP 400. + "name": person.get("name") or "", + # MUST stay a boolean — the cover is uploaded separately. "coverImageLocked": bool(person.get("coverImageLocked", False)), "aliases": person.get("aliases") or [], "description": description or person.get("description"), - "malId": mal_id if needs_mal_id else (current_mal_id or None), - "aniListId": al_id if needs_al_id else (current_al_id or None), + "malId": tracker_id if source == "mal" + else (person.get("malId") or None), + "aniListId": tracker_id if source == "al" + else (person.get("aniListId") or None), } try: - self._client.update_person(payload) + with item.measure("person_update"): + self._client.update_person(payload) changed = True - except requests.RequestException as e: - if errors is not None: - errors.append( - f"Person/update failed for #{person_id} " - f"'{person_name}': {e}") + except requests.RequestException as exc: + errors.append(f"update failed #{person_id} " + f"'{person.get('name')}': {exc}") - # ------ Cover image upload ---------------------------------------- - # Upload whenever: - # - caller requested cover updates - # - cover is NOT locked (user did not manually pin it) - # - we have not already uploaded this exact tracker entity's image - # (i.e. the tracked ID differs OR there is no cover yet). + # Cover: upload when not locked and not already set for this id. if update_cover and not person.get("coverImageLocked"): - image_url = mal_entry.get("image_url") - already_uploaded = ( - entity_id is not None - and (current_mal_id == mal_id or current_al_id == al_id) - and bool(person.get("coverImage")) - ) - if image_url and not already_uploaded: + image_url = details.get("image_url") + already = bool(current) and bool(person.get("coverImage")) + if image_url and not already: try: - self._client.upload_person_cover(person_id, image_url) + with item.measure("cover_upload"): + self._client.upload_person_cover(person_id, image_url) changed = True - except requests.RequestException as e: - if errors is not None: - errors.append( - f"cover upload failed for #{person_id} " - f"'{person_name}' ({image_url}): {e}") + except requests.RequestException as exc: + errors.append(f"cover upload failed #{person_id} " + f"'{person.get('name')}': {exc}") - return changed + return "updated" if changed else "skipped" # -------------------------------------------------------------------------- -# Module helpers: description builders +# Module helper: character description builder # -------------------------------------------------------------------------- -def _format_birthday(birthday: str) -> str: - """Converts an ISO 8601 birthday string to "D Month YYYY".""" - if not birthday: - return "" - try: - dt = datetime.date.fromisoformat(birthday.split("T")[0]) - return f"{dt.day} {dt.strftime('%B %Y')}" - except (ValueError, AttributeError): - return "" - - def _build_character_description(details: dict) -> str: """ - Builds a Kavita-safe HTML description for a MAL character. + Builds a Kavita-safe HTML description for a MAL / AniList character. - Top line: "Favorites: N" as a link to the character's MAL page. + Top line: "Favorites: N" linked to the character page (when available). Remainder: the character's `about` text converted to HTML paragraphs. """ parts: list[str] = [] url = details.get("url") or "" favorites = details.get("favorites") if url and favorites is not None: - parts.append(f'

Favorites: {favorites:,}

') - about = (details.get("about") or "").strip() - if about: - parts.append(paragraphs_to_html(about)) - return "
".join(parts) - - -def _build_person_description(details: dict) -> str: - """ - Builds a Kavita-safe HTML description for a MAL person (mangaka / staff). - - Renders a summary table (given name, family name, birthday, website, - member favorites) followed by the `about` biography as HTML paragraphs. - """ - _TD = 'style="padding-right:1.5em"' - rows: list[str] = [] - - given = (details.get("given_name") or "").strip() - family = (details.get("family_name") or "").strip() - birthday = details.get("birthday") or "" - favorites = details.get("favorites") - website = (details.get("website_url") or "").strip() - url = (details.get("url") or "").strip() - - if given: - rows.append(f"Given name{given}") - if family: - rows.append(f"Family name{family}") - bday_str = _format_birthday(birthday) - if bday_str: - rows.append(f"Birthday{bday_str}") - if website: - rows.append( - f'Website' - f'{website}' - ) - if favorites is not None: - fav_cell = (f'{favorites:,}' if url - else f"{favorites:,}") - rows.append( - f"Member Favorites{fav_cell}") - - parts: list[str] = [] - if rows: - parts.append(f'{"".join(rows)}
') + parts.append(f'

' + f'Favorites: {favorites:,}

') about = (details.get("about") or "").strip() if about: parts.append(paragraphs_to_html(about)) @@ -418,18 +231,7 @@ if __name__ == "__main__": client = KavitaClient(os.environ["KAVITA_URL"], os.environ["KAVITA_API_KEY"]) updater = KavitaPersonUpdater(client) - - mal = MALResolver() - mal_id = mal.find_mal_id("よふかしのうた") - print("MAL ID:", mal_id) - - if mal_id: - result = updater.update_for_manga(mal_id) - print("Characters:", {k: v for k, v in result["characters"].items() - if k != "errors"}) - print("Staff :", {k: v for k, v in result["staff"].items() - if k != "errors"}) - # Surface any non-fatal upload / API errors for debugging - for section in ("characters", "staff"): - for err in result[section].get("errors", []): - print(f"[{section}] {err}") + report = updater.update_all_persons(trigger="cron") + print(report) + for err in report["errors"]: + print(" ", err) diff --git a/src/PerfStats.py b/src/PerfStats.py new file mode 100644 index 0000000..03b016d --- /dev/null +++ b/src/PerfStats.py @@ -0,0 +1,254 @@ +""" +perf_stats.py +============= + +Generic run/step performance profiler with JSON persistence, shared by the +move pipeline and the periodic updaters (volume/cover, persons). + +Each run is a tree of *items* (e.g. series -> chapter, or one person) and +every item carries named *step* timings. A run also carries free-form +``meta`` (e.g. the trigger source ``"cron" | "web" | "ln"`` for the person +updater). + +Data model (one entry per run, newest first):: + + { + "runs": [ + { + "runId": "…", + "startedAt": 1700000000, + "finishedAt": 1700000123, + "totalSeconds": 123.4, + "meta": {"trigger": "cron"}, + "itemCount": 2, # top-level items + "leafCount": 31, # items without children + "stepTotals": {"cover": 41.2, "image_dimensions": 55.8, ...}, + "items": [ + {"label": "Call of the Night", "totalSeconds": 60.2, "ok": true, + "steps": {"fetch_metadata": 1.2}, + "items": [ + {"label": "1", "totalSeconds": 11.5, "ok": true, + "steps": {"cover": 1.8, "pack_cbz": 2.9}, "items": []} + ]} + ] + } + ] + } + +Usage:: + + perf = PerfStats(path) # path=None -> disabled + run = perf.begin_run(meta={"trigger": "cron"}) + item = run.begin_item("Call of the Night") + with item.measure("fetch_metadata"): + ... + chap = item.begin_item("1") + with chap.measure("pack_cbz"): + ... + chap.finish() + item.finish() # flushes the run to disk + run.finish() + +When ``path`` is None every recorder is a no-op and nothing is written, so +the profiler can be left permanently wired in at negligible cost. The run +is flushed after every top-level item finishes, so a long run is observable +live and survives a crash mid-run. +""" + +from __future__ import annotations + +import json +import threading +import time +import uuid +from contextlib import contextmanager +from pathlib import Path + + +# Keep the JSON small: only the most recent runs are retained on disk. +_MAX_RUNS = 30 + + +class _StepTimer: + """ + Base recorder: accumulates ``{step_name: seconds}`` and tracks its own + wall-clock lifetime. ``enabled=False`` turns every method into a no-op. + """ + + def __init__(self, enabled: bool = True): + self.steps: dict[str, float] = {} + self._enabled = enabled + self._t0 = time.monotonic() + + @contextmanager + def measure(self, name: str): + """Context manager timing a named step (accumulates on repeat use).""" + if not self._enabled: + yield + return + start = time.monotonic() + try: + yield + finally: + self.steps[name] = round( + self.steps.get(name, 0.0) + (time.monotonic() - start), 4) + + def elapsed(self) -> float: + return round(time.monotonic() - self._t0, 4) + + +class ItemRecorder(_StepTimer): + """ + One node in a run's item tree. Has its own step timings and may contain + nested child items (e.g. a series item containing chapter items). + """ + + def __init__(self, run: "RunRecorder", label: str, *, + parent: "ItemRecorder | None" = None, + enabled: bool = True): + super().__init__(enabled) + self._run = run + self._label = label + self._parent = parent + self._children: list[dict] = [] + + def begin_item(self, label: str) -> "ItemRecorder": + return ItemRecorder(self._run, label, parent=self, + enabled=self._enabled) + + def finish(self, *, ok: bool = True) -> None: + if not self._enabled: + return + node = { + "label": self._label, + "totalSeconds": self.elapsed(), + "ok": ok, + "steps": self.steps, + "items": self._children, + } + if self._parent is not None: + self._parent._children.append(node) + else: + # Top-level item: attach to the run and persist progress. + self._run._items.append(node) + self._run.flush() + + +class RunRecorder: + """Top-level recorder for one full run.""" + + def __init__(self, stats: "PerfStats", meta: "dict | None" = None, + enabled: bool = True): + self._stats = stats + self._enabled = enabled + self._meta = meta or {} + self._items: list[dict] = [] + self._started = time.time() + self._t0 = time.monotonic() + self._run_id = uuid.uuid4().hex + + def begin_item(self, label: str) -> ItemRecorder: + return ItemRecorder(self, label, parent=None, enabled=self._enabled) + + def _snapshot(self) -> dict: + step_totals: dict[str, float] = {} + leaf_count = 0 + + def walk(node: dict) -> None: + nonlocal leaf_count + for step, secs in node["steps"].items(): + step_totals[step] = round(step_totals.get(step, 0.0) + secs, 4) + if node["items"]: + for child in node["items"]: + walk(child) + else: + leaf_count += 1 + + for item in self._items: + walk(item) + + return { + "runId": self._run_id, + "startedAt": round(self._started), + "finishedAt": round(time.time()), + "totalSeconds": round(time.monotonic() - self._t0, 4), + "meta": self._meta, + "itemCount": len(self._items), + "leafCount": leaf_count, + "stepTotals": step_totals, + "items": self._items, + } + + def flush(self) -> "dict | None": + """Writes the run's current state to disk (upsert by runId).""" + if not self._enabled: + return None + run = self._snapshot() + self._stats._upsert_run(run) + return run + + def finish(self) -> "dict | None": + """Persists the final run state. Returns the run dict.""" + return self.flush() + + +class PerfStats: + """ + Profiler facade + JSON persistence. + + Parameters + ---------- + path : Destination JSON file. None disables the profiler entirely + (every recorder becomes a no-op and nothing is written). + """ + + def __init__(self, path=None): + self._path = Path(path) if path else None + self._lock = threading.Lock() + + @property + def enabled(self) -> bool: + return self._path is not None + + def begin_run(self, meta: "dict | None" = None) -> RunRecorder: + return RunRecorder(self, meta=meta, enabled=self.enabled) + + # ------------------------------------------------------------------ + # Read / write + # ------------------------------------------------------------------ + def all(self) -> dict: + """Returns the persisted runs ({"runs": [...]}); newest first.""" + if not self._path or not self._path.is_file(): + return {"runs": []} + try: + with self._path.open("r", encoding="utf-8") as f: + data = json.load(f) + except (OSError, json.JSONDecodeError): + return {"runs": []} + if not isinstance(data, dict) or not isinstance(data.get("runs"), list): + return {"runs": []} + return data + + def _upsert_run(self, run: dict) -> None: + """ + Inserts a new run (newest first) or replaces the existing entry with + the same runId — so incremental flushes during a run update one entry + rather than appending a duplicate after every item. + """ + if not self._path: + return + with self._lock: + runs = self.all()["runs"] + run_id = run.get("runId") + for i, existing in enumerate(runs): + if existing.get("runId") == run_id: + runs[i] = run + break + else: + runs.insert(0, run) # newest first + del runs[_MAX_RUNS:] # cap history + self._path.parent.mkdir(parents=True, exist_ok=True) + tmp = self._path.with_suffix(self._path.suffix + ".tmp") + with tmp.open("w", encoding="utf-8") as f: + json.dump({"runs": runs}, f, ensure_ascii=False, indent=2) + tmp.replace(self._path) diff --git a/src/PerfWebPage.py b/src/PerfWebPage.py new file mode 100644 index 0000000..619de73 --- /dev/null +++ b/src/PerfWebPage.py @@ -0,0 +1,160 @@ +""" +perf_web_page.py +================ + +Shared HTML page for browsing PerfStats output, used by both container web +UIs. ``render_perf_page(name, tabs)`` returns a standalone page that loads +``/api/perf/`` and renders each run's step totals plus the nested item +tree (series -> chapter, or one person, …) and the run trigger from meta. + +``tabs`` is a list of ``(label, name)`` pairs for cross-links between the +available perf datasets in that container. +""" + +from __future__ import annotations + + +_PERF_PAGE = """ + + + + __PERF_NAME__ performance + + + +

Performance: __PERF_NAME__ ◂ back

+
__PERF_TABS__
+
+ + + +
+ +
+ + + + +""" + + +def render_perf_page(name: str, tabs: "list[tuple[str, str]]") -> str: + """ + Returns the perf page HTML for dataset ``name``. + + tabs : list of (label, dataset_name) for the cross-link bar. + """ + tab_html = " ".join( + f'{label}' for label, n in tabs) + return (_PERF_PAGE + .replace("__PERF_TABS__", tab_html) + .replace("__PERF_NAME__", name)) diff --git a/src/TextUtils.py b/src/TextUtils.py index 2bfe7ec..2a5e50e 100644 --- a/src/TextUtils.py +++ b/src/TextUtils.py @@ -70,3 +70,30 @@ def person_name_with_id(name: str, *, if al_id: return f"{name} (AL {al_id})" return name + + +# Matches the suffix produced by person_name_with_id at the end of a name. +_TRACKER_ID_RE = re.compile(r"\s*\((MAL|AL)\s+(\d+)\)\s*$", re.IGNORECASE) + + +def parse_person_tracker_id(name: str) -> "tuple[str, int] | None": + """ + Inverse of person_name_with_id: extracts the tracker id from a + disambiguated Kavita person name. + + "Rem (MAL 118737)" -> ("mal", 118737) + "Subaru (AL 88311)" -> ("al", 88311) + "Kotoyama" -> None (no id suffix — e.g. an author/staff record) + + Returns ("mal" | "al", id) or None. + """ + if not name: + return None + m = _TRACKER_ID_RE.search(name) + if not m: + return None + source = "mal" if m.group(1).upper() == "MAL" else "al" + try: + return source, int(m.group(2)) + except ValueError: + return None diff --git a/src/ln/LightNovelOrchestrator.py b/src/ln/LightNovelOrchestrator.py index 339819e..f866490 100644 --- a/src/ln/LightNovelOrchestrator.py +++ b/src/ln/LightNovelOrchestrator.py @@ -192,14 +192,9 @@ class LightNovelOrchestrator: except Exception as exc: return {"ok": False, "error": f"series update failed: {exc}"} - # Persons - try: - person_report = self._person_updater.update_for_manga( - built.get("malId"), - al_manga_id=built.get("anilistId"), - ) - except Exception as exc: - person_report = {"error": str(exc)} + # Person sync no longer runs per series — it has its own global, + # id-based updater (sync_persons / KavitaPersonUpdater.update_all_persons) + # on a separate cron schedule. # Relationships + collection try: @@ -221,10 +216,20 @@ class LightNovelOrchestrator: "title": cached_title, "mangabakaId": built.get("mangabakaId"), "series": series_report, - "persons": person_report, "relationships": relation_report, } + # ------------------------------------------------------------------ + # Person sync (global, id-based — independent of series updates) + # ------------------------------------------------------------------ + def sync_persons(self, *, trigger: str = "ln", perf=None) -> dict: + """ + Runs the global, id-based person updater over every Kavita person. + Covers both manga and light-novel libraries in one pass. + """ + return self._person_updater.update_all_persons( + trigger=trigger, perf=perf) + def update_all(self, library_ids: "list[int] | None") -> dict: """Updates every cached series in the given libraries.""" if library_ids is None: diff --git a/src/ln/MatchesWebApp.py b/src/ln/MatchesWebApp.py index faa7caa..b57bf86 100644 --- a/src/ln/MatchesWebApp.py +++ b/src/ln/MatchesWebApp.py @@ -37,6 +37,10 @@ from flask import Flask, jsonify, request, Response from MatchesCache import MatchesCache from LightNovelMetadataBuilder import pick_thumbnail_url +from PerfWebPage import render_perf_page + +# Only the person dataset exists in the LN container. +_PERF_TABS = [("persons", "person")] def _int_list(values) -> list[int]: @@ -97,7 +101,9 @@ _INDEX_HTML = r""" + + Performance ▸ @@ -497,12 +503,25 @@ async function startUpdateAll() { } } +async function startSyncPersons() { + if (!confirm("Sync all Kavita persons against MAL/AniList? May take a while.")) return; + setStatus("Person sync started"); + try { + const r = await fetch("/api/persons/sync", { method: "POST" }); + if (!r.ok) throw new Error(await r.text()); + startPolling(); + } catch (err) { + setStatus("Person sync failed: " + err.message); + } +} + document.getElementById("filter").addEventListener("input", applyFilter); document.getElementById("libraries").addEventListener("change", applyFilter); document.getElementById("reload").addEventListener("click", load); document.getElementById("batchSave").addEventListener("click", batchSave); document.getElementById("build").addEventListener("click", startBuild); document.getElementById("updateAll").addEventListener("click", startUpdateAll); +document.getElementById("syncPersons").addEventListener("click", startSyncPersons); for (const th of document.querySelectorAll("th.sortable")) { th.addEventListener("click", () => { const col = th.dataset.col; @@ -581,11 +600,13 @@ class MatchesWebApp: def __init__(self, cache: MatchesCache, *, orchestrator=None, default_library_ids: "list[int] | None" = None, + person_perf=None, host: str = "0.0.0.0", port: int = 8080): self._cache = cache self._orchestrator = orchestrator self._defaults = list(default_library_ids or []) + self._person_perf = person_perf self._host = host self._port = port self._job = _JobState() @@ -757,8 +778,38 @@ class MatchesWebApp: return Response("a job is already running", status=409) return jsonify({"started": label}) + @app.post("/api/persons/sync") + def api_persons_sync(): + if self._orchestrator is None: + return Response("no orchestrator configured", status=503) + + def task(job: _JobState): + report = self._orchestrator.sync_persons( + trigger="ln", perf=self._person_perf) + job.append(f"updated={report['updated']} " + f"skipped={report['skipped']} " + f"not_found={report['not_found']} " + f"conflicts={report['conflicts']}") + for err in report.get("errors", []): + job.append(f" {err}") + + if not self._job.start("person sync", task): + return Response("a job is already running", status=409) + return jsonify({"started": "person sync"}) + @app.get("/api/status") def api_status(): snap = self._job.snapshot() snap["defaults"] = self._defaults return jsonify(snap) + + @app.get("/perf") + @app.get("/perf/") + def perf_page(name: str = "person") -> Response: + return Response(render_perf_page(name, _PERF_TABS), + mimetype="text/html; charset=utf-8") + + @app.get("/api/perf/") + def api_perf(name: str): + stats = self._person_perf if name == "person" else None + return jsonify(stats.all() if stats is not None else {"runs": []}) diff --git a/src/manga/KavitaVolumeCoverUpdater.py b/src/manga/KavitaVolumeCoverUpdater.py index aba084e..6273724 100644 --- a/src/manga/KavitaVolumeCoverUpdater.py +++ b/src/manga/KavitaVolumeCoverUpdater.py @@ -45,7 +45,6 @@ from __future__ import annotations import io import sys -import threading import xml.etree.ElementTree as ET import zipfile from datetime import datetime @@ -67,7 +66,7 @@ from MatchesCache import MatchesCache from SuwayomiMover import (_load_chapter_index, _save_chapter_index, _sanitize_dirname, _normalise_volume_value) from MangaBakaRateLimit import apply_to_session as _apply_mangabaka_rate_limit -from CronSchedule import CronSchedule +from PerfStats import PerfStats from CoverCache import CoverCache, _IMAGE_EXTS try: @@ -136,12 +135,12 @@ class KavitaVolumeCoverUpdater: request_timeout : HTTP timeout in seconds. log_path : File that receives one line per updated chapter. Default: /volume_updater.log - schedule : Cron expression (5 fields) defining when scans run, - e.g. "0 19 * * 1,4" = 19:00 every Monday and - Thursday. Evaluated in local time — set the TZ env - var inside Docker. Default: "0 19 * * 1,4". cover_cache_dir : Directory for the persistent cover cache. None -> temporary cache, deleted at process exit. + perf_stats : Optional PerfStats instance for per-step timing. + + Scheduling lives outside this class (see CronRunner); call update_all() + on whatever cadence you like. """ def __init__(self, @@ -152,8 +151,8 @@ class KavitaVolumeCoverUpdater: request_timeout: int = 30, api_base_url: str = "https://api.mangabaka.dev/v1", log_path=None, - schedule: str = "0 19 * * 1,4", - cover_cache_dir=None): + cover_cache_dir=None, + perf_stats: "PerfStats | None" = None): self._dst = Path(kavita_path) self._matches_cache = matches_cache self._language = language @@ -161,7 +160,7 @@ class KavitaVolumeCoverUpdater: self._api_base_url = api_base_url.rstrip("/") self._log_path = (Path(log_path) if log_path else self._dst / "volume_updater.log") - self._cron = CronSchedule(schedule) + self._perf = perf_stats or PerfStats(None) session = requests.Session() session.headers.setdefault("User-Agent", "KavitaVolumeCoverUpdater/1.0") @@ -178,51 +177,6 @@ class KavitaVolumeCoverUpdater: self._cover_cache = CoverCache( cover_cache_dir, session=session, request_timeout=request_timeout) - self._stop = threading.Event() - self._thread: "threading.Thread | None" = None - - # ------------------------------------------------------------------ - # Cron API (mirrors SuwayomiFolderWatcher) - # ------------------------------------------------------------------ - def start(self) -> None: - """Starts the periodic scan thread. Non-blocking.""" - if self._thread is not None and self._thread.is_alive(): - return - self._stop.clear() - self._thread = threading.Thread( - target=self._loop, name="KavitaVolumeCoverUpdater", daemon=True) - self._thread.start() - print(f"[{_now()}] [updater] scanning {self._dst} " - f"on cron '{self._cron.expression}'", flush=True) - - def stop(self) -> None: - """Stops the scan thread (current scan finishes its series first).""" - self._stop.set() - if self._thread is not None: - self._thread.join(timeout=10) - - def wait(self) -> None: - """Blocks the calling thread until stop() is invoked.""" - self._stop.wait() - - def _loop(self) -> None: - while not self._stop.is_set(): - next_run = self._cron.next_after(datetime.now()) - wait = max(0.0, (next_run - datetime.now()).total_seconds()) - print(f"[{_now()}] [updater] next scheduled scan: " - f"{next_run.isoformat(timespec='minutes')}", flush=True) - if self._stop.wait(wait): - break - - try: - summary = self.update_all() - print(f"[{_now()}] [updater] scan done: " - f"{summary['series_updated']} series / " - f"{summary['chapters_updated']} chapters updated", - flush=True) - except Exception as exc: - print(f"[{_now()}] [updater] scan ERROR: {exc}", flush=True) - # ------------------------------------------------------------------ # Public scan API # ------------------------------------------------------------------ @@ -243,23 +197,25 @@ class KavitaVolumeCoverUpdater: self._vol_resolver.clear_cache() self._works_resolver.clear_cache() - for series_dir in sorted(self._dst.iterdir()): - if self._stop.is_set(): - break - if not series_dir.is_dir(): - continue - summary["series_scanned"] += 1 - try: - updated = self.update_series(series_dir) - except Exception as exc: - print(f"[updater] {series_dir.name}: ERROR {exc}", flush=True) - continue - if updated: - summary["series_updated"] += 1 - summary["chapters_updated"] += updated + run = self._perf.begin_run() + try: + for series_dir in sorted(self._dst.iterdir()): + if not series_dir.is_dir(): + continue + summary["series_scanned"] += 1 + try: + updated = self.update_series(series_dir, run) + except Exception as exc: + print(f"[updater] {series_dir.name}: ERROR {exc}", flush=True) + continue + if updated: + summary["series_updated"] += 1 + summary["chapters_updated"] += updated + finally: + run.finish() return summary - def update_series(self, series_dir: Path) -> int: + def update_series(self, series_dir: Path, run=None) -> int: """ Updates one series folder. Returns the number of updated chapters. @@ -300,20 +256,24 @@ class KavitaVolumeCoverUpdater: md = builder.fetch_metadata() series_id = str(md.get("id") or "") + series_rec = (run or self._perf.begin_run()).begin_item(series_dir.name) + # Resolve volumes for all null-volume chapters first (API only). updates: dict[str, dict] = {} # num -> {"volume": str, "cover": tuple|None} - for num in sorted(missing, key=_chapter_sort_value): - builder.chapter = num - try: - volume = builder._determine_volume() - except Exception: - volume = None - if not volume: - continue - updates[num] = {"volume": volume, - "cover": self._fetch_cover(series_id, volume)} + with series_rec.measure("resolve_volumes"): + for num in sorted(missing, key=_chapter_sort_value): + builder.chapter = num + try: + volume = builder._determine_volume() + except Exception: + volume = None + if not volume: + continue + updates[num] = {"volume": volume, + "cover": self._fetch_cover(series_id, volume)} if not updates: + series_rec.finish(ok=True) return 0 first = min(chapters, key=_chapter_sort_value) @@ -328,10 +288,13 @@ class KavitaVolumeCoverUpdater: continue # The first chapter gets a full metadata rebuild (Kavita reads # series metadata from it); other chapters only a volume edit. - ok, cover_swapped = self._apply_update( - cbz, builder, num, - volume=up["volume"], cover=up["cover"], - full_rebuild=(num == first)) + chap_rec = series_rec.begin_item(num) + with chap_rec.measure("archive_rewrite"): + ok, cover_swapped = self._apply_update( + cbz, builder, num, + volume=up["volume"], cover=up["cover"], + full_rebuild=(num == first)) + chap_rec.finish(ok=ok) if not ok: continue entry["volume"] = _normalise_volume_value(up["volume"]) @@ -346,15 +309,19 @@ class KavitaVolumeCoverUpdater: first_entry = chapters.get(first) or {} cbz = series_dir / (first_entry.get("archiveName") or "") if first_entry.get("archiveName") and cbz.is_file(): - ok, _ = self._apply_update( - cbz, builder, first, - volume=None, cover=None, full_rebuild=True) + chap_rec = series_rec.begin_item(f"{first} (refresh)") + with chap_rec.measure("archive_rewrite"): + ok, _ = self._apply_update( + cbz, builder, first, + volume=None, cover=None, full_rebuild=True) + chap_rec.finish(ok=ok) if ok: self._log(f"{series_dir.name} | chapter {first} | " f"first-chapter metadata refreshed | {cbz.name}") if updated: _save_chapter_index(series_dir, index) + series_rec.finish(ok=True) return updated # ------------------------------------------------------------------ @@ -545,10 +512,7 @@ if __name__ == "__main__": matches_cache=MatchesCache(MATCHES_PATH), ) - # One-shot scan (no cron thread): + # One-shot scan. Scheduling is handled externally via CronRunner + # (see main_manga.py). summary = updater.update_all() print(f"\n[updater] {summary}") - - # Or run on the cron schedule (default: 19:00 every Mon + Thu): - # updater.start() - # updater.wait() diff --git a/src/manga/MatchesWebApp.py b/src/manga/MatchesWebApp.py index a90639c..748f8c2 100644 --- a/src/manga/MatchesWebApp.py +++ b/src/manga/MatchesWebApp.py @@ -30,6 +30,10 @@ from flask import Flask, jsonify, request, Response from MatchesCache import MatchesCache from ComicInfoBuilder import _pick_thumbnail_url +from PerfWebPage import render_perf_page + +# Cross-link tabs shown on every perf page in the manga container. +_PERF_TABS = [("move", "move"), ("volume/cover", "volume"), ("persons", "person")] _INDEX_HTML = """ @@ -71,7 +75,8 @@ _INDEX_HTML = """ - Performance ▸ + + Performance ▸ @@ -342,6 +347,23 @@ document.getElementById("move").addEventListener("click", async () => { btn.disabled = false; } }); +document.getElementById("syncPersons").addEventListener("click", async () => { + if (!confirm("Sync all Kavita persons against MAL/AniList? May take a while.")) return; + const btn = document.getElementById("syncPersons"); + btn.disabled = true; + setStatus("Syncing persons… (running on the server)"); + try { + const r = await fetch("/api/persons/sync", { method: "POST" }); + if (!r.ok) throw new Error(await r.text()); + const d = await r.json(); + setStatus("Persons: " + d.updated + " updated, " + d.skipped + " skipped, " + + d.not_found + " not found, " + d.conflicts + " conflicts"); + } catch (err) { + setStatus("Person sync failed: " + err.message); + } finally { + btn.disabled = false; + } +}); for (const th of document.querySelectorAll("th.sortable")) { th.addEventListener("click", () => { const col = th.dataset.col; @@ -358,133 +380,6 @@ load(); """ -_PERF_HTML = """ - - - - Move performance - - - -

Move performance ◂ back to matches

-
- - - -
- -
- - - - -""" class MatchesWebApp: @@ -497,16 +392,22 @@ class MatchesWebApp: def __init__(self, cache: MatchesCache, *, mover=None, + person_updater=None, + person_trigger: str = "web", perf_stats=None, host: str = "0.0.0.0", port: int = 8080): self._cache = cache self._mover = mover - self._perf = perf_stats + self._person_updater = person_updater + self._person_trigger = person_trigger + # perf_stats: dict {name -> PerfStats}, e.g. {"move", "volume", "person"}. + self._perf = perf_stats or {} self._host = host self._port = port self._build_lock = threading.Lock() self._move_lock = threading.Lock() + self._person_lock = threading.Lock() self._app = Flask(__name__) self._thread: "threading.Thread | None" = None self._register_routes() @@ -631,12 +532,30 @@ class MatchesWebApp: self._move_lock.release() return jsonify({"results": results}) - @app.get("/perf") - def perf_page() -> Response: - return Response(_PERF_HTML, mimetype="text/html; charset=utf-8") + @app.post("/api/persons/sync") + def api_persons_sync(): + if self._person_updater is None: + return Response("no person updater configured", status=503) + if not self._person_lock.acquire(blocking=False): + return Response("person sync already running", status=409) + try: + report = self._person_updater.update_all_persons( + trigger=self._person_trigger, + perf=self._perf.get("person")) + except Exception as exc: + return Response(f"person sync failed: {exc}", status=500) + finally: + self._person_lock.release() + return jsonify(report) - @app.get("/api/perf") - def api_perf(): - if self._perf is None: - return jsonify({"runs": []}) - return jsonify(self._perf.all()) + # Perf pages: /perf (move) + /perf/ for the updaters. + @app.get("/perf") + @app.get("/perf/") + def perf_page(name: str = "move") -> Response: + return Response(render_perf_page(name, _PERF_TABS), + mimetype="text/html; charset=utf-8") + + @app.get("/api/perf/") + def api_perf(name: str): + stats = self._perf.get(name) + return jsonify(stats.all() if stats is not None else {"runs": []}) diff --git a/src/manga/PerfStats.py b/src/manga/PerfStats.py deleted file mode 100644 index 8b70d33..0000000 --- a/src/manga/PerfStats.py +++ /dev/null @@ -1,267 +0,0 @@ -""" -perf_stats.py -============= - -Lightweight performance profiler for the Suwayomi -> Kavita move pipeline. - -It records, per move run, how long each step of every chapter takes plus -per-series and per-run totals, so a slowdown can be traced to the step -responsible (cover download, image-dimension probing, CBZ packing, …). - -Data model (one entry per run, newest first):: - - { - "runs": [ - { - "startedAt": 1700000000, # unix seconds - "finishedAt": 1700000123, - "totalSeconds": 123.4, # wall clock of the whole run - "seriesCount": 2, - "chapterCount": 31, - "stepTotals": { # summed over ALL chapters - "cover": 41.2, "image_dimensions": 55.8, "pack_cbz": 18.1, ... - }, - "seriesStepTotals": { # summed over ALL series - "fetch_metadata": 2.4, "person_sync": 9.7 - }, - "series": [ - { - "title": "Call of the Night", - "totalSeconds": 60.2, - "chapterCount": 20, - "steps": {"fetch_metadata": 1.2, "person_sync": 3.4}, - "chapters": [ - {"chapter": "1", "ok": true, "totalSeconds": 11.5, - "steps": {"cover": 1.8, "image_dimensions": 4.2, ...}} - ] - } - ] - } - ] - } - -Usage from the mover:: - - perf = PerfStats(path) # path=None -> disabled (no-op) - run = perf.begin_run() - series = run.begin_series("Title") - with series.measure("fetch_metadata"): - ... - chap = series.begin_chapter("1") - with chap.measure("pack_cbz"): - ... - chap.finish(ok=True) - series.finish() - run.finish() # persists the run to disk - -When ``path`` is None every recorder is a no-op and nothing is written, -so the profiler can be left permanently wired in with negligible cost. -""" - -from __future__ import annotations - -import json -import threading -import time -import uuid -from contextlib import contextmanager -from pathlib import Path - - -# Keep the JSON small: only the most recent runs are retained on disk. -_MAX_RUNS = 30 - - -class _StepTimer: - """ - Base recorder: accumulates ``{step_name: seconds}`` and tracks its own - wall-clock lifetime. ``enabled=False`` turns every method into a no-op. - """ - - def __init__(self, enabled: bool = True): - self.steps: dict[str, float] = {} - self._enabled = enabled - self._t0 = time.monotonic() - - @contextmanager - def measure(self, name: str): - """Context manager timing a named step (accumulates on repeat use).""" - if not self._enabled: - yield - return - start = time.monotonic() - try: - yield - finally: - self.steps[name] = round( - self.steps.get(name, 0.0) + (time.monotonic() - start), 4) - - def elapsed(self) -> float: - return round(time.monotonic() - self._t0, 4) - - -class ChapterRecorder(_StepTimer): - """Per-chapter step timer.""" - - def __init__(self, series: "SeriesRecorder", chapter: str, - enabled: bool = True): - super().__init__(enabled) - self._series = series - self._chapter = chapter - self._ok = True - - def finish(self, *, ok: bool = True) -> None: - self._ok = ok - if not self._enabled: - return - self._series._chapters.append({ - "chapter": self._chapter, - "ok": ok, - "totalSeconds": self.elapsed(), - "steps": self.steps, - }) - - -class SeriesRecorder(_StepTimer): - """Per-series step timer; also collects its chapters.""" - - def __init__(self, run: "RunRecorder", title: str, enabled: bool = True): - super().__init__(enabled) - self._run = run - self._title = title - self._chapters: list[dict] = [] - - def begin_chapter(self, chapter: str) -> ChapterRecorder: - return ChapterRecorder(self, chapter, enabled=self._enabled) - - def finish(self) -> None: - if not self._enabled: - return - self._run._series.append({ - "title": self._title, - "totalSeconds": self.elapsed(), - "chapterCount": len(self._chapters), - "steps": self.steps, - "chapters": self._chapters, - }) - # Persist the run's progress after every series so a long run is - # observable live and survives a crash mid-run. - self._run.flush() - - -class RunRecorder: - """Top-level recorder for one full move run.""" - - def __init__(self, stats: "PerfStats", enabled: bool = True): - self._stats = stats - self._enabled = enabled - self._series: list[dict] = [] - self._started = time.time() - self._t0 = time.monotonic() - # Stable identity so incremental flushes update the same run entry - # instead of inserting a duplicate on every series. - self._run_id = uuid.uuid4().hex - - def begin_series(self, title: str) -> SeriesRecorder: - return SeriesRecorder(self, title, enabled=self._enabled) - - def _snapshot(self) -> dict: - """Aggregates the run's current state into a serialisable dict.""" - step_totals: dict[str, float] = {} - series_step_totals: dict[str, float] = {} - chapter_count = 0 - for s in self._series: - for step, secs in s["steps"].items(): - series_step_totals[step] = round( - series_step_totals.get(step, 0.0) + secs, 4) - for ch in s["chapters"]: - chapter_count += 1 - for step, secs in ch["steps"].items(): - step_totals[step] = round( - step_totals.get(step, 0.0) + secs, 4) - - return { - "runId": self._run_id, - "startedAt": round(self._started), - "finishedAt": round(time.time()), - "totalSeconds": round(time.monotonic() - self._t0, 4), - "seriesCount": len(self._series), - "chapterCount": chapter_count, - "stepTotals": step_totals, - "seriesStepTotals": series_step_totals, - "series": self._series, - } - - def flush(self) -> dict | None: - """Writes the run's current state to disk (upsert by runId).""" - if not self._enabled: - return None - run = self._snapshot() - self._stats._upsert_run(run) - return run - - def finish(self) -> dict | None: - """Persists the final run state. Returns the run dict.""" - return self.flush() - - -class PerfStats: - """ - Profiler facade + JSON persistence. - - Parameters - ---------- - path : Destination JSON file. None disables the profiler entirely - (every recorder becomes a no-op and nothing is written). - """ - - def __init__(self, path=None): - self._path = Path(path) if path else None - self._lock = threading.Lock() - - @property - def enabled(self) -> bool: - return self._path is not None - - def begin_run(self) -> RunRecorder: - return RunRecorder(self, enabled=self.enabled) - - # ------------------------------------------------------------------ - # Read / write - # ------------------------------------------------------------------ - def all(self) -> dict: - """Returns the persisted runs ({"runs": [...]}); newest first.""" - if not self._path or not self._path.is_file(): - return {"runs": []} - try: - with self._path.open("r", encoding="utf-8") as f: - data = json.load(f) - except (OSError, json.JSONDecodeError): - return {"runs": []} - if not isinstance(data, dict) or not isinstance(data.get("runs"), list): - return {"runs": []} - return data - - def _upsert_run(self, run: dict) -> None: - """ - Inserts a new run (newest first) or replaces the existing entry with - the same runId — so incremental flushes during a run update one entry - rather than appending a duplicate after every series. - """ - if not self._path: - return - with self._lock: - runs = self.all()["runs"] - run_id = run.get("runId") - for i, existing in enumerate(runs): - if existing.get("runId") == run_id: - runs[i] = run - break - else: - runs.insert(0, run) # newest first - del runs[_MAX_RUNS:] # cap history - self._path.parent.mkdir(parents=True, exist_ok=True) - tmp = self._path.with_suffix(self._path.suffix + ".tmp") - with tmp.open("w", encoding="utf-8") as f: - json.dump({"runs": runs}, f, ensure_ascii=False, indent=2) - tmp.replace(self._path) diff --git a/src/manga/SuwayomiMover.py b/src/manga/SuwayomiMover.py index 1c35f73..5cd31ad 100644 --- a/src/manga/SuwayomiMover.py +++ b/src/manga/SuwayomiMover.py @@ -64,8 +64,6 @@ from MangadexVolumeResolver import MangaDexVolumeResolver from MangaBakaWorksResolver import MangaBakaWorksResolver from MALResolver import MALResolver from AniListResolver import AniListResolver -from KavitaClient import KavitaClient -from KavitaPersonUpdater import KavitaPersonUpdater from MatchesCache import MatchesCache from MangaBakaRateLimit import apply_to_session as _apply_mangabaka_rate_limit from CoverCache import CoverCache, _IMAGE_EXTS @@ -306,9 +304,6 @@ class SuwayomiMover: Expected layout: ///<Chapter N>/ kavita_path : Root of the Kavita library. Series sub-directories are created automatically. - kavita_base_url : Kavita server URL — required only for person sync, - e.g. "http://192.168.2.2:5000". - kavita_api_key : Kavita API key — required only for person sync. language : ComicInfo LanguageISO and SeriesSort language ("en"). request_timeout : HTTP timeout in seconds for all API / image requests. delete_source : Remove the source chapter folder after successful pack. @@ -316,14 +311,16 @@ class SuwayomiMover: temporary cache, deleted at process exit. perf_stats : Optional PerfStats instance for per-step timing. None (default) disables profiling. + + Note: Kavita person sync is no longer done here — it runs as a separate, + global, id-based updater on its own cron schedule (KavitaPersonUpdater). + The mover only touches MangaBaka / MangaDex / MAL / AniList. """ def __init__(self, suwayomi_path, kavita_path, *, - kavita_base_url: "str | None" = None, - kavita_api_key: "str | None" = None, language: str = "en", request_timeout: int = 30, delete_source: bool = True, @@ -357,16 +354,6 @@ class SuwayomiMover: self._cover_cache = CoverCache( cover_cache_dir, session=session, request_timeout=request_timeout) - self._person_updater: "KavitaPersonUpdater | None" = None - if kavita_base_url and kavita_api_key: - kavita_client = KavitaClient( - kavita_base_url, kavita_api_key, - request_timeout=request_timeout) - self._person_updater = KavitaPersonUpdater( - kavita_client, - mal_resolver=self._mal, - al_resolver=self._al) - # ------------------------------------------------------------------ # Public API # ------------------------------------------------------------------ @@ -502,7 +489,7 @@ class SuwayomiMover: # ------------------------------------------------------------------ def _process_series_dir(self, manga_dir: Path, run=None) -> dict: manga_title = manga_dir.name - series_rec = (run or self._perf.begin_run()).begin_series(manga_title) + series_rec = (run or self._perf.begin_run()).begin_item(manga_title) chapter_dirs = sorted( (d for d in manga_dir.iterdir() if d.is_dir()), @@ -597,27 +584,11 @@ class SuwayomiMover: } _save_chapter_index(dest_series, chapter_index) - # Sync Kavita persons once per series. - # Both MAL and AniList IDs come from MangaBaka's source map; - # AniList is used as fallback when MAL returns no characters/staff. - person_result: "dict | None" = None - if self._person_updater: - mal_id = ((ComicInfoBuilder._mal_id_from_source(md) if md else None) - or self._mal.find_mal_id(builder_title)) - al_id = ComicInfoBuilder._al_id_from_source(md) if md else None - if mal_id or al_id: - try: - with series_rec.measure("person_sync"): - person_result = self._person_updater.update_for_manga( - mal_id, al_manga_id=al_id) - print(f" Persons: chars={person_result['characters'].get('updated')} " - f"staff={person_result['staff'].get('updated')}") - except Exception as exc: - person_result = {"error": str(exc)} - print(f" Persons: ERROR {exc}") - + # Person sync no longer runs here — it has its own global, + # id-based updater on a separate cron schedule (see + # KavitaPersonUpdater.update_all_persons). series_rec.finish() - return {"chapters": chapter_results, "persons": person_result} + return {"chapters": chapter_results} # ------------------------------------------------------------------ # Internal: chapter @@ -637,8 +608,8 @@ class SuwayomiMover: <Pages> element correctly points to the front cover). """ cbz_path = dest_series / f"{chapter_dir.name}.cbz" - chap_rec = (series_rec or self._perf.begin_run().begin_series("") - ).begin_chapter(chapter_num) + chap_rec = (series_rec or self._perf.begin_run().begin_item("") + ).begin_item(chapter_num) # add_pages_from_folder records its own sub-steps on this recorder. builder.perf = chap_rec ok = False @@ -675,14 +646,9 @@ class SuwayomiMover: # Usage example # -------------------------------------------------------------------------- if __name__ == "__main__": - import os - # Local (no-Docker) smoke test. Adjust paths to your environment. - # Set the KAVITA_API_KEY env var — never commit API keys to the repo. SUWAYOMI_PATH = r"M:\config\downloads\mangas" KAVITA_PATH = r"\\192.168.2.2\root\ServerData\Kavita\test" - KAVITA_URL = "http://192.168.2.2:5000" - KAVITA_KEY = os.environ.get("KAVITA_API_KEY", "") # matches.json lives next to this script during local testing. MATCHES_PATH = Path(__file__).resolve().parent.parent / "matches.json" @@ -691,8 +657,6 @@ if __name__ == "__main__": mover = SuwayomiMover( SUWAYOMI_PATH, KAVITA_PATH, - kavita_base_url=KAVITA_URL, - kavita_api_key=KAVITA_KEY, delete_source=False, matches_cache=matches_cache, )