From 615bd1b468ce27f2b02528a665a632258eb363d5 Mon Sep 17 00:00:00 2001 From: JohannesBOT Date: Tue, 26 May 2026 20:20:24 +0200 Subject: [PATCH] manga matching and WebApp --- .gitea/workflows/build.yml | 80 ++++----- Dockerfile | 4 +- docker-compose.prod.yml | 5 + main.py | 35 ++-- requirements.txt | 1 + src/ComicInfoBuilder.py | 38 ++++- src/MatchesCache.py | 139 +++++++++++++++ src/MatchesWebApp.py | 341 +++++++++++++++++++++++++++++++++++++ src/SuwayomiMover.py | 78 ++++++++- 9 files changed, 665 insertions(+), 56 deletions(-) create mode 100644 src/MatchesCache.py create mode 100644 src/MatchesWebApp.py diff --git a/.gitea/workflows/build.yml b/.gitea/workflows/build.yml index c455253..845e445 100644 --- a/.gitea/workflows/build.yml +++ b/.gitea/workflows/build.yml @@ -23,43 +23,43 @@ jobs: - name: Push Image run: docker push gitea.johannesbot.de/johannesbot/manga-mover-and-metadata-collector:latest -# deploy: -# needs: build -# runs-on: ubuntu-latest -# steps: -# - name: Checkout -# uses: actions/checkout@v4 -# -# - name: Create deployment directory -# uses: appleboy/ssh-action@v1.0.3 -# with: -# host: ${{ secrets.SSH_HOST }} -# username: ${{ secrets.SSH_USER }} -# password: ${{ secrets.SSH_PASSWORD }} -# port: ${{ secrets.SSH_PORT || 22 }} -# script: mkdir -p /home/${{ secrets.SSH_USER }}/manga-mover-and-metadata-collector -# -# - name: Copy docker-compose via SCP -# uses: appleboy/scp-action@v0.1.7 -# with: -# host: ${{ secrets.SSH_HOST }} -# username: ${{ secrets.SSH_USER }} -# password: ${{ secrets.SSH_PASSWORD }} -# port: ${{ secrets.SSH_PORT || 22 }} -# source: "docker-compose.prod.yml" -# target: "/home/${{ secrets.SSH_USER }}/manga-mover-and-metadata-collector" -# -# - name: Deploy via SSH -# uses: appleboy/ssh-action@v1.0.3 -# with: -# host: ${{ secrets.SSH_HOST }} -# username: ${{ secrets.SSH_USER }} -# password: ${{ secrets.SSH_PASSWORD }} -# port: ${{ secrets.SSH_PORT || 22 }} -# script: | -# cd /home/${{ secrets.SSH_USER 
}}/manga-mover-and-metadata-collector -# mv docker-compose.prod.yml docker-compose.yml -# echo "${{ secrets.REGISTRY_PASSWORD }}" | sudo docker login https://gitea.johannesbot.de -u ${{ secrets.REGISTRY_USER }} --password-stdin -# sudo docker compose pull -# sudo docker compose up -d --remove-orphans -# sudo docker image prune -f + deploy: + needs: build + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Create deployment directory + uses: appleboy/ssh-action@v1.0.3 + with: + host: ${{ secrets.SSH_HOST }} + username: ${{ secrets.SSH_USER }} + password: ${{ secrets.SSH_PASSWORD }} + port: ${{ secrets.SSH_PORT || 22 }} + script: mkdir -p /home/${{ secrets.SSH_USER }}/manga-mover-and-metadata-collector + + - name: Copy docker-compose via SCP + uses: appleboy/scp-action@v0.1.7 + with: + host: ${{ secrets.SSH_HOST }} + username: ${{ secrets.SSH_USER }} + password: ${{ secrets.SSH_PASSWORD }} + port: ${{ secrets.SSH_PORT || 22 }} + source: "docker-compose.prod.yml" + target: "/home/${{ secrets.SSH_USER }}/manga-mover-and-metadata-collector" + + - name: Deploy via SSH + uses: appleboy/ssh-action@v1.0.3 + with: + host: ${{ secrets.SSH_HOST }} + username: ${{ secrets.SSH_USER }} + password: ${{ secrets.SSH_PASSWORD }} + port: ${{ secrets.SSH_PORT || 22 }} + script: | + cd /home/${{ secrets.SSH_USER }}/manga-mover-and-metadata-collector + mv docker-compose.prod.yml docker-compose.yml + echo "${{ secrets.REGISTRY_PASSWORD }}" | sudo docker login https://gitea.johannesbot.de -u ${{ secrets.REGISTRY_USER }} --password-stdin + sudo docker compose pull + sudo docker compose up -d --remove-orphans + sudo docker image prune -f diff --git a/Dockerfile b/Dockerfile index ef34f96..496600a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,6 +18,8 @@ ENV PYTHONUNBUFFERED=1 \ PYTHONDONTWRITEBYTECODE=1 # Mount points used by main.py defaults -VOLUME ["/mnt/suwayomi", "/mnt/kavita"] +VOLUME ["/mnt/suwayomi", "/mnt/kavita", "/config"] + +EXPOSE 8080 
CMD ["python", "/app/main.py"]
diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index 8362941..df040e6 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -9,6 +9,11 @@ services:
       LANGUAGE: "${LANGUAGE:-en}"
       SETTLE_SECONDS: "${SETTLE_SECONDS:-600}"
       DELETE_SOURCE: "${DELETE_SOURCE:-true}"
+      MATCH_PATH: "${MATCH_PATH:-/config/matches.json}"
+      WEB_PORT: "${WEB_PORT:-8080}"
+    ports:
+      - "${WEB_PORT:-8080}:${WEB_PORT:-8080}"
     volumes:
       - "${HOST_SUWAYOMI_PATH}:/mnt/suwayomi"
       - "${HOST_KAVITA_PATH}:/mnt/kavita"
+      - "${HOST_CONFIG_PATH}:/config"
diff --git a/main.py b/main.py
index bd271cc..6cdbd2f 100644
--- a/main.py
+++ b/main.py
@@ -24,6 +24,9 @@ Environment variables
     SETTLE_SECONDS   default 600 (10-minute quiet window)
     REQUEST_TIMEOUT  default 30
     DELETE_SOURCE    default true (delete source folders after pack)
+    MATCH_PATH       default /config/matches.json
+    WEB_PORT         default 8080 (Flask web UI for matches.json)
+    WEB_HOST         default 0.0.0.0
 """
 
 from __future__ import annotations
@@ -38,6 +41,8 @@ sys.path.insert(0, str(Path(__file__).resolve().parent / "src"))
 
 from src.SuwayomiMover import SuwayomiMover # noqa: E402
 from src.SuwayomiFolderWatcher import SuwayomiFolderWatcher # noqa: E402
+from src.MatchesCache import MatchesCache # noqa: E402
+from src.MatchesWebApp import MatchesWebApp # noqa: E402
 
 
 def _env_str(name: str, default: "str | None" = None,
@@ -77,6 +82,9 @@ def main() -> int:
     settle_seconds = _env_int("SETTLE_SECONDS", 600)
     request_timeout = _env_int("REQUEST_TIMEOUT", 30)
     delete_source = _env_bool("DELETE_SOURCE", True)
+    match_path = _env_str("MATCH_PATH", "/config/matches.json")
+    web_host = _env_str("WEB_HOST", "0.0.0.0") or "0.0.0.0"
+    web_port = _env_int("WEB_PORT", 8080)
 
     print(f"[main] suwayomi = {suwayomi_path}", flush=True)
     print(f"[main] kavita = {kavita_path}", flush=True)
@@ -84,6 +92,10 @@ def main() -> int:
     print(f"[main] settle = {settle_seconds}s", flush=True)
     print(f"[main] language = {language}", flush=True)
     print(f"[main] delete
src= {delete_source}", flush=True)
+    print(f"[main] match path= {match_path}", flush=True)
+    print(f"[main] web = {web_host}:{web_port}", flush=True)
+
+    matches_cache = MatchesCache(match_path)
 
     mover = SuwayomiMover(
         suwayomi_path, kavita_path,
@@ -92,20 +104,25 @@ def main() -> int:
         language=language,
         request_timeout=request_timeout,
         delete_source=delete_source,
+        matches_cache=matches_cache,
     )
 
-    watcher = SuwayomiFolderWatcher(
-        suwayomi_path, mover, settle_seconds=settle_seconds)
+    # The folder watcher is disabled in this change; only the web UI runs.
+    # watcher = SuwayomiFolderWatcher(suwayomi_path, mover, settle_seconds=settle_seconds)
 
     def shutdown(signum, _frame):
         print(f"[main] received signal {signum}", flush=True)
-        watcher.stop()
+        sys.exit(0)
 
     signal.signal(signal.SIGTERM, shutdown)
     signal.signal(signal.SIGINT, shutdown)
 
-    watcher.start()
-    watcher.wait()  # blocks until stop() is called via a signal
+    web_app = MatchesWebApp(matches_cache, mover=mover, host=web_host, port=web_port)
+    web_thread = web_app.start()
+
+    # start() runs Flask on a daemon thread, so main() must block here;
+    # returning would end the process and take the server down with it.
+    web_thread.join()
     return 0
 
 
diff --git a/requirements.txt b/requirements.txt
index e6aa695..01b6bf7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 requests>=2.31
 Pillow>=10.0
 watchdog>=4.0
+Flask>=3.0
diff --git a/src/ComicInfoBuilder.py b/src/ComicInfoBuilder.py
index 0da1488..8fd049f 100644
--- a/src/ComicInfoBuilder.py
+++ b/src/ComicInfoBuilder.py
@@ -48,6 +48,7 @@ from MangadexVolumeResolver import MangaDexVolumeResolver
 from MangaBakaWorksResolver import MangaBakaWorksResolver
 from MALResolver import MALResolver
 from AniListResolver import AniListResolver
+from MatchesCache import MatchesCache
 
 try:
     from PIL import Image
@@ -170,7 +171,8 @@ class ComicInfoBuilder:
                  volume_resolver: "MangaDexVolumeResolver | None" = None,
                  works_resolver: "MangaBakaWorksResolver | None" = None,
                  mal_resolver: "MALResolver | None" = None,
-                 al_resolver: "AniListResolver | None" = None):
+                 al_resolver: "AniListResolver | None" = None,
+                 matches_cache: "MatchesCache | None" = None):
 
         if not manga_title or not str(manga_title).strip():
             raise ValueError("manga_title must not be empty.")
@@ -197,6 +199,7 @@ class ComicInfoBuilder:
             request_timeout=request_timeout)
         self._al_resolver = al_resolver or AniListResolver(
             request_timeout=request_timeout)
+        self._matches_cache = matches_cache
 
         self._metadata: "dict | None" = None
         self._pages: list[dict] = []
@@ -353,14 +356,43 @@ class ComicInfoBuilder:
         return series
 
     def _search_best_series(self, title: str):
-        """Searches for `title` and returns the best matching series entry."""
+        """
+        Resolves `title` to a MangaBaka series.
+
+        Lookup order:
+          1. matches.json cache (if attached) — uses the stored series ID
+             to fetch the full series, skipping the search step entirely.
+          2. Fresh MangaBaka search — top hit. The match is persisted to
+             matches.json before being returned so it survives a crash.
+        """
+        if self._matches_cache is not None:
+            cached = self._matches_cache.get(title)
+            if cached and cached.get("mangabakaId"):
+                try:
+                    return self._fetch_series_by_id(cached["mangabakaId"])
+                except Exception as exc:
+                    print(f"[ComicInfoBuilder] cached id "
+                          f"{cached['mangabakaId']} for {title!r} failed "
+                          f"({exc}); falling back to fresh search",
+                          flush=True)
+
         url = f"{self.api_base_url}/series/search"
         resp = self._session.get(
             url, params={"q": title, "page": 1, "limit": 1},
             timeout=self.request_timeout)
         resp.raise_for_status()
         data = resp.json().get("data") or []
-        return data[0] if data else None
+        series = data[0] if data else None
+
+        if series and self._matches_cache is not None:
+            self._matches_cache.add(
+                title,
+                mangabaka_id=series.get("id"),
+                mangabaka_name=series.get("title") or "",
+                image_url=_pick_cover_url(series.get("cover")),
+            )
+
+        return series
 
     def _fetch_series_by_id(self, series_id) -> dict:
         url = f"{self.api_base_url}/series/{series_id}"
diff --git a/src/MatchesCache.py b/src/MatchesCache.py
new file mode 100644
index 0000000..b225ec4
--- /dev/null
+++ b/src/MatchesCache.py
@@ -0,0 +1,151 @@
+"""
+MatchesCache.py
+===============
+
+Persistent JSON cache that maps a Suwayomi/series search title to the
+MangaBaka series it was matched against.
+
+Structure on disk::
+
+    {
+      "matches": {
+        "<search title>": {
+          "mangabakaId": "12345",
+          "mangabakaName": "One-Punch Man",
+          "imageUrl": "https://.../cover.jpg",
+          "firstMatchTime": 1700000000
+        },
+        ...
+      }
+    }
+
+The cache is consulted by ComicInfoBuilder before issuing a MangaBaka
+search request, and is written back to disk on every mutation so a crash
+does not lose matches that were resolved in the current run.
+"""
+
+from __future__ import annotations
+
+import json
+import threading
+import time
+from pathlib import Path
+
+
+class MatchesCache:
+    """Lock-guarded title -> match mapping, saved to JSON on every change."""
+
+    def __init__(self, path):
+        self._path = Path(path)
+        self._lock = threading.RLock()
+        self._data: dict = {"matches": {}}
+        self._load()
+
+    # ------------------------------------------------------------------
+    # Public lookup / mutation API
+    # ------------------------------------------------------------------
+    def get(self, title: str) -> "dict | None":
+        """Returns a copy of the entry stored for `title`, or None."""
+        with self._lock:
+            entry = self._data["matches"].get(title)
+            return dict(entry) if entry else None
+
+    def add(self, title: str, *,
+            mangabaka_id,
+            mangabaka_name: str,
+            image_url: "str | None") -> dict:
+        """Stores a new entry for `title` (replacing any existing one)."""
+        entry = {
+            "mangabakaId": str(mangabaka_id) if mangabaka_id is not None else "",
+            "mangabakaName": mangabaka_name or "",
+            "imageUrl": image_url or "",
+            "firstMatchTime": int(time.time()),
+        }
+        with self._lock:
+            self._data["matches"][title] = entry
+            self._save_unlocked()
+        return dict(entry)
+
+    def upsert(self, title: str, *,
+               mangabaka_id=None,
+               mangabaka_name=None,
+               image_url=None,
+               first_match_time=None) -> dict:
+        """Creates or updates `title`; arguments left as None are kept."""
+        with self._lock:
+            entry = self._data["matches"].get(title)
+            if entry is None:
+                entry = {
+                    "mangabakaId": "",
+                    "mangabakaName": "",
+                    "imageUrl": "",
+                    "firstMatchTime": int(time.time()),
+                }
+                self._data["matches"][title] = entry
+            if mangabaka_id is not None:
+                entry["mangabakaId"] = str(mangabaka_id)
+            if mangabaka_name is not None:
+                entry["mangabakaName"] = mangabaka_name
+            if image_url is not None:
+                entry["imageUrl"] = image_url
+            if first_match_time is not None:
+                try:
+                    entry["firstMatchTime"] = int(first_match_time)
+                except (TypeError, ValueError):
+                    pass
+            self._save_unlocked()
+            return dict(entry)
+
+    def rename(self, old_title: str, new_title: str) -> bool:
+        """Moves the entry under `old_title` to `new_title`."""
+        if not new_title or old_title == new_title:
+            return False
+        with self._lock:
+            entry = self._data["matches"].pop(old_title, None)
+            if entry is None:
+                return False
+            self._data["matches"][new_title] = entry
+            self._save_unlocked()
+            return True
+
+    def remove(self, title: str) -> bool:
+        """Deletes `title`; returns whether it existed."""
+        with self._lock:
+            existed = title in self._data["matches"]
+            if existed:
+                del self._data["matches"][title]
+                self._save_unlocked()
+            return existed
+
+    def all(self) -> dict:
+        """Returns a copy of the whole cache as {"matches": {...}}."""
+        with self._lock:
+            return {"matches": {k: dict(v)
+                                for k, v in self._data["matches"].items()}}
+
+    # ------------------------------------------------------------------
+    # Internal IO
+    # ------------------------------------------------------------------
+    def _load(self) -> None:
+        """Reads the cache file if present; keeps the empty cache on error."""
+        if not self._path.is_file():
+            return
+        try:
+            with self._path.open("r", encoding="utf-8") as f:
+                loaded = json.load(f)
+        # ValueError covers json.JSONDecodeError and UnicodeDecodeError,
+        # so a corrupt or non-UTF-8 file is logged instead of crashing.
+        except (OSError, ValueError) as exc:
+            print(f"[MatchesCache] failed to load {self._path}: {exc}",
+                  flush=True)
+            return
+        if isinstance(loaded, dict) and isinstance(loaded.get("matches"), dict):
+            self._data = loaded
+
+    def _save_unlocked(self) -> None:
+        """Writes the cache via a temp file + replace; caller holds the lock."""
+        self._path.parent.mkdir(parents=True, exist_ok=True)
+        tmp = self._path.with_suffix(self._path.suffix + ".tmp")
+        with tmp.open("w", encoding="utf-8") as f:
+            json.dump(self._data, f, ensure_ascii=False, indent=2)
+        tmp.replace(self._path)
diff --git a/src/MatchesWebApp.py b/src/MatchesWebApp.py
new file mode 100644
index 0000000..4c9dcef
--- /dev/null
+++ b/src/MatchesWebApp.py
@@ -0,0 +1,341 @@
+"""
+MatchesWebApp.py
+================
+
+Flask web UI for inspecting and editing the matches.json file produced by
+MatchesCache.
+ +Routes +------ +GET / HTML table view (one row per cached match) +GET /api/matches JSON dump of the full cache +POST /api/matches Upsert / rename an entry + body: {originalTitle?, title, mangabakaId, + mangabakaName, imageUrl, firstMatchTime?} +POST /api/matches/delete Remove an entry body: {title} +POST /api/build Trigger a full re-scan via SuwayomiMover.build_matches_only + (only available if a mover is wired in) + +The Title cell is rendered as a link to MangaBaka's search page, restricted +to the manga / manhwa / manhua types. +""" + +from __future__ import annotations + +import threading +from urllib.parse import quote_plus + +from flask import Flask, jsonify, request, Response + +from MatchesCache import MatchesCache + + +_INDEX_HTML = """ + + + + MangaBaka matches + + + +

MangaBaka matches

+
+ + + + +
+ + + + + + + + + + + + + +
TitlemangabakaIdmangabakaNamefirstMatchTimeImage
+
+
+
+
+"""
+
+
+class MatchesWebApp:
+    """
+    Flask app exposing the MatchesCache. `mover` is optional — if provided,
+    POST /api/build runs SuwayomiMover.build_matches_only() inside the
+    request handler, one build at a time.
+    """
+
+    def __init__(self, cache: MatchesCache, *,
+                 mover=None,
+                 host: str = "0.0.0.0",
+                 port: int = 8080):
+        self._cache = cache
+        self._mover = mover
+        self._host = host
+        self._port = port
+        self._build_lock = threading.Lock()
+        self._app = Flask(__name__)
+        self._register_routes()
+
+    @property
+    def app(self) -> Flask:
+        return self._app
+
+    def start(self) -> threading.Thread:
+        """Starts the Flask server on a daemon thread and returns it."""
+        thread = threading.Thread(
+            target=self._app.run,
+            kwargs={"host": self._host, "port": self._port,
+                    "debug": False, "use_reloader": False,
+                    "threaded": True},
+            name="MatchesWebApp",
+            daemon=True,
+        )
+        thread.start()
+        print(f"[MatchesWebApp] listening on {self._host}:{self._port}",
+              flush=True)
+        return thread
+
+    # ------------------------------------------------------------------
+    # Routes
+    # ------------------------------------------------------------------
+    def _register_routes(self) -> None:
+        app = self._app
+        cache = self._cache
+
+        @app.get("/")
+        def index() -> Response:
+            return Response(_INDEX_HTML, mimetype="text/html; charset=utf-8")
+
+        @app.get("/api/matches")
+        def api_list():
+            return jsonify(cache.all())
+
+        @app.post("/api/matches")
+        def api_upsert():
+            body = request.get_json(silent=True) or {}
+            title = (body.get("title") or "").strip()
+            if not title:
+                return Response("title is required", status=400)
+            original = (body.get("originalTitle") or "").strip() or title
+            if original != title:
+                cache.rename(original, title)
+            entry = cache.upsert(
+                title,
+                mangabaka_id=body.get("mangabakaId"),
+                mangabaka_name=body.get("mangabakaName"),
+                image_url=body.get("imageUrl"),
+                first_match_time=body.get("firstMatchTime"),
+            )
+            return jsonify({"title": title, "entry": entry})
+
+        @app.post("/api/matches/delete")
+        def api_delete():
+            body = request.get_json(silent=True) or {}
+            title = (body.get("title") or "").strip()
+            if not title:
+                return Response("title is required", status=400)
+            removed = cache.remove(title)
+            return jsonify({"removed": removed, "title": title})
+
+        @app.post("/api/build")
+        def api_build():
+            if self._mover is None:
+                return Response("no mover configured", status=503)
+            if not self._build_lock.acquire(blocking=False):
+                return Response("build already running", status=409)
+            try:
+                result = self._mover.build_matches_only()
+            finally:
+                self._build_lock.release()
+            return jsonify(result)
diff --git a/src/SuwayomiMover.py b/src/SuwayomiMover.py
index ba55904..609bf08 100644
--- a/src/SuwayomiMover.py
+++ b/src/SuwayomiMover.py
@@ -51,12 +51,13 @@ from pathlib import Path
 
 import requests
 
-from ComicInfoBuilder import ComicInfoBuilder
+from ComicInfoBuilder import ComicInfoBuilder, _pick_cover_url
 from MangadexVolumeResolver import MangaDexVolumeResolver
 from MangaBakaWorksResolver import MangaBakaWorksResolver
 from MALResolver import MALResolver
 from AniListResolver import AniListResolver
 from KavitaPersonUpdater import KavitaPersonUpdater
+from MatchesCache import MatchesCache
 
 
 _IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".avif"}
@@ -287,12 +288,16 @@ class SuwayomiMover:
                  kavita_api_key: "str | None" = None,
                  language: str = "en",
                  request_timeout: int = 30,
-                 delete_source: bool = True):
+                 delete_source: bool = True,
+                 matches_cache: "MatchesCache | None" = None,
+                 api_base_url: str = "https://api.mangabaka.dev/v1"):
         self._src = Path(suwayomi_path)
         self._dst = Path(kavita_path)
         self._language = language
         self._timeout = request_timeout
         self._delete_source = delete_source
+        self._matches_cache = matches_cache
+        self._api_base_url = api_base_url.rstrip("/")
 
         # Shared HTTP session and resolvers — reused across all series/chapters
         # to maximise cache hits and minimise API round-trips.
@@ -357,6 +362,79 @@ class SuwayomiMover:
         raise FileNotFoundError(
             f"No Suwayomi directory found for '{manga_title}' under {self._src}")
 
+    def build_matches_only(self) -> dict:
+        """
+        Walks every series under the Suwayomi root and resolves each one
+        to a MangaBaka match — nothing else.
+
+        For every series:
+          - Reads the first chapter's ComicInfo.xml to obtain the canonical
+            Series name (falls back to the folder name).
+          - Cleans the name (strips source labels) the same way the real
+            move pipeline does.
+          - Skips the series if the cleaned name is empty.
+          - If the title is already in the matches cache, skips it.
+          - Otherwise issues a MangaBaka search and adds the top hit to
+            the cache (which is persisted to disk immediately).
+
+        Returns the full cache contents as a Python dict.
+        """
+        if self._matches_cache is None:
+            raise RuntimeError(
+                "build_matches_only requires a MatchesCache instance")
+
+        search_url = f"{self._api_base_url}/series/search"
+
+        for source_dir in sorted(self._src.iterdir()):
+            if not source_dir.is_dir():
+                continue
+            for manga_dir in sorted(source_dir.iterdir()):
+                if not manga_dir.is_dir():
+                    continue
+
+                raw_series = manga_dir.name
+                for chapter_dir in sorted(manga_dir.iterdir(),
+                                          key=lambda p: _chapter_sort_key(p.name)):
+                    if chapter_dir.is_dir():
+                        fields = _read_suwayomi_fields(chapter_dir)
+                        if fields.get("Series"):
+                            raw_series = fields["Series"]
+                        break
+
+                builder_title = _clean_suwayomi_title(raw_series)
+                if not builder_title:
+                    # A blank key could not be edited or removed later: the
+                    # web API rejects requests without a title.
+                    print(f" [warn] empty title for {manga_dir} — skipped")
+                    continue
+
+                if self._matches_cache.get(builder_title):
+                    print(f"[matches] {builder_title} — cached")
+                    continue
+
+                print(f"[matches] {builder_title} — searching")
+                try:
+                    resp = self._session.get(
+                        search_url,
+                        params={"q": builder_title, "page": 1, "limit": 1},
+                        timeout=self._timeout)
+                    resp.raise_for_status()
+                    data = resp.json().get("data") or []
+                    if not data:
+                        print(f" [warn] no MangaBaka match for {builder_title!r}")
+                        continue
+                    series = data[0]
+                    self._matches_cache.add(
+                        builder_title,
+                        mangabaka_id=series.get("id"),
+                        mangabaka_name=series.get("title") or "",
+                        image_url=_pick_cover_url(series.get("cover")),
+                    )
+                except Exception as exc:
+                    print(f" [warn] search failed for {builder_title!r}: {exc}")
+
+        return self._matches_cache.all()
+
     # ------------------------------------------------------------------
     # Internal: series
     # ------------------------------------------------------------------
@@ -396,6 +474,7 @@ class SuwayomiMover:
         # One builder per series — metadata fetched once, reused for all chapters.
         builder = ComicInfoBuilder(
             builder_title, chapter=1,
+            api_base_url=self._api_base_url,
             language=self._language,
             request_timeout=self._timeout,
             session=self._session,
@@ -403,6 +482,7 @@ class SuwayomiMover:
             works_resolver=self._works_resolver,
             mal_resolver=self._mal,
             al_resolver=self._al,
+            matches_cache=self._matches_cache,
         )
 
         # Fetch MangaBaka metadata now to get the canonical title and MAL ID.