"""
suwayomi_mover.py
=================
Moves Suwayomi-downloaded manga chapters to a Kavita library path,
generating enriched ComicInfo.xml metadata and packing each chapter
folder into a CBZ archive.  Optionally syncs Kavita person / character
records with MyAnimeList data after each series is processed.

Suwayomi folder structure (input)
----------------------------------
<suwayomi_path>/
    <Source dir>/                   e.g. "ComicK Fanmade (EN)"
        <Manga Title>/              e.g. "Yofukashi no Uta"
            Official_Chapter 1/     chapter folder — any prefix is fine
                001.webp
                ...
                ComicInfo.xml       Suwayomi's own basic XML (read + replaced)

Kavita folder structure (output)
---------------------------------
<kavita_path>/
    <Series title>/
        Official_Chapter 1.cbz      CBZ archive: images + enriched ComicInfo.xml
        Official_Chapter 2.cbz
        ...

Cover naming convention
-----------------------
The cover image is saved as "000.<ext>" inside each chapter folder so that
it sorts before "001.webp", "002.webp", … in alphabetical order.  This
ensures the <Pages> assignment in ComicInfo.xml matches the actual file
order inside the CBZ archive.

Dependencies
------------
    requests  ->  pip install requests
    Pillow    ->  pip install pillow   (optional, for image dimensions)

    ComicInfoBuilder, MangadexVolumeResolver, MangaBakaWorksResolver,
    MALResolver, AniListResolver, KavitaPersonUpdater and MatchesCache
    must reside in the same directory.
"""

from __future__ import annotations

import re
import shutil
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path

import requests

from ComicInfoBuilder import ComicInfoBuilder, _pick_cover_url
from MangadexVolumeResolver import MangaDexVolumeResolver
from MangaBakaWorksResolver import MangaBakaWorksResolver
from MALResolver import MALResolver
from AniListResolver import AniListResolver
from KavitaPersonUpdater import KavitaPersonUpdater
from MatchesCache import MatchesCache

# File suffixes (lower-case, with dot) that count as page images.
_IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".avif"}

# "Chapter 10" / "chapter 10.5" anywhere in a folder name; group 1 is the number.
_CHAPTER_RE = re.compile(r'[Cc]hapter\s+(\d+(?:\.\d+)?)')

# Parenthetical source labels that Suwayomi appends to series names.
# These are not part of the actual title and confuse MangaBaka searches.
_SOURCE_LABEL_RE = re.compile(
    r'\s*\(\s*(?:official|unofficial|fan(?:\s*made)?|scanlation|'
    r'bato(?:to)?|mangadex|manga\s*plus|viz|yen\s*press|webtoon)\s*\)\s*$',
    re.IGNORECASE,
)

# Characters that Windows (and SMB shares) forbid in path components.
# The colon is handled separately in _sanitize_dirname.
_WIN_ILLEGAL_RE = re.compile(r'[\\/*?"<>|]')


def _natural_key(name: str) -> list:
    """
    Sort key that orders embedded numbers numerically ("2" before "10").

    The name is split on digit runs; digit runs become ints, everything
    else is lower-cased text.
    """
    # isdecimal() rather than isdigit(): isdigit() is also true for
    # characters such as "²" that int() cannot convert.
    return [int(p) if p.isdecimal() else p.lower()
            for p in re.split(r"(\d+)", name)]


def _sanitize_dirname(name: str) -> str:
    """
    Makes a string safe to use as a Windows (or SMB) directory name.

    Rules applied:
    - ":" surrounded by optional spaces -> " - "
      ("Call of the Night: Paradise Arc" -> "Call of the Night - Paradise Arc")
    - Remaining Windows-illegal chars (\\ / * ? " < > |) are stripped.
    - Leading/trailing dots and spaces are removed (Windows restriction).

    The result may be an empty string if nothing usable remains.
    """
    name = re.sub(r"\s*:\s*", " - ", name)
    name = _WIN_ILLEGAL_RE.sub("", name)
    return name.strip(". ")


# ComicInfo.xml elements that are read from Suwayomi's own XML.
_SUWAYOMI_WANTED = {"Title", "Series", "Number", "Summary", "Writer",
                    "Penciller", "Genre", "Web", "Year", "Month", "Day"}


def _read_suwayomi_fields(chapter_dir: Path) -> dict:
    """
    Reads metadata from Suwayomi's ComicInfo.xml inside a chapter folder.

    Returns a dict of whichever wanted fields are present and non-blank, e.g.:
        {"Number": "3", "Series": "Dungeon Odyssey", "Title": "Chapter 3", ...}

    Returns an empty dict if the file is missing or unparseable.
    """
    xml_path = chapter_dir / "ComicInfo.xml"
    if not xml_path.is_file():
        return {}
    try:
        root = ET.parse(xml_path).getroot()
    except ET.ParseError:
        return {}

    result = {}
    for child in root:
        # Drop an XML namespace prefix of the form "{uri}Tag".
        tag = child.tag.split("}")[-1]
        text = (child.text or "").strip()
        if tag in _SUWAYOMI_WANTED and text:
            result[tag] = text
    return result


def _clean_suwayomi_title(title: str) -> str:
    """
    Removes Suwayomi source annotations from a series title.

    Suwayomi sometimes appends the translation group / source type in
    parentheses, e.g. "Wistoria: Wand and Sword (Official)".
    These labels are not part of the canonical title and break
    MangaBaka / MAL lookups.
    """
    return _SOURCE_LABEL_RE.sub("", title).strip()


def _source_id_from_metadata(md: dict, wanted_keys: tuple) -> "int | None":
    """
    Looks up an integer ID in a MangaBaka series dict's "source" map.

    Source keys are compared after lower-casing and removing every
    character that is not a-z / 0-9, so "My_Anime_List" matches
    "myanimelist".  Entries that are not dicts, have no "id", or whose
    "id" cannot be converted to int are skipped.
    """
    sources = md.get("source") or {}
    if not isinstance(sources, dict):
        return None
    for raw_key, info in sources.items():
        if re.sub(r"[^a-z0-9]", "", raw_key.lower()) not in wanted_keys:
            continue
        if not isinstance(info, dict):
            continue
        raw_id = info.get("id")
        if raw_id is None:
            continue
        try:
            return int(raw_id)
        except (TypeError, ValueError):
            continue
    return None


def _mal_id_from_metadata(md: dict) -> "int | None":
    """Extracts the MAL ID from a MangaBaka series dict's source map."""
    return _source_id_from_metadata(md, ("myanimelist", "mal"))


def _al_id_from_metadata(md: dict) -> "int | None":
    """Extracts the AniList ID from a MangaBaka series dict's source map."""
    return _source_id_from_metadata(md, ("anilist",))


def _chapter_image_size(chapter_dir: Path) -> int:
    """Returns the total file size of all images in a chapter folder."""
    return sum(
        f.stat().st_size
        for f in chapter_dir.iterdir()
        if f.is_file() and f.suffix.lower() in _IMAGE_EXTS
    )


def _deduplicate_chapters(
    chapter_items: list[tuple[Path, dict, str]],
) -> tuple[list[tuple[Path, dict, str]], list[Path]]:
    """
    When multiple chapter folders share the exact same chapter number
    (e.g. two folders for chapter "2" — not "2" vs "2.2"), keeps only the
    one with the highest total image file size, which is used as a proxy
    for image quality.  On a tie the folder seen first is kept.

    Chapter number comes from ComicInfo.xml <Number> (or the folder name as
    a fallback); comparison is an exact string match so "2" and "2.2" are
    never considered duplicates.

    Returns
    -------
    kept     : deduplicated chapter_items list (original sort order preserved)
    rejected : Path list of lower-quality duplicate folders to be removed
    """
    best: dict[str, tuple[Path, dict, str]] = {}
    best_size: dict[str, int] = {}
    rejected: list[Path] = []

    for item in chapter_items:
        chapter_dir, _fields, chapter_num = item
        size = _chapter_image_size(chapter_dir)
        if chapter_num not in best:
            best[chapter_num] = item
            best_size[chapter_num] = size
        elif size > best_size[chapter_num]:
            prev_dir = best[chapter_num][0]
            print(f" [dup] ch.{chapter_num}: replacing {prev_dir.name!r} "
                  f"({best_size[chapter_num]:,}B) with {chapter_dir.name!r} "
                  f"({size:,}B) — higher quality")
            rejected.append(prev_dir)
            # Re-assigning an existing key keeps its original position,
            # so the sort order of the kept list is unchanged.
            best[chapter_num] = item
            best_size[chapter_num] = size
        else:
            print(f" [dup] ch.{chapter_num}: skipping {chapter_dir.name!r} "
                  f"({size:,}B), keeping {best[chapter_num][0].name!r} "
                  f"({best_size[chapter_num]:,}B)")
            rejected.append(chapter_dir)

    return list(best.values()), rejected


def _extract_chapter_num(folder_name: str) -> "str | None":
    """
    Fallback: extracts chapter number from the folder name.

    Examples: "Chapter 10" -> "10", "Official_Chapter 10.5" -> "10.5"
    Returns None when the name contains no "Chapter <number>" part.
    """
    m = _CHAPTER_RE.search(folder_name)
    return m.group(1) if m else None


def _chapter_sort_key(folder_name: str) -> tuple:
    """
    Numeric sort key for chapter folder names.

    Folders without a recognisable chapter number sort last; the folder
    name breaks ties.
    """
    num = _extract_chapter_num(folder_name)
    if num is None:
        return (float("inf"), folder_name)
    return (float(num), folder_name)


def _pack_to_cbz(folder: Path, dest: Path) -> None:
    """
    Packs all files in `folder` into a CBZ archive at `dest`.

    Images are stored in natural-sort order (so "000.jpg" < "001.webp").
    Non-image files (ComicInfo.xml) are appended last so image indices in
    the archive match the entries written by ComicInfoBuilder.
    Sub-directories are ignored.

    Files are stored without compression (ZIP_STORED) since the source
    images are already compressed (webp / jpg / png / …).
    """
    images: list[Path] = []
    extras: list[Path] = []
    # Single directory listing, split into images and everything else.
    for f in folder.iterdir():
        if not f.is_file():
            continue
        if f.suffix.lower() in _IMAGE_EXTS:
            images.append(f)
        else:
            extras.append(f)
    images.sort(key=lambda p: _natural_key(p.name))

    with zipfile.ZipFile(dest, "w", zipfile.ZIP_STORED) as zf:
        for f in images:
            zf.write(f, f.name)
        for f in extras:
            zf.write(f, f.name)


class SuwayomiMover:
    """
    Scans a Suwayomi download directory, generates enriched ComicInfo.xml
    for each chapter, packs each chapter folder into a CBZ archive, and
    moves the result to a Kavita library path.

    Parameters
    ----------
    suwayomi_path    : Root of Suwayomi downloads.  Expected layout:
                       <suwayomi_path>/<Source dir>/<Manga Title>/<Chapter N>/
    kavita_path      : Root of the Kavita library.  Series sub-directories
                       are created automatically.
    kavita_base_url  : Kavita server URL — required only for person sync,
                       e.g. "http://192.168.2.2:5000".
    kavita_api_key   : Kavita API key — required only for person sync.
    language         : ComicInfo LanguageISO and SeriesSort language ("en").
    request_timeout  : HTTP timeout in seconds for all API / image requests.
    delete_source    : Remove the source chapter folder after successful pack
                       (also removes rejected duplicate chapter folders).
    matches_cache    : Optional MatchesCache; passed to ComicInfoBuilder and
                       required by build_matches_only().
    api_base_url     : Base URL of the MangaBaka API (trailing "/" is
                       stripped).

    Person sync is enabled only when both kavita_base_url and
    kavita_api_key are given.
    """

    def __init__(self, suwayomi_path, kavita_path, *,
                 kavita_base_url: "str | None" = None,
                 kavita_api_key: "str | None" = None,
                 language: str = "en",
                 request_timeout: int = 30,
                 delete_source: bool = True,
                 matches_cache: "MatchesCache | None" = None,
                 api_base_url: str = "https://api.mangabaka.dev/v1"):
        self._src = Path(suwayomi_path)
        self._dst = Path(kavita_path)
        self._language = language
        self._timeout = request_timeout
        self._delete_source = delete_source
        self._matches_cache = matches_cache
        self._api_base_url = api_base_url.rstrip("/")

        # Shared HTTP session and resolvers — reused across all series/chapters
        # to maximise cache hits and minimise API round-trips.
        session = requests.Session()
        session.headers.setdefault("User-Agent", "SuwayomiMover/1.0")
        self._session = session

        self._mal = MALResolver(request_timeout=request_timeout)
        self._al = AniListResolver(request_timeout=request_timeout)
        self._vol_resolver = MangaDexVolumeResolver(
            request_timeout=request_timeout, session=session)
        self._works_resolver = MangaBakaWorksResolver(
            request_timeout=request_timeout, session=session)

        self._person_updater: "KavitaPersonUpdater | None" = None
        if kavita_base_url and kavita_api_key:
            self._person_updater = KavitaPersonUpdater(
                kavita_base_url, kavita_api_key,
                mal_resolver=self._mal,
                al_resolver=self._al,
                request_timeout=request_timeout)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def process_all(self) -> dict:
        """
        Processes every manga series found under the Suwayomi root.

        Walks two directory levels deep:
            <suwayomi_path>/<Source dir>/<Manga Title>/

        Returns a dict keyed by manga title, each value being the result
        dict from _process_series_dir.
        """
        results: dict = {}
        for source_dir in sorted(self._src.iterdir()):
            if not source_dir.is_dir():
                continue
            for manga_dir in sorted(source_dir.iterdir()):
                if not manga_dir.is_dir():
                    continue
                title = manga_dir.name
                print(f"[SuwayomiMover] {title}")
                # NOTE: keyed by title only — if the same title exists under
                # two source dirs, the later result replaces the earlier one.
                results[title] = self._process_series_dir(manga_dir)
        return results

    def process_series(self, manga_title: str) -> dict:
        """
        Processes all chapters for a single series, located by title.

        Searches every source sub-directory under the Suwayomi root (in
        sorted order) for a directory whose name matches `manga_title`
        exactly; the first match is processed.

        Raises FileNotFoundError if no matching directory is found.
        """
        for source_dir in sorted(self._src.iterdir()):
            if not source_dir.is_dir():
                continue
            candidate = source_dir / manga_title
            if candidate.is_dir():
                return self._process_series_dir(candidate)
        raise FileNotFoundError(
            f"No Suwayomi directory found for '{manga_title}' under {self._src}")

    def build_matches_only(self) -> dict:
        """
        Walks every series under the Suwayomi root and resolves each one
        to a MangaBaka match — nothing else.

        For every series:
        - Reads the first chapter's ComicInfo.xml to obtain the canonical
          Series name (falls back to the folder name).
        - Cleans the name (strips source labels) the same way the real
          move pipeline does.
        - If the title is already in the matches cache, skips it.
        - Otherwise issues a MangaBaka search and adds the top hit to the
          cache.  Search failures are reported and skipped.

        Returns the full cache contents (MatchesCache.all()).

        Raises RuntimeError if no MatchesCache was supplied.
        """
        if self._matches_cache is None:
            raise RuntimeError(
                "build_matches_only requires a MatchesCache instance")

        search_url = f"{self._api_base_url}/series/search"

        for source_dir in sorted(self._src.iterdir()):
            if not source_dir.is_dir():
                continue
            for manga_dir in sorted(source_dir.iterdir()):
                if not manga_dir.is_dir():
                    continue

                raw_series = manga_dir.name
                for chapter_dir in sorted(
                        manga_dir.iterdir(),
                        key=lambda p: _chapter_sort_key(p.name)):
                    if chapter_dir.is_dir():
                        fields = _read_suwayomi_fields(chapter_dir)
                        if fields.get("Series"):
                            raw_series = fields["Series"]
                        # Only the first chapter folder is consulted.
                        break

                builder_title = _clean_suwayomi_title(raw_series)
                if self._matches_cache.get(builder_title):
                    print(f"[matches] {builder_title} — cached")
                    continue

                print(f"[matches] {builder_title} — searching")
                try:
                    resp = self._session.get(
                        search_url,
                        params={"q": builder_title, "page": 1, "limit": 1},
                        timeout=self._timeout)
                    resp.raise_for_status()
                    data = resp.json().get("data") or []
                    if not data:
                        print(f" [warn] no MangaBaka match for {builder_title!r}")
                        continue
                    series = data[0]
                    self._matches_cache.add(
                        builder_title,
                        mangabaka_id=series.get("id"),
                        mangabaka_name=series.get("title") or "",
                        image_url=_pick_cover_url(series.get("cover")),
                    )
                except Exception as exc:
                    # Best effort: one failed search must not stop the walk.
                    print(f" [warn] search failed for {builder_title!r}: {exc}")

        return self._matches_cache.all()

    # ------------------------------------------------------------------
    # Internal: series
    # ------------------------------------------------------------------

    def _resolve_mal_id(self, md: "dict | None", title: str) -> "int | None":
        """
        Returns the MAL ID for a series.

        Prefers the ID from the MangaBaka metadata's source map; when the
        metadata is missing or carries no MAL entry, falls back to the MAL
        resolver's title lookup.
        """
        mal_id = _mal_id_from_metadata(md) if md else None
        return mal_id or self._mal.find_mal_id(title)

    def _process_series_dir(self, manga_dir: Path) -> dict:
        """
        Processes every chapter folder of one series directory.

        Returns {"chapters": [<per-chapter result dict>, ...],
                 "persons": <person sync result, error dict, or None>}.
        """
        manga_title = manga_dir.name
        chapter_dirs = sorted(
            (d for d in manga_dir.iterdir() if d.is_dir()),
            key=lambda p: _chapter_sort_key(p.name),
        )

        # Read all chapter XMLs upfront to resolve chapter numbers and series name.
        chapter_items: list[tuple[Path, dict, str]] = []
        for chapter_dir in chapter_dirs:
            fields = _read_suwayomi_fields(chapter_dir)
            chapter_num = (fields.get("Number")
                           or _extract_chapter_num(chapter_dir.name))
            if chapter_num is None:
                print(f" [skip] {chapter_dir.name} — no chapter number")
                continue
            chapter_items.append((chapter_dir, fields, chapter_num))

        chapter_items, rejected_dirs = _deduplicate_chapters(chapter_items)
        if self._delete_source:
            for d in rejected_dirs:
                shutil.rmtree(d, ignore_errors=True)

        # <Series> from the first chapter's XML → strip source labels → clean title
        # for the MangaBaka search.  Folder name is the last resort.
        raw_series = manga_title
        if chapter_items:
            xml_series = chapter_items[0][1].get("Series")
            if xml_series:
                raw_series = xml_series
        builder_title = _clean_suwayomi_title(raw_series)

        # One builder per series — metadata fetched once, reused for all chapters.
        builder = ComicInfoBuilder(
            builder_title,
            chapter=1,
            api_base_url=self._api_base_url,
            language=self._language,
            request_timeout=self._timeout,
            session=self._session,
            volume_resolver=self._vol_resolver,
            works_resolver=self._works_resolver,
            mal_resolver=self._mal,
            al_resolver=self._al,
            matches_cache=self._matches_cache,
        )

        # Fetch MangaBaka metadata now to get the canonical title and MAL ID.
        md: "dict | None" = None
        mangabaka_title = manga_title
        try:
            md = builder.fetch_metadata()
            mangabaka_title = md.get("title") or manga_title
        except Exception as exc:
            print(f" [warn] metadata fetch failed: {exc}")

        # Destination folder uses the MangaBaka canonical title, sanitized for
        # Windows / SMB paths (no colons, illegal chars, leading/trailing dots).
        # If sanitizing leaves nothing (title made only of illegal characters),
        # fall back to the source folder name so chapters never land directly
        # in the library root.
        dest_name = (_sanitize_dirname(mangabaka_title)
                     or _sanitize_dirname(manga_title)
                     or manga_title)
        dest_series = self._dst / dest_name
        dest_series.mkdir(parents=True, exist_ok=True)

        chapter_results: list[dict] = []
        for chapter_dir, _fields, chapter_num in chapter_items:
            result = self._process_chapter(
                builder, chapter_num, chapter_dir, dest_series)
            chapter_results.append(result)
            status = "ok" if result["ok"] else f"ERROR: {result.get('error')}"
            print(f" Chapter {chapter_num}: {status}")

        # Sync Kavita persons once per series.
        # Both MAL and AniList IDs come from MangaBaka's source map;
        # AniList is used as fallback when MAL returns no characters/staff.
        person_result: "dict | None" = None
        if self._person_updater:
            # The conditional must be evaluated before the `or` fallback:
            # "a if md else None or b" parses as "a if md else (None or b)",
            # which skips the title lookup whenever metadata exists.
            try:
                mal_id = self._resolve_mal_id(md, builder_title)
            except Exception as exc:
                # A failed lookup must not discard the chapter results.
                print(f" [warn] MAL lookup failed: {exc}")
                mal_id = None
            al_id = _al_id_from_metadata(md) if md else None
            if mal_id or al_id:
                try:
                    person_result = self._person_updater.update_for_manga(
                        mal_id, al_manga_id=al_id)
                    print(f" Persons: chars={person_result['characters'].get('updated')} "
                          f"staff={person_result['staff'].get('updated')}")
                except Exception as exc:
                    person_result = {"error": str(exc)}
                    print(f" Persons: ERROR {exc}")

        return {"chapters": chapter_results, "persons": person_result}

    # ------------------------------------------------------------------
    # Internal: chapter
    # ------------------------------------------------------------------

    def _process_chapter(self, builder: "ComicInfoBuilder", chapter_num: str,
                         chapter_dir: Path, dest_series: Path) -> dict:
        """
        Generates ComicInfo.xml for one chapter, packs it to CBZ, and
        optionally removes the source folder.

        The cover image is saved as "000.<ext>" so it sorts before the
        numbered story pages in the archive (ensuring Image=0 in the
        <Pages> element correctly points to the front cover).

        Never raises: returns {"chapter", "cbz", "ok": True} on success or
        {"chapter", "cbz", "ok": False, "error": <message>} on any failure.
        """
        cbz_path = dest_series / f"{chapter_dir.name}.cbz"
        try:
            builder.chapter = chapter_num
            builder.add_pages_from_folder(chapter_dir, cover_filename="000")
            builder.save_xml(chapter_dir)
            _pack_to_cbz(chapter_dir, cbz_path)
            if self._delete_source:
                shutil.rmtree(chapter_dir)
            return {"chapter": chapter_num, "cbz": str(cbz_path), "ok": True}
        except Exception as exc:
            return {"chapter": chapter_num, "cbz": str(cbz_path),
                    "ok": False, "error": str(exc)}


# --------------------------------------------------------------------------
# Usage example
# --------------------------------------------------------------------------

if __name__ == "__main__":
    import os

    SUWAYOMI_PATH = r"\\192.168.2.2\root\Temp\managdl\mangas"
    KAVITA_PATH = r"\\192.168.2.2\root\ServerData\Kavita\test"
    KAVITA_URL = os.environ.get("KAVITA_URL", "http://192.168.2.2:5000")
    # The API key is a credential and must not be committed to source.
    # Export KAVITA_API_KEY to enable person sync; without it the mover
    # simply skips that step.
    KAVITA_KEY = os.environ.get("KAVITA_API_KEY")

    mover = SuwayomiMover(
        SUWAYOMI_PATH,
        KAVITA_PATH,
        kavita_base_url=KAVITA_URL,
        kavita_api_key=KAVITA_KEY,
        delete_source=False,
    )

    # Process a single series
    result = mover.process_series("Yofukashi no Uta")
    ok = sum(1 for c in result["chapters"] if c["ok"])
    failed = sum(1 for c in result["chapters"] if not c["ok"])
    print(f"\nDone: {ok} ok, {failed} failed")
    for c in result["chapters"]:
        if not c["ok"]:
            print(f" Chapter {c['chapter']}: {c['error']}")

    # Or process everything at once:
    # results = mover.process_all()