Files
manga-mover-and-metadata-co…/src/SuwayomiMover.py
T
johannesbot 615bd1b468
Build and Deploy / build (push) Successful in 32s
Build and Deploy / deploy (push) Successful in 25s
manga matching and WebApp
2026-05-26 20:20:24 +02:00

582 lines
23 KiB
Python

"""
SuwayomiMover.py
================
Moves Suwayomi-downloaded manga chapters to a Kavita library path,
generating enriched ComicInfo.xml metadata and packing each chapter
folder into a CBZ archive. Optionally syncs Kavita person / character
records with MyAnimeList data (AniList is used as a fallback) after each
series is processed.

Suwayomi folder structure (input)
----------------------------------
<suwayomi_path>/
    <Source (lang)>/                e.g. "ComicK Fanmade (EN)"
        <Manga Title>/              e.g. "Yofukashi no Uta"
            Official_Chapter 1/     chapter folder — any prefix is fine
                001.webp
                ...
                ComicInfo.xml       Suwayomi's own basic XML (read + replaced)

Kavita folder structure (output)
---------------------------------
<kavita_path>/
    <Manga Title>/
        Official_Chapter 1.cbz      CBZ archive: images + enriched ComicInfo.xml
        Official_Chapter 2.cbz
        ...

Cover naming convention
-----------------------
The cover image is saved as "000.<ext>" inside each chapter folder so that
it sorts before "001.webp", "002.webp", … in alphabetical order. This
ensures the <Pages Image="0" Type="FrontCover"> assignment in ComicInfo.xml
matches the actual file order inside the CBZ archive.
Dependencies
------------
requests -> pip install requests
Pillow -> pip install pillow (optional, for image dimensions)
ComicInfoBuilder, MangadexVolumeResolver, MangaBakaWorksResolver,
MALResolver, AniListResolver, KavitaPersonUpdater and MatchesCache must
reside in the same directory.
"""
from __future__ import annotations
import re
import shutil
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
import requests
from ComicInfoBuilder import ComicInfoBuilder, _pick_cover_url
from MangadexVolumeResolver import MangaDexVolumeResolver
from MangaBakaWorksResolver import MangaBakaWorksResolver
from MALResolver import MALResolver
from AniListResolver import AniListResolver
from KavitaPersonUpdater import KavitaPersonUpdater
from MatchesCache import MatchesCache
# File suffixes (lower-case, including the leading dot) that are treated as
# page images when sizing, sorting and packing a chapter folder.
_IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".avif"}
# Captures the chapter number — an integer with an optional decimal part —
# that follows "Chapter" / "chapter" and at least one whitespace character.
_CHAPTER_RE = re.compile(r'[Cc]hapter\s+(\d+(?:\.\d+)?)')
# Parenthetical source labels that Suwayomi appends to series names.
# These are not part of the actual title and confuse MangaBaka searches.
# The pattern is anchored to the end of the string and is case-insensitive.
_SOURCE_LABEL_RE = re.compile(
r'\s*\(\s*(?:official|unofficial|fan(?:\s*made)?|scanlation|'
r'bato(?:to)?|mangadex|manga\s*plus|viz|yen\s*press|webtoon)\s*\)\s*$',
re.IGNORECASE,
)
# Characters that Windows (and SMB shares) forbid in path components.
# The colon is deliberately absent: _sanitize_dirname rewrites it to " - "
# before this pattern is applied.
_WIN_ILLEGAL_RE = re.compile(r'[\\/*?"<>|]')
def _natural_key(name: str) -> list:
    """
    Builds a natural-sort key so that "2.webp" orders before "10.webp".

    The name is split on runs of decimal digits; each digit run becomes an
    int and the text between the runs is lower-cased, e.g.
    "Page10b" -> ["page", 10, "b"].
    """
    # re.split with a capturing group alternates text / digits / text / …,
    # so the digit runs are exactly the odd-indexed parts. Deciding by
    # position rather than str.isdigit() avoids int() raising ValueError on
    # text parts such as "²", for which isdigit() is true even though \d
    # does not treat them as digits.
    return [int(part) if index % 2 else part.lower()
            for index, part in enumerate(re.split(r"(\d+)", name))]
def _sanitize_dirname(name: str) -> str:
    """
    Makes a string safe to use as a Windows (or SMB) directory name.

    Rules applied:
    - ": " or ":" surrounded by optional spaces -> " - "
      ("Call of the Night: Paradise Arc" -> "Call of the Night - Paradise Arc")
    - Remaining Windows-illegal chars (\\ / * ? " < > |) are stripped.
    - Leading/trailing dots and spaces are removed (Windows restriction).
    - If nothing is left after these steps, "_" is returned instead of an
      empty string.
    """
    name = re.sub(r"\s*:\s*", " - ", name)
    name = _WIN_ILLEGAL_RE.sub("", name)
    # An empty path component would make `root / name` resolve to `root`
    # itself, so a title consisting only of stripped characters must still
    # yield a usable directory name.
    return name.strip(". ") or "_"
# ComicInfo.xml element names that are copied out of Suwayomi's own XML.
_SUWAYOMI_WANTED = {"Title", "Series", "Number", "Summary",
                    "Writer", "Penciller", "Genre", "Web",
                    "Year", "Month", "Day"}


def _read_suwayomi_fields(chapter_dir: Path) -> dict:
    """
    Reads metadata from Suwayomi's ComicInfo.xml inside a chapter folder.

    Returns a dict of whichever wanted fields are present and non-blank,
    with surrounding whitespace stripped, e.g.:
    {"Number": "3", "Series": "Dungeon Odyssey", "Title": "Chapter 3", ...}

    Returns an empty dict if the file is missing, unreadable or unparseable.
    """
    # Attempt the parse directly instead of checking is_file() first: this
    # closes the check-then-open race and also covers I/O failures (missing
    # file, permission problems, dropped network share), all of which are
    # OSError subclasses.
    try:
        root = ET.parse(chapter_dir / "ComicInfo.xml").getroot()
    except (OSError, ET.ParseError):
        return {}

    fields = {}
    for child in root:
        # Drop a "{namespace}" prefix, if any, so only the local name is compared.
        tag = child.tag.split("}")[-1]
        text = (child.text or "").strip()
        if tag in _SUWAYOMI_WANTED and text:
            fields[tag] = text
    return fields
def _clean_suwayomi_title(title: str) -> str:
    """
    Strips a trailing Suwayomi source annotation from a series title.

    Suwayomi sometimes appends the translation group / source type in
    parentheses, e.g. "Wistoria: Wand and Sword (Official)". Such a label is
    not part of the canonical title and breaks MangaBaka / MAL lookups, so
    it is removed and the remaining text is returned without surrounding
    whitespace.
    """
    without_label = _SOURCE_LABEL_RE.sub("", title)
    return without_label.strip()
def _source_id_from_metadata(md: dict, aliases: tuple) -> "int | None":
    """
    Returns the first integer "id" found under a matching source key.

    Keys of md["source"] are compared after lower-casing and removing every
    character that is not a-z or 0-9, so "My_Anime_List" matches the alias
    "myanimelist". Entries that are not dicts, have no id, or whose id
    cannot be converted to int are skipped. Returns None when nothing
    matches or when md["source"] is missing or not a mapping.
    """
    sources = md.get("source")
    if not isinstance(sources, dict):
        return None
    for raw_key, info in sources.items():
        if re.sub(r"[^a-z0-9]", "", raw_key.lower()) not in aliases:
            continue
        if not isinstance(info, dict):
            continue
        candidate = info.get("id")
        if candidate is None:
            continue
        try:
            return int(candidate)
        except (TypeError, ValueError):
            # Unusable id under this key; a later matching key may still work.
            continue
    return None


def _mal_id_from_metadata(md: dict) -> "int | None":
    """Extracts the MAL ID from a MangaBaka series dict's source map."""
    return _source_id_from_metadata(md, ("myanimelist", "mal"))


def _al_id_from_metadata(md: dict) -> "int | None":
    """Extracts the AniList ID from a MangaBaka series dict's source map."""
    return _source_id_from_metadata(md, ("anilist",))
def _chapter_image_size(chapter_dir: Path) -> int:
    """
    Adds up the file sizes (st_size) of the image files that sit directly
    inside a chapter folder. Non-files and files whose suffix is not in
    _IMAGE_EXTS are ignored.
    """
    total = 0
    for entry in chapter_dir.iterdir():
        if not entry.is_file():
            continue
        if entry.suffix.lower() not in _IMAGE_EXTS:
            continue
        total += entry.stat().st_size
    return total
def _deduplicate_chapters(
    chapter_items: list[tuple[Path, dict, str]],
) -> tuple[list[tuple[Path, dict, str]], list[Path]]:
    """
    Collapses chapter folders that carry the same chapter number.

    Items are (chapter_dir, fields, chapter_num) tuples. Two items are
    duplicates only when their chapter_num strings are exactly equal, so
    "2" and "2.2" never collide. Among duplicates the folder with the
    largest total image size wins (used as a proxy for image quality); on a
    tie the folder seen first is kept.

    Returns
    -------
    kept     : one item per chapter number, ordered by the first appearance
               of each number in the input
    rejected : Paths of the losing duplicate folders
    """
    winners: dict[str, tuple[Path, dict, str]] = {}
    winner_sizes: dict[str, int] = {}
    losers: list[Path] = []

    for candidate in chapter_items:
        candidate_dir, _fields, number = candidate
        candidate_size = _chapter_image_size(candidate_dir)

        # First folder seen for this number: it wins by default.
        if number not in winners:
            winners[number] = candidate
            winner_sizes[number] = candidate_size
            continue

        current_dir = winners[number][0]
        current_size = winner_sizes[number]
        if candidate_size > current_size:
            print(f" [dup] ch.{number}: replacing {current_dir.name!r} "
                  f"({current_size:,}B) with {candidate_dir.name!r} "
                  f"({candidate_size:,}B) — higher quality")
            losers.append(current_dir)
            winners[number] = candidate
            winner_sizes[number] = candidate_size
        else:
            print(f" [dup] ch.{number}: skipping {candidate_dir.name!r} "
                  f"({candidate_size:,}B), keeping {current_dir.name!r} "
                  f"({current_size:,}B)")
            losers.append(candidate_dir)

    return list(winners.values()), losers
def _extract_chapter_num(folder_name: str) -> "str | None":
    """
    Fallback: pulls the chapter number out of a chapter folder name.

    Returns the number as text, or None when the name does not match
    _CHAPTER_RE.
    Examples: "Chapter 10" -> "10", "Official_Chapter 10.5" -> "10.5"
    """
    match = _CHAPTER_RE.search(folder_name)
    if match is None:
        return None
    return match.group(1)
def _chapter_sort_key(folder_name: str) -> tuple:
    """
    Numeric sort key for chapter folder names.

    Yields (chapter number as float, folder name); names without a
    recognisable chapter number get +infinity and therefore sort last.
    """
    number = _extract_chapter_num(folder_name)
    primary = float("inf") if number is None else float(number)
    return (primary, folder_name)
def _pack_to_cbz(folder: Path, dest: Path) -> None:
    """
    Packs all files directly inside `folder` into a CBZ archive at `dest`.

    Images are stored first, in natural-sort order (so "000.jpg" comes
    before "001.webp"). All other files (e.g. ComicInfo.xml) are appended
    after them, sorted by name, so image indices in the archive match the
    <Pages> entries written by ComicInfoBuilder.

    Files are stored without compression (ZIP_STORED) since the source
    images are already compressed (webp / jpg / png / …).

    The archive is first written to "<dest name>.part" next to `dest` and
    renamed onto `dest` only after it has been written completely, so a
    failed pack never leaves a truncated .cbz at the final path. Any
    exception is re-raised after the partial file has been removed.
    """
    # One directory scan; both lists are derived from the same snapshot.
    files = [f for f in folder.iterdir() if f.is_file()]
    images = sorted(
        (f for f in files if f.suffix.lower() in _IMAGE_EXTS),
        key=lambda p: _natural_key(p.name),
    )
    extras = sorted(
        (f for f in files if f.suffix.lower() not in _IMAGE_EXTS),
        key=lambda p: p.name,
    )

    partial = dest.with_name(dest.name + ".part")
    try:
        with zipfile.ZipFile(partial, "w", zipfile.ZIP_STORED) as zf:
            for f in images + extras:
                zf.write(f, f.name)
        # Path.replace overwrites an existing archive, like mode "w" did.
        partial.replace(dest)
    except BaseException:
        # Best-effort cleanup; the original error is what the caller needs.
        try:
            partial.unlink()
        except OSError:
            pass
        raise
class SuwayomiMover:
    """
    Scans a Suwayomi download directory, generates enriched ComicInfo.xml
    for each chapter, packs each chapter folder into a CBZ archive, and
    places the result in a Kavita library path.

    Parameters
    ----------
    suwayomi_path   : Root of Suwayomi downloads.
                      Expected layout: <root>/<Source>/<Title>/<Chapter N>/
    kavita_path     : Root of the Kavita library.
                      Series sub-directories are created automatically.
    kavita_base_url : Kavita server URL — required only for person sync,
                      e.g. "http://192.168.2.2:5000".
    kavita_api_key  : Kavita API key — required only for person sync.
                      Person sync is enabled only when both the URL and the
                      key are given.
    language        : ComicInfo LanguageISO and SeriesSort language ("en").
    request_timeout : HTTP timeout in seconds for all API / image requests.
    delete_source   : Remove the source chapter folder after a successful
                      pack, and remove duplicate chapter folders that lost
                      the de-duplication.
    matches_cache   : Optional MatchesCache handed to ComicInfoBuilder;
                      required by build_matches_only().
    api_base_url    : Base URL of the MangaBaka API (a trailing "/" is
                      stripped).
    """

    def __init__(self,
                 suwayomi_path,
                 kavita_path,
                 *,
                 kavita_base_url: "str | None" = None,
                 kavita_api_key: "str | None" = None,
                 language: str = "en",
                 request_timeout: int = 30,
                 delete_source: bool = True,
                 matches_cache: "MatchesCache | None" = None,
                 api_base_url: str = "https://api.mangabaka.dev/v1"):
        self._src = Path(suwayomi_path)
        self._dst = Path(kavita_path)
        self._language = language
        self._timeout = request_timeout
        self._delete_source = delete_source
        self._matches_cache = matches_cache
        self._api_base_url = api_base_url.rstrip("/")

        # Shared HTTP session and resolvers — reused across all series/chapters
        # to maximise cache hits and minimise API round-trips.
        session = requests.Session()
        # A requests Session already carries a default User-Agent, so
        # setdefault() would silently keep it; assign explicitly.
        session.headers["User-Agent"] = "SuwayomiMover/1.0"
        self._session = session

        self._mal = MALResolver(request_timeout=request_timeout)
        self._al = AniListResolver(request_timeout=request_timeout)
        self._vol_resolver = MangaDexVolumeResolver(
            request_timeout=request_timeout, session=session)
        self._works_resolver = MangaBakaWorksResolver(
            request_timeout=request_timeout, session=session)

        # Person sync is optional and needs both the server URL and a key.
        self._person_updater: "KavitaPersonUpdater | None" = None
        if kavita_base_url and kavita_api_key:
            self._person_updater = KavitaPersonUpdater(
                kavita_base_url, kavita_api_key,
                mal_resolver=self._mal,
                al_resolver=self._al,
                request_timeout=request_timeout)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def process_all(self) -> dict:
        """
        Processes every manga series found under the Suwayomi root.

        Walks two directory levels deep:
            <suwayomi_path>/<Source dir>/<Manga Title>/

        Returns a dict keyed by manga title (the series folder name), each
        value being the result dict from _process_series_dir. If the same
        title exists under several sources, the result of the source that
        sorts last replaces the earlier ones in the returned dict.
        """
        results: dict = {}
        for source_dir in sorted(self._src.iterdir()):
            if not source_dir.is_dir():
                continue
            for manga_dir in sorted(source_dir.iterdir()):
                if not manga_dir.is_dir():
                    continue
                title = manga_dir.name
                print(f"[SuwayomiMover] {title}")
                results[title] = self._process_series_dir(manga_dir)
        return results

    def process_series(self, manga_title: str) -> dict:
        """
        Processes all chapters for a single series, located by title.

        Searches every source sub-directory under the Suwayomi root, in
        sorted order, for a directory whose name matches `manga_title`
        exactly; the first match is processed and its result returned.

        Raises FileNotFoundError if no matching directory is found.
        """
        for source_dir in sorted(self._src.iterdir()):
            if not source_dir.is_dir():
                continue
            candidate = source_dir / manga_title
            if candidate.is_dir():
                return self._process_series_dir(candidate)
        raise FileNotFoundError(
            f"No Suwayomi directory found for '{manga_title}' under {self._src}")

    def build_matches_only(self) -> dict:
        """
        Walks every series under the Suwayomi root and resolves each one
        to a MangaBaka match — nothing else.

        For every series:
        - Reads chapter ComicInfo.xml files in chapter order and takes the
          first non-empty Series name (falls back to the folder name).
        - Cleans the name (strips source labels) the same way the real
          move pipeline does.
        - If the title is already in the matches cache, skips it.
        - Otherwise issues a MangaBaka search and adds the top hit to
          the cache. (NOTE(review): persistence to disk is up to
          MatchesCache.add — not visible from this class.)

        A failed search is reported and skipped; it does not abort the run.

        Returns the full cache contents (MatchesCache.all()).
        Raises RuntimeError if no MatchesCache was supplied.
        """
        if self._matches_cache is None:
            raise RuntimeError(
                "build_matches_only requires a MatchesCache instance")
        search_url = f"{self._api_base_url}/series/search"
        for source_dir in sorted(self._src.iterdir()):
            if not source_dir.is_dir():
                continue
            for manga_dir in sorted(source_dir.iterdir()):
                if not manga_dir.is_dir():
                    continue
                raw_series = manga_dir.name
                for chapter_dir in sorted(manga_dir.iterdir(),
                                          key=lambda p: _chapter_sort_key(p.name)):
                    if chapter_dir.is_dir():
                        fields = _read_suwayomi_fields(chapter_dir)
                        if fields.get("Series"):
                            raw_series = fields["Series"]
                            break
                builder_title = _clean_suwayomi_title(raw_series)
                if self._matches_cache.get(builder_title):
                    print(f"[matches] {builder_title} — cached")
                    continue
                print(f"[matches] {builder_title} — searching")
                try:
                    resp = self._session.get(
                        search_url,
                        params={"q": builder_title, "page": 1, "limit": 1},
                        timeout=self._timeout)
                    resp.raise_for_status()
                    data = resp.json().get("data") or []
                    if not data:
                        print(f" [warn] no MangaBaka match for {builder_title!r}")
                        continue
                    series = data[0]
                    self._matches_cache.add(
                        builder_title,
                        mangabaka_id=series.get("id"),
                        mangabaka_name=series.get("title") or "",
                        image_url=_pick_cover_url(series.get("cover")),
                    )
                except Exception as exc:
                    # Deliberately broad: one failing series must not stop
                    # the scan of the remaining ones.
                    print(f" [warn] search failed for {builder_title!r}: {exc}")
        return self._matches_cache.all()

    # ------------------------------------------------------------------
    # Internal: series
    # ------------------------------------------------------------------
    def _process_series_dir(self, manga_dir: Path) -> dict:
        """
        Processes every chapter folder of one series directory.

        Steps: read each chapter's ComicInfo.xml, drop duplicate chapter
        numbers, fetch MangaBaka metadata once, pack every remaining
        chapter into the destination series folder, then (if person sync is
        configured) update Kavita persons once for the series.

        Returns {"chapters": [per-chapter result dicts],
                 "persons": person-sync result, {"error": ...} or None}.
        """
        manga_title = manga_dir.name
        chapter_dirs = sorted(
            (d for d in manga_dir.iterdir() if d.is_dir()),
            key=lambda p: _chapter_sort_key(p.name),
        )

        # Read all chapter XMLs upfront to resolve chapter numbers and series name.
        chapter_items: list[tuple[Path, dict, str]] = []
        for chapter_dir in chapter_dirs:
            fields = _read_suwayomi_fields(chapter_dir)
            chapter_num = (fields.get("Number")
                           or _extract_chapter_num(chapter_dir.name))
            if chapter_num is None:
                print(f" [skip] {chapter_dir.name} — no chapter number")
                continue
            chapter_items.append((chapter_dir, fields, chapter_num))

        chapter_items, rejected_dirs = _deduplicate_chapters(chapter_items)
        if self._delete_source:
            for d in rejected_dirs:
                shutil.rmtree(d, ignore_errors=True)

        # <Series> from the first chapter's XML → strip source labels → clean title
        # for the MangaBaka search. Folder name is the last resort.
        raw_series = manga_title
        if chapter_items:
            xml_series = chapter_items[0][1].get("Series")
            if xml_series:
                raw_series = xml_series
        builder_title = _clean_suwayomi_title(raw_series)

        # One builder per series — metadata fetched once, reused for all chapters.
        builder = ComicInfoBuilder(
            builder_title, chapter=1,
            api_base_url=self._api_base_url,
            language=self._language,
            request_timeout=self._timeout,
            session=self._session,
            volume_resolver=self._vol_resolver,
            works_resolver=self._works_resolver,
            mal_resolver=self._mal,
            al_resolver=self._al,
            matches_cache=self._matches_cache,
        )

        # Fetch MangaBaka metadata now to get the canonical title and MAL ID.
        # On failure the series is still processed under its folder name.
        md: "dict | None" = None
        mangabaka_title = manga_title
        try:
            md = builder.fetch_metadata()
            mangabaka_title = md.get("title") or manga_title
        except Exception as exc:
            print(f" [warn] metadata fetch failed: {exc}")

        # Destination folder uses the MangaBaka canonical title, sanitized for
        # Windows / SMB paths (no colons, illegal chars, leading/trailing dots).
        dest_series = self._dst / _sanitize_dirname(mangabaka_title)
        dest_series.mkdir(parents=True, exist_ok=True)

        chapter_results: list[dict] = []
        for chapter_dir, _fields, chapter_num in chapter_items:
            result = self._process_chapter(
                builder, chapter_num, chapter_dir, dest_series)
            chapter_results.append(result)
            status = "ok" if result["ok"] else f"ERROR: {result.get('error')}"
            print(f" Chapter {chapter_num}: {status}")

        # Sync Kavita persons once per series.
        # Both MAL and AniList IDs come from MangaBaka's source map;
        # AniList is used as fallback when MAL returns no characters/staff.
        person_result: "dict | None" = None
        if self._person_updater:
            # Written as two statements on purpose: the one-expression form
            # `a if md else None or b` binds as `a if md else (None or b)`,
            # which skipped the title lookup whenever metadata existed but
            # carried no MAL id.
            mal_id = _mal_id_from_metadata(md) if md else None
            if not mal_id:
                mal_id = self._mal.find_mal_id(builder_title)
            al_id = _al_id_from_metadata(md) if md else None
            if mal_id or al_id:
                try:
                    person_result = self._person_updater.update_for_manga(
                        mal_id, al_manga_id=al_id)
                    print(f" Persons: chars={person_result['characters'].get('updated')} "
                          f"staff={person_result['staff'].get('updated')}")
                except Exception as exc:
                    person_result = {"error": str(exc)}
                    print(f" Persons: ERROR {exc}")

        return {"chapters": chapter_results, "persons": person_result}

    # ------------------------------------------------------------------
    # Internal: chapter
    # ------------------------------------------------------------------
    def _process_chapter(self,
                         builder: "ComicInfoBuilder",
                         chapter_num: str,
                         chapter_dir: Path,
                         dest_series: Path) -> dict:
        """
        Generates ComicInfo.xml for one chapter, packs it to CBZ, and
        optionally removes the source folder.

        The cover image is saved as "000.<ext>" so it sorts before the
        numbered story pages in the archive (ensuring Image=0 in the
        <Pages> element correctly points to the front cover).

        Never raises: returns {"chapter", "cbz", "ok": True} on success and
        the same keys with "ok": False plus "error" (the exception text)
        on failure. The source folder is only removed after a successful
        pack.
        """
        cbz_path = dest_series / f"{chapter_dir.name}.cbz"
        try:
            builder.chapter = chapter_num
            builder.add_pages_from_folder(chapter_dir, cover_filename="000")
            builder.save_xml(chapter_dir)
            _pack_to_cbz(chapter_dir, cbz_path)
            if self._delete_source:
                shutil.rmtree(chapter_dir)
            return {"chapter": chapter_num, "cbz": str(cbz_path), "ok": True}
        except Exception as exc:
            return {"chapter": chapter_num, "cbz": str(cbz_path),
                    "ok": False, "error": str(exc)}
# --------------------------------------------------------------------------
# Usage example
# --------------------------------------------------------------------------
if __name__ == "__main__":
    import os

    # Settings are taken from the environment; the path / URL defaults are
    # the values this example used before.
    SUWAYOMI_PATH = os.environ.get(
        "SUWAYOMI_PATH", r"\\192.168.2.2\root\Temp\managdl\mangas")
    KAVITA_PATH = os.environ.get(
        "KAVITA_PATH", r"\\192.168.2.2\root\ServerData\Kavita\test")
    KAVITA_URL = os.environ.get("KAVITA_URL", "http://192.168.2.2:5000")
    # SECURITY: the API key must never be committed with the source. A key
    # that was previously hard-coded here should be considered leaked and
    # rotated in Kavita. Without KAVITA_API_KEY, person sync is skipped.
    KAVITA_KEY = os.environ.get("KAVITA_API_KEY")

    mover = SuwayomiMover(
        SUWAYOMI_PATH,
        KAVITA_PATH,
        kavita_base_url=KAVITA_URL,
        kavita_api_key=KAVITA_KEY,
        delete_source=False
    )

    # Process a single series
    result = mover.process_series("Yofukashi no Uta")
    ok = sum(1 for c in result["chapters"] if c["ok"])
    failed = sum(1 for c in result["chapters"] if not c["ok"])
    print(f"\nDone: {ok} ok, {failed} failed")
    for c in result["chapters"]:
        if not c["ok"]:
            print(f" Chapter {c['chapter']}: {c['error']}")

    # Or process everything at once:
    # results = mover.process_all()