# manga-mover-and-metadata-co…/src/SuwayomiMover.py

"""
suwayomi_mover.py
=================

Moves Suwayomi-downloaded manga chapters to a Kavita library path,
generating enriched ComicInfo.xml metadata and packing each chapter
folder into a CBZ archive.  Optionally syncs Kavita person / character
records with MyAnimeList data after each series is processed.

Suwayomi folder structure (input)
----------------------------------
  <suwayomi_path>/
    <Source (lang)>/                    e.g. "ComicK Fanmade (EN)"
      <Manga Title>/                    e.g. "Yofukashi no Uta"
        Official_Chapter 1/             chapter folder — any prefix is fine
          001.webp
          ...
          ComicInfo.xml                 Suwayomi's own basic XML (read + replaced)

Kavita folder structure (output)
---------------------------------
  <kavita_path>/
    <Manga Title>/
      Official_Chapter 1.cbz           CBZ archive: images + enriched ComicInfo.xml
      Official_Chapter 2.cbz
      ...

Cover naming convention
-----------------------
The cover image is saved as "000.<ext>" inside each chapter folder so that
it sorts before "001.webp", "002.webp", … in alphabetical order.  This
ensures the <Pages Image="0" Type="FrontCover"> assignment in ComicInfo.xml
matches the actual file order inside the CBZ archive.

Dependencies
------------
    requests    -> pip install requests
    Pillow      -> pip install pillow   (optional, for image dimensions)

    ComicInfoBuilder, MangadexVolumeResolver, MangaBakaWorksResolver,
    MALResolver, AniListResolver, KavitaPersonUpdater and MatchesCache
    must reside in the same directory.
"""

from __future__ import annotations

import re
import shutil
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path

import requests

from ComicInfoBuilder import ComicInfoBuilder, _pick_cover_url
from MangadexVolumeResolver import MangaDexVolumeResolver
from MangaBakaWorksResolver import MangaBakaWorksResolver
from MALResolver import MALResolver
from AniListResolver import AniListResolver
from KavitaPersonUpdater import KavitaPersonUpdater
from MatchesCache import MatchesCache


# Lower-case file extensions (with leading dot) that count as page images
# when sizing a chapter folder and when packing it into a CBZ archive.
_IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".avif"}
# Matches "Chapter" / "chapter" followed by whitespace and a number with an
# optional decimal part; group 1 is the number ("Chapter 10.5" -> "10.5").
_CHAPTER_RE = re.compile(r'[Cc]hapter\s+(\d+(?:\.\d+)?)')

# Parenthetical source labels that Suwayomi appends to series names.
# These are not part of the actual title and confuse MangaBaka searches.
# The pattern is case-insensitive and anchored to the end of the string, so
# only a single trailing "(label)" is matched.
_SOURCE_LABEL_RE = re.compile(
    r'\s*\(\s*(?:official|unofficial|fan(?:\s*made)?|scanlation|'
    r'bato(?:to)?|mangadex|manga\s*plus|viz|yen\s*press|webtoon)\s*\)\s*$',
    re.IGNORECASE,
)

# Characters that Windows (and SMB shares) forbid in path components.
# ":" is not listed here; _sanitize_dirname replaces it with " - " before
# applying this pattern.
_WIN_ILLEGAL_RE = re.compile(r'[\\/*?"<>|]')


def _natural_key(name: str) -> list:
    return [int(p) if p.isdigit() else p.lower()
            for p in re.split(r"(\d+)", name)]


def _sanitize_dirname(name: str) -> str:
    """
    Makes a string safe to use as a Windows (or SMB) directory name.

    Rules applied:
      - ": " or ":" surrounded by optional spaces -> " - "
        ("Call of the Night: Paradise Arc" -> "Call of the Night - Paradise Arc")
      - Remaining Windows-illegal chars (\\ / * ? " < > |) are stripped.
      - Leading/trailing dots and spaces are removed (Windows restriction).
      - If nothing is left (e.g. the input was "???"), "_" is returned.

    Returns a non-empty string.
    """
    name = re.sub(r"\s*:\s*", " - ", name)
    name = _WIN_ILLEGAL_RE.sub("", name)
    # An empty component would make `parent / name` resolve to the parent
    # directory itself, so fall back to a placeholder name.
    return name.strip(". ") or "_"


_SUWAYOMI_WANTED = {"Title", "Series", "Number", "Summary",
                    "Writer", "Penciller", "Genre", "Web",
                    "Year", "Month", "Day"}


def _read_suwayomi_fields(chapter_dir: Path) -> dict:
    """
    Reads metadata from Suwayomi's ComicInfo.xml inside a chapter folder.

    Returns a dict of whichever fields are present, e.g.:
      {"Number": "3", "Series": "Dungeon Odyssey", "Title": "Chapter 3", ...}
    Returns an empty dict if the file is missing or unparseable.
    """
    xml_path = chapter_dir / "ComicInfo.xml"
    if not xml_path.is_file():
        return {}
    try:
        root = ET.parse(xml_path).getroot()
    except ET.ParseError:
        return {}
    result = {}
    for child in root:
        tag = child.tag.split("}")[-1]
        if tag in _SUWAYOMI_WANTED and child.text and child.text.strip():
            result[tag] = child.text.strip()
    return result


def _clean_suwayomi_title(title: str) -> str:
    """
    Strips a trailing Suwayomi source annotation from a series title.

    Suwayomi may append the translation group / source type in parentheses,
    e.g. "Wistoria: Wand and Sword (Official)".  Such a label is not part of
    the canonical title and breaks MangaBaka / MAL lookups, so a label
    matched by _SOURCE_LABEL_RE is removed and the remainder is trimmed.
    """
    without_label = _SOURCE_LABEL_RE.sub("", title)
    return without_label.strip()


def _mal_id_from_metadata(md: dict) -> "int | None":
    """Extracts the MAL ID from a MangaBaka series dict's source map."""
    for raw_key, info in (md.get("source") or {}).items():
        if re.sub(r"[^a-z0-9]", "", raw_key.lower()) in ("myanimelist", "mal"):
            if isinstance(info, dict):
                mal_id = info.get("id")
                if mal_id is not None:
                    try:
                        return int(mal_id)
                    except (TypeError, ValueError):
                        pass
    return None


def _al_id_from_metadata(md: dict) -> "int | None":
    """Extracts the AniList ID from a MangaBaka series dict's source map."""
    for raw_key, info in (md.get("source") or {}).items():
        if re.sub(r"[^a-z0-9]", "", raw_key.lower()) == "anilist":
            if isinstance(info, dict):
                al_id = info.get("id")
                if al_id is not None:
                    try:
                        return int(al_id)
                    except (TypeError, ValueError):
                        pass
    return None


def _chapter_image_size(chapter_dir: Path) -> int:
    """
    Sums the on-disk size, in bytes, of every image file directly inside
    `chapter_dir` (files whose extension is listed in _IMAGE_EXTS).
    """
    total = 0
    for entry in chapter_dir.iterdir():
        if not entry.is_file():
            continue
        if entry.suffix.lower() not in _IMAGE_EXTS:
            continue
        total += entry.stat().st_size
    return total


def _deduplicate_chapters(
    chapter_items: list[tuple[Path, dict, str]],
) -> tuple[list[tuple[Path, dict, str]], list[Path]]:
    """
    Collapses chapter folders that carry the same chapter number.

    Items are (folder, fields, chapter_number) tuples.  Chapter numbers are
    compared as exact strings, so "2" and "2.2" are distinct.  Among folders
    sharing a number, the one whose images occupy the most bytes is kept
    (total size is used as a proxy for image quality); on a tie the folder
    seen first stays.  Each decision is reported on stdout.

    Returns
    -------
    kept     : surviving items, ordered by first appearance of each number
    rejected : folders that lost against a same-numbered sibling
    """
    # chapter number -> (winning item, its total image size)
    winners: dict = {}
    losers: list = []

    for candidate in chapter_items:
        folder, _fields, number = candidate
        folder_size = _chapter_image_size(folder)

        incumbent = winners.get(number)
        if incumbent is None:
            winners[number] = (candidate, folder_size)
            continue

        kept_item, kept_size = incumbent
        kept_folder = kept_item[0]
        if folder_size > kept_size:
            print(f"  [dup] ch.{number}: replacing {kept_folder.name!r} "
                  f"({kept_size:,}B) with {folder.name!r} "
                  f"({folder_size:,}B) — higher quality")
            losers.append(kept_folder)
            # Re-assigning an existing key keeps its original position.
            winners[number] = (candidate, folder_size)
        else:
            print(f"  [dup] ch.{number}: skipping {folder.name!r} "
                  f"({folder_size:,}B), keeping {kept_folder.name!r} "
                  f"({kept_size:,}B)")
            losers.append(folder)

    return [item for item, _size in winners.values()], losers


def _extract_chapter_num(folder_name: str) -> "str | None":
    """
    Pulls the chapter number out of a folder name using _CHAPTER_RE.

    "Chapter 10" -> "10", "Official_Chapter 10.5" -> "10.5"; None when the
    name contains no match.
    """
    match = _CHAPTER_RE.search(folder_name)
    if match is None:
        return None
    return match.group(1)


def _chapter_sort_key(folder_name: str) -> tuple:
    """
    Sort key ordering chapter folders by their numeric chapter number.

    Folders without a recognisable number sort last (infinity); the folder
    name itself is the tie-breaker.
    """
    chapter = _extract_chapter_num(folder_name)
    position = float("inf") if chapter is None else float(chapter)
    return (position, folder_name)


def _pack_to_cbz(folder: Path, dest: Path) -> None:
    """
    Packs all files directly inside `folder` into a CBZ archive at `dest`.

    Images (extensions in _IMAGE_EXTS) are stored first, in natural-sort
    order (so "000.jpg" < "001.webp").  Every other file — in practice
    ComicInfo.xml — is appended afterwards, sorted by name, so image indices
    in the archive match the <Pages> entries written by ComicInfoBuilder.
    Files are stored without compression (ZIP_STORED) since the source
    images are already compressed (webp / jpg / png / …).

    The archive is first written to "<dest name>.part" next to `dest` and
    renamed over `dest` only once complete, so a failure never leaves a
    truncated .cbz at the destination.  Any error is re-raised after the
    partial file has been removed.
    """
    images: list = []
    extras: list = []
    # Single directory scan; sub-directories are ignored.
    for entry in folder.iterdir():
        if not entry.is_file():
            continue
        if entry.suffix.lower() in _IMAGE_EXTS:
            images.append(entry)
        else:
            extras.append(entry)
    images.sort(key=lambda p: _natural_key(p.name))
    # Directory listing order is unspecified; sort for reproducible archives.
    extras.sort(key=lambda p: p.name)

    partial = dest.with_name(dest.name + ".part")
    try:
        with zipfile.ZipFile(partial, "w", zipfile.ZIP_STORED) as zf:
            for f in images:
                zf.write(f, f.name)
            for f in extras:
                zf.write(f, f.name)
        partial.replace(dest)
    except BaseException:
        # Best-effort cleanup of the incomplete archive, then propagate.
        try:
            partial.unlink()
        except OSError:
            pass
        raise


class SuwayomiMover:
    """
    Scans a Suwayomi download directory, generates enriched ComicInfo.xml
    for each chapter, packs each chapter folder into a CBZ archive, and
    moves the result to a Kavita library path.

    Parameters
    ----------
    suwayomi_path   : Root of Suwayomi downloads.
                      Expected layout: <root>/<Source>/<Title>/<Chapter N>/
    kavita_path     : Root of the Kavita library.
                      Series sub-directories are created automatically.
    kavita_base_url : Kavita server URL — required only for person sync,
                      e.g. "http://192.168.2.2:5000".
    kavita_api_key  : Kavita API key   — required only for person sync.
    language        : ComicInfo LanguageISO and SeriesSort language ("en").
    request_timeout : HTTP timeout in seconds for all API / image requests.
    delete_source   : Remove the source chapter folder after successful pack
                      (and remove lower-quality duplicate chapter folders).
    matches_cache   : Optional MatchesCache handed to ComicInfoBuilder;
                      mandatory for build_matches_only().
    api_base_url    : Base URL of the MangaBaka API; a trailing "/" is
                      stripped.

    Person sync is enabled only when both kavita_base_url and
    kavita_api_key are given.
    """

    def __init__(self,
                 suwayomi_path,
                 kavita_path,
                 *,
                 kavita_base_url: "str | None" = None,
                 kavita_api_key: "str | None" = None,
                 language: str = "en",
                 request_timeout: int = 30,
                 delete_source: bool = True,
                 matches_cache: "MatchesCache | None" = None,
                 api_base_url: str = "https://api.mangabaka.dev/v1"):
        self._src = Path(suwayomi_path)
        self._dst = Path(kavita_path)
        self._language = language
        self._timeout = request_timeout
        self._delete_source = delete_source
        self._matches_cache = matches_cache
        self._api_base_url = api_base_url.rstrip("/")

        # Shared HTTP session and resolvers — reused across all series/chapters
        # to maximise cache hits and minimise API round-trips.
        session = requests.Session()
        session.headers.setdefault("User-Agent", "SuwayomiMover/1.0")
        self._session = session

        self._mal = MALResolver(request_timeout=request_timeout)
        self._al  = AniListResolver(request_timeout=request_timeout)
        self._vol_resolver = MangaDexVolumeResolver(
            request_timeout=request_timeout, session=session)
        self._works_resolver = MangaBakaWorksResolver(
            request_timeout=request_timeout, session=session)

        # Stays None (person sync disabled) unless both URL and key are set.
        self._person_updater: "KavitaPersonUpdater | None" = None
        if kavita_base_url and kavita_api_key:
            self._person_updater = KavitaPersonUpdater(
                kavita_base_url, kavita_api_key,
                mal_resolver=self._mal,
                al_resolver=self._al,
                request_timeout=request_timeout)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def process_all(self) -> dict:
        """
        Processes every manga series found under the Suwayomi root.

        Walks two directory levels deep:
          <suwayomi_path>/<Source dir>/<Manga Title>/

        Returns a dict keyed by manga folder name, each value being the
        result dict from _process_series_dir.  If the same folder name
        exists under several sources, the result of the last one processed
        replaces the earlier entries.
        """
        results: dict = {}
        for manga_dir in self._iter_series_dirs():
            title = manga_dir.name
            print(f"[SuwayomiMover] {title}")
            results[title] = self._process_series_dir(manga_dir)
        return results

    def process_series(self, manga_title: str) -> dict:
        """
        Processes all chapters for a single series, located by title.

        Searches every source sub-directory under the Suwayomi root (in
        sorted order) for a directory whose name matches `manga_title`
        exactly and processes the first one found.
        Raises FileNotFoundError if no matching directory is found.
        """
        for source_dir in sorted(self._src.iterdir()):
            if not source_dir.is_dir():
                continue
            candidate = source_dir / manga_title
            if candidate.is_dir():
                return self._process_series_dir(candidate)
        raise FileNotFoundError(
            f"No Suwayomi directory found for '{manga_title}' under {self._src}")

    def build_matches_only(self) -> dict:
        """
        Walks every series under the Suwayomi root and resolves each one
        to a MangaBaka match — nothing else.

        For every series:
          - Takes the Series name from the first chapter folder (in chapter
            order) whose ComicInfo.xml provides one; falls back to the
            folder name.
          - Cleans the name (strips source labels) the same way the real
            move pipeline does.
          - If the title is already in the matches cache, skips it.
          - Otherwise issues a MangaBaka search and adds the top hit to
            the cache.  A failed or empty search only prints a warning.

        Returns the full cache contents (MatchesCache.all()).
        Raises RuntimeError when no MatchesCache was supplied.
        """
        if self._matches_cache is None:
            raise RuntimeError(
                "build_matches_only requires a MatchesCache instance")

        search_url = f"{self._api_base_url}/series/search"

        for manga_dir in self._iter_series_dirs():
            raw_series = manga_dir.name
            for chapter_dir in sorted(manga_dir.iterdir(),
                                      key=lambda p: _chapter_sort_key(p.name)):
                if chapter_dir.is_dir():
                    fields = _read_suwayomi_fields(chapter_dir)
                    if fields.get("Series"):
                        raw_series = fields["Series"]
                        break

            builder_title = _clean_suwayomi_title(raw_series)

            if self._matches_cache.get(builder_title):
                print(f"[matches] {builder_title} — cached")
                continue

            print(f"[matches] {builder_title} — searching")
            try:
                resp = self._session.get(
                    search_url,
                    params={"q": builder_title, "page": 1, "limit": 1},
                    timeout=self._timeout)
                resp.raise_for_status()
                data = resp.json().get("data") or []
                if not data:
                    print(f"  [warn] no MangaBaka match for {builder_title!r}")
                    continue
                series = data[0]
                self._matches_cache.add(
                    builder_title,
                    mangabaka_id=series.get("id"),
                    mangabaka_name=series.get("title") or "",
                    image_url=_pick_cover_url(series.get("cover")),
                )
            except Exception as exc:
                # Best effort: one failing title must not stop the walk.
                print(f"  [warn] search failed for {builder_title!r}: {exc}")

        return self._matches_cache.all()

    # ------------------------------------------------------------------
    # Internal: traversal
    # ------------------------------------------------------------------
    def _iter_series_dirs(self):
        """Yields every <Source>/<Manga Title> directory in sorted order."""
        for source_dir in sorted(self._src.iterdir()):
            if not source_dir.is_dir():
                continue
            for manga_dir in sorted(source_dir.iterdir()):
                if manga_dir.is_dir():
                    yield manga_dir

    # ------------------------------------------------------------------
    # Internal: series
    # ------------------------------------------------------------------
    def _process_series_dir(self, manga_dir: Path) -> dict:
        """
        Processes every chapter folder of one series directory.

        Returns {"chapters": [<_process_chapter result>, ...],
                 "persons": <person sync result or None>}.
        """
        manga_title = manga_dir.name

        chapter_dirs = sorted(
            (d for d in manga_dir.iterdir() if d.is_dir()),
            key=lambda p: _chapter_sort_key(p.name),
        )

        # Read all chapter XMLs upfront to resolve chapter numbers and series name.
        chapter_items: list[tuple[Path, dict, str]] = []
        for chapter_dir in chapter_dirs:
            fields = _read_suwayomi_fields(chapter_dir)
            chapter_num = (fields.get("Number")
                           or _extract_chapter_num(chapter_dir.name))
            if chapter_num is None:
                print(f"  [skip] {chapter_dir.name} — no chapter number")
                continue
            chapter_items.append((chapter_dir, fields, chapter_num))

        chapter_items, rejected_dirs = _deduplicate_chapters(chapter_items)
        if self._delete_source:
            for d in rejected_dirs:
                shutil.rmtree(d, ignore_errors=True)

        # <Series> from the first chapter's XML → strip source labels → clean title
        # for the MangaBaka search.  Folder name is the last resort.
        raw_series = manga_title
        if chapter_items:
            xml_series = chapter_items[0][1].get("Series")
            if xml_series:
                raw_series = xml_series
        builder_title = _clean_suwayomi_title(raw_series)

        # One builder per series — metadata fetched once, reused for all chapters.
        builder = ComicInfoBuilder(
            builder_title, chapter=1,
            api_base_url=self._api_base_url,
            language=self._language,
            request_timeout=self._timeout,
            session=self._session,
            volume_resolver=self._vol_resolver,
            works_resolver=self._works_resolver,
            mal_resolver=self._mal,
            al_resolver=self._al,
            matches_cache=self._matches_cache,
        )

        # Fetch MangaBaka metadata now to get the canonical title and MAL ID.
        md: "dict | None" = None
        mangabaka_title = manga_title
        try:
            md = builder.fetch_metadata()
            mangabaka_title = md.get("title") or manga_title
        except Exception as exc:
            print(f"  [warn] metadata fetch failed: {exc}")

        # Destination folder uses the MangaBaka canonical title, sanitized for
        # Windows / SMB paths (no colons, illegal chars, leading/trailing dots).
        dest_series = self._dst / _sanitize_dirname(mangabaka_title)
        dest_series.mkdir(parents=True, exist_ok=True)

        chapter_results: list[dict] = []
        for chapter_dir, _fields, chapter_num in chapter_items:
            result = self._process_chapter(
                builder, chapter_num, chapter_dir, dest_series)
            chapter_results.append(result)
            status = "ok" if result["ok"] else f"ERROR: {result.get('error')}"
            print(f"  Chapter {chapter_num}: {status}")

        # Sync Kavita persons once per series.
        person_result = self._sync_persons(md, builder_title)

        return {"chapters": chapter_results, "persons": person_result}

    def _sync_persons(self, md: "dict | None", builder_title: str) -> "dict | None":
        """
        Syncs Kavita person / character records for one series.

        Both MAL and AniList IDs come from MangaBaka's source map; when the
        map yields no MAL ID (or no metadata is available) the MAL resolver
        is asked to find one by title.

        Returns None when person sync is disabled or no ID could be
        determined, the updater's result dict on success, or
        {"error": <message>} when the lookup or the update failed.
        """
        if not self._person_updater:
            return None
        try:
            # The parentheses matter: without them the conditional expression
            # binds looser than `or`, and the title lookup would only run
            # when `md` is missing.
            mal_id = ((_mal_id_from_metadata(md) if md else None)
                      or self._mal.find_mal_id(builder_title))
            al_id = _al_id_from_metadata(md) if md else None
            if not (mal_id or al_id):
                return None
            person_result = self._person_updater.update_for_manga(
                mal_id, al_manga_id=al_id)
            print(f"  Persons: chars={person_result['characters'].get('updated')} "
                  f"staff={person_result['staff'].get('updated')}")
            return person_result
        except Exception as exc:
            # Person sync is best effort; never lose the chapter results.
            print(f"  Persons: ERROR {exc}")
            return {"error": str(exc)}

    # ------------------------------------------------------------------
    # Internal: chapter
    # ------------------------------------------------------------------
    def _process_chapter(self,
                         builder: "ComicInfoBuilder",
                         chapter_num: str,
                         chapter_dir: Path,
                         dest_series: Path) -> dict:
        """
        Generates ComicInfo.xml for one chapter, packs it to CBZ, and
        optionally removes the source folder.

        The cover image is saved as "000.<ext>" so it sorts before the
        numbered story pages in the archive (ensuring Image=0 in the
        <Pages> element correctly points to the front cover).

        Returns {"chapter", "cbz", "ok": True} on success, or the same keys
        with "ok": False plus "error" when any step raised.
        """
        cbz_path = dest_series / f"{chapter_dir.name}.cbz"
        try:
            builder.chapter = chapter_num
            builder.add_pages_from_folder(chapter_dir, cover_filename="000")
            builder.save_xml(chapter_dir)
            _pack_to_cbz(chapter_dir, cbz_path)
            if self._delete_source:
                shutil.rmtree(chapter_dir)
            return {"chapter": chapter_num, "cbz": str(cbz_path), "ok": True}
        except Exception as exc:
            return {"chapter": chapter_num, "cbz": str(cbz_path),
                    "ok": False, "error": str(exc)}


# --------------------------------------------------------------------------
# Usage example
# --------------------------------------------------------------------------
if __name__ == "__main__":
    import os

    # Connection settings are read from the environment so that credentials
    # are never committed with the source.  Paths and URL fall back to the
    # example values; without KAVITA_API_KEY person sync is simply disabled.
    # NOTE(review): the API key that used to be hard-coded here should be
    # treated as leaked and rotated in Kavita.
    SUWAYOMI_PATH = os.environ.get(
        "SUWAYOMI_PATH", r"\\192.168.2.2\root\Temp\managdl\mangas")
    KAVITA_PATH   = os.environ.get(
        "KAVITA_PATH", r"\\192.168.2.2\root\ServerData\Kavita\test")
    KAVITA_URL    = os.environ.get("KAVITA_URL", "http://192.168.2.2:5000")
    KAVITA_KEY    = os.environ.get("KAVITA_API_KEY")

    mover = SuwayomiMover(
        SUWAYOMI_PATH,
        KAVITA_PATH,
        kavita_base_url=KAVITA_URL,
        kavita_api_key=KAVITA_KEY,
        delete_source=False
    )

    # Process a single series
    result = mover.process_series("Yofukashi no Uta")
    ok     = sum(1 for c in result["chapters"] if c["ok"])
    failed = sum(1 for c in result["chapters"] if not c["ok"])
    print(f"\nDone: {ok} ok, {failed} failed")
    for c in result["chapters"]:
        if not c["ok"]:
            print(f"  Chapter {c['chapter']}: {c['error']}")

    # Or process everything at once:
    # results = mover.process_all()