merged ln metadata into manga mover

2026-06-14 10:47:47 +02:00
parent 8a44b85a48
commit 216771f709
27 changed files with 3040 additions and 280 deletions
@@ -0,0 +1,344 @@
+"""
+mangadex_volume_resolver.py
+===========================
+
+Resolves chapter numbers to their corresponding volumes (tankobon) using
+the public MangaDex API.
+
+Background
+----------
+The MangaBaka API only provides series-level data. MangaDex, however,
+stores a volume attribute per chapter. The endpoint
+
+    GET /manga/{id}/aggregate
+
+returns a chapter overview grouped by volume. This class encapsulates
+that lookup so that `ComicInfoBuilder._determine_volume()` stays clean.
+
+All available translations are queried (no language filter on the
+aggregate endpoint) so that chapters only published in non-English
+languages are still covered.
+
+Chapter estimation
+------------------
+When a chapter is not present in the MangaDex aggregate at all (e.g.
+because it has never been uploaded to MangaDex in any language), the
+`estimate_volume_for_chapter()` method infers the most likely volume by
+examining the known chapter-to-volume boundaries on both sides of the
+target chapter. If MangaBaka page-count data is supplied, the average
+page count per chapter is used to estimate where the volume boundary
+falls within the gap; otherwise the boundary is estimated from the
+average number of known chapters per volume.
+
+Series relations
+----------------
+`get_series_relations()` returns related manga titles keyed by MangaDex
+relationship type ("main_story", "spin_off", "sequel", …).  This is used
+by `ComicInfoBuilder` to populate the `<SeriesGroup>` element.
+
+Dependencies
+------------
+    requests        ->  pip install requests
+"""
+
+from __future__ import annotations
+
+import difflib
+
+import requests
+
+
+def _normalise_chapter(value) -> str:
+    """
+    Return the canonical string form of a chapter number for comparisons.
+
+    Numeric input loses leading zeros and trailing fractional zeros; input
+    that float() cannot parse is returned stripped and lower-cased.
+
+    Examples:  1    -> "1"    |  1.0  -> "1"   |  "01"   -> "1"
+               1.5  -> "1.5"  |  "1.50" -> "1.5"
+    """
+    raw = str(value).strip()
+    try:
+        parsed = float(raw)
+    except ValueError:
+        # Non-numeric labels are only case-folded.
+        return raw.lower()
+
+    if not parsed.is_integer():
+        # Fixed-point rendering (six decimals), then trim the zero padding
+        # and a dangling decimal point.
+        fixed = format(parsed, "f")
+        return fixed.rstrip("0").rstrip(".")
+    return str(int(parsed))
+
+
+class MangaDexVolumeResolver:
+    """
+    Resolves chapter numbers to their volume numbers via the MangaDex API.
+
+    Typical usage
+    -------------
+        resolver = MangaDexVolumeResolver()
+        manga_id = resolver.find_manga_id("Yofukashi no Uta")
+        volume   = resolver.volume_for_chapter(manga_id, 1)
+
+    Error handling
+    --------------
+    HTTP errors raised inside `find_manga_id()` and
+    `get_chapter_volume_map()` propagate to the caller.
+    `get_series_relations()` is best-effort and returns an empty dict when
+    the request fails.
+    """
+
+    # User-Agent sent with every request unless the caller configured one.
+    _USER_AGENT = "MangaDexVolumeResolver/1.0"
+    # Prefix of the User-Agent that `requests` installs on every new Session.
+    _LIBRARY_UA_PREFIX = "python-requests/"
+
+    def __init__(self, *,
+                 base_url: str = "https://api.mangadex.org",
+                 request_timeout: int = 30,
+                 session: "requests.Session | None" = None):
+        """
+        base_url        : Base URL of the MangaDex API.
+        request_timeout : HTTP request timeout in seconds.
+        session         : Optional reusable requests.Session. Its
+                          User-Agent header is replaced only when it is
+                          missing or still the `requests` library default.
+        """
+        self.base_url = base_url.rstrip("/")
+        self.request_timeout = request_timeout
+        self._session = requests.Session() if session is None else session
+
+        # A fresh requests.Session already carries a "python-requests/x.y"
+        # User-Agent, so headers.setdefault() would never install ours.
+        # Replace the header when it is absent or still that default, but
+        # keep a User-Agent the caller chose deliberately.
+        current_agent = self._session.headers.get("User-Agent")
+        if (not current_agent
+                or str(current_agent).startswith(self._LIBRARY_UA_PREFIX)):
+            self._session.headers["User-Agent"] = self._USER_AGENT
+
+        # Cache: manga_id -> {chapter_number: volume}
+        self._cache: dict[str, dict] = {}
+        # Cache: manga_id -> {relation_type: [title, ...]}
+        self._relations_cache: dict[str, dict] = {}
+
+    # ----------------------------------------------------------------------
+    # Locate the manga ID
+    # ----------------------------------------------------------------------
+    def find_manga_id(self, title: str) -> "str | None":
+        """
+        Searches MangaDex for `title` and returns the best matching manga
+        ID, or None if no result is found.
+
+        The first five search results are ranked by the highest
+        difflib similarity between `title` and any of their main or
+        alternative titles (case-insensitive). On ties the result that
+        MangaDex listed first wins.
+
+        Raises whatever `raise_for_status()` / the session raise on HTTP or
+        connection errors.
+        """
+        if not title or not title.strip():
+            return None
+
+        resp = self._session.get(
+            f"{self.base_url}/manga",
+            params={"title": title, "limit": 5,
+                    "contentRating[]": ["safe", "suggestive",
+                                        "erotica", "pornographic"]},
+            timeout=self.request_timeout)
+        resp.raise_for_status()
+        results = resp.json().get("data") or []
+        if not results:
+            return None
+
+        wanted = title.lower()
+
+        def score(entry) -> float:
+            # Best similarity over every localized main / alternative title.
+            attrs = entry.get("attributes") or {}
+            names = [str(v) for v in (attrs.get("title") or {}).values()]
+            for alt in (attrs.get("altTitles") or []):
+                names.extend(str(v) for v in alt.values())
+            return max(
+                (difflib.SequenceMatcher(None, wanted, name.lower()).ratio()
+                 for name in names),
+                default=0.0)
+
+        # max() returns the first maximal element, matching a stable
+        # descending sort followed by taking index 0.
+        return max(results, key=score).get("id")
+
+    # ----------------------------------------------------------------------
+    # Main function: retrieve and return volume / chapter data
+    # ----------------------------------------------------------------------
+    def get_chapter_volume_map(self, manga_id: str, *,
+                               use_cache: bool = True) -> dict:
+        """
+        Retrieves the complete chapter-to-volume mapping for a series.
+
+        All available languages are queried so that chapters only published
+        in non-English translations are still included.
+
+        use_cache : when True, a previously fetched map is returned without
+                    a request and a freshly fetched map is stored. When
+                    False, the cache is neither read nor written.
+
+        Returns: dict  { chapter_number (str) : volume (str) or None }
+        Example:       { "1": "1", "2": "1", "11": "2", "57": None }
+
+        Chapters without a volume assignment are mapped to None. Chapter
+        keys are passed through `_normalise_chapter`, so non-numeric keys
+        appear lower-cased. An empty dict is returned for a falsy manga_id.
+
+        The cached dict itself is returned (not a copy).
+        """
+        if not manga_id:
+            return {}
+        if use_cache and manga_id in self._cache:
+            return self._cache[manga_id]
+
+        # No language filter: query all available translations so that every
+        # chapter appears in the aggregate, regardless of translation status.
+        resp = self._session.get(
+            f"{self.base_url}/manga/{manga_id}/aggregate",
+            timeout=self.request_timeout)
+        resp.raise_for_status()
+        volumes = resp.json().get("volumes") or {}
+
+        chapter_map: dict[str, "str | None"] = {}
+        for volume_key, volume_data in volumes.items():
+            # The "none" / empty key groups chapters with no volume.
+            if str(volume_key).lower() in ("none", ""):
+                volume_value = None
+            else:
+                volume_value = str(volume_data.get("volume") or volume_key)
+
+            for chapter_key in (volume_data.get("chapters") or {}):
+                chapter_map[_normalise_chapter(chapter_key)] = volume_value
+
+        if use_cache:
+            self._cache[manga_id] = chapter_map
+        return chapter_map
+
+    # ----------------------------------------------------------------------
+    # Convenience: look up the volume for a single chapter number
+    # ----------------------------------------------------------------------
+    def volume_for_chapter(self, manga_id: str, chapter,
+                           *, use_cache: bool = True,
+                           volume_page_counts: "dict | None" = None,
+                           ) -> "str | None":
+        """
+        Returns the volume for the given chapter number.
+
+        Falls back to `estimate_volume_for_chapter` when the chapter is
+        missing from the MangaDex aggregate or is listed there without a
+        volume, provided the aggregate is not empty.
+
+        volume_page_counts : optional {volume_str: page_count} dict from
+                             MangaBakaWorksResolver.get_page_counts().
+                             Improves estimation accuracy when provided.
+        """
+        chapter_map = self.get_chapter_volume_map(manga_id, use_cache=use_cache)
+        result = chapter_map.get(_normalise_chapter(chapter))
+        if result is None and chapter_map:
+            result = self.estimate_volume_for_chapter(
+                chapter_map, chapter, volume_page_counts)
+        return result
+
+    # ----------------------------------------------------------------------
+    # Chapter estimation for unmapped chapters
+    # ----------------------------------------------------------------------
+    @staticmethod
+    def _chapter_number(value) -> "float | None":
+        """Parses a chapter label as a finite float; None if not numeric."""
+        try:
+            number = float(str(value).strip())
+        except ValueError:
+            return None
+        # NaN and infinities parse successfully but cannot be ordered
+        # meaningfully against real chapter numbers.
+        if number != number or number in (float("inf"), float("-inf")):
+            return None
+        return number
+
+    def estimate_volume_for_chapter(self, chapter_map: dict, chapter,
+                                    volume_page_counts: "dict | None" = None,
+                                    ) -> "str | None":
+        """
+        Estimates the volume for a chapter that is absent from chapter_map.
+
+        Algorithm
+        ---------
+        1. Sort all numeric chapters that have a known volume assignment.
+           Non-numeric keys (e.g. "none", "extra") are ignored.
+        2. Find the nearest mapped chapters before and after the target.
+           A target before the first / after the last known chapter gets
+           the first / last known volume.
+        3. If both neighbors belong to the same volume -> return that volume.
+        4. If they differ (volume boundary somewhere in the gap):
+           a. If page-count data is provided for the left volume, estimate
+              how many chapters that volume should hold from the average
+              pages per known chapter, and place the boundary after the
+              chapters still missing from it.
+           b. Otherwise place the boundary using the average number of
+              known chapters per volume, always leaving at least one
+              chapter after the left neighbor to the left volume.
+
+        Returns None if no suitable estimate can be made, i.e. the target
+        is not numeric or no numeric chapter has a volume.
+        """
+        target = self._chapter_number(chapter)
+        if target is None:
+            return None
+
+        known = []
+        for key, vol in chapter_map.items():
+            number = self._chapter_number(key)
+            if vol is not None and number is not None:
+                known.append((number, vol))
+        if not known:
+            return None
+        known.sort(key=lambda item: item[0])
+
+        # Insertion point: first index where known[i][0] > target
+        pos = next((i for i, (c, _) in enumerate(known) if c > target),
+                   len(known))
+
+        if pos == 0:
+            return known[0][1]
+        if pos == len(known):
+            return known[-1][1]
+
+        ch_left, vol_left = known[pos - 1]
+        _, vol_right = known[pos]
+
+        if vol_left == vol_right:
+            return vol_left
+
+        # Volume boundary lies somewhere in (ch_left, ch_right)
+        vol_left_chapters = [c for c, v in known if v == vol_left]
+
+        if volume_page_counts:
+            # Average pages per chapter: all supplied pages spread over all
+            # known chapters. Missing / zero page counts are skipped.
+            total_pages = sum(p for p in volume_page_counts.values() if p)
+            avg_pages = total_pages / len(known)
+
+            left_vol_pages = volume_page_counts.get(vol_left)
+            if left_vol_pages and avg_pages > 0:
+                expected_chaps = max(len(vol_left_chapters),
+                                     round(left_vol_pages / avg_pages))
+                remaining_slots = expected_chaps - len(vol_left_chapters)
+                boundary = max(vol_left_chapters) + max(0, remaining_slots)
+                return vol_left if target <= boundary else vol_right
+
+        # Fallback: use average volume size to estimate the boundary.
+        vol_sizes: dict[str, int] = {}
+        for _, v in known:
+            if v:
+                vol_sizes[v] = vol_sizes.get(v, 0) + 1
+        avg_size = sum(vol_sizes.values()) / len(vol_sizes) if vol_sizes else 10.0
+        boundary = ch_left + max(1.0, avg_size - len(vol_left_chapters))
+        return vol_left if target <= boundary else vol_right
+
+    # ----------------------------------------------------------------------
+    # Related series (for SeriesGroup)
+    # ----------------------------------------------------------------------
+    def get_series_relations(self, manga_id: str) -> "dict[str, list[str]]":
+        """
+        Returns related manga titles grouped by relationship type.
+
+        Example return value:
+            {"main_story": ["Call of the Night"], "spin_off": ["Side Story A"]}
+
+        The MangaDex `?includes[]=manga` parameter is used to embed
+        related manga attributes so their titles are available without
+        additional requests.
+
+        Request failures (requests.RequestException) yield an empty dict
+        that is not cached, so a later call retries. Successful results
+        are cached per manga_id.
+        """
+        if not manga_id:
+            return {}
+
+        if manga_id in self._relations_cache:
+            return self._relations_cache[manga_id]
+
+        try:
+            resp = self._session.get(
+                f"{self.base_url}/manga/{manga_id}",
+                params={"includes[]": "manga"},
+                timeout=self.request_timeout,
+            )
+            resp.raise_for_status()
+            data = resp.json().get("data") or {}
+        except requests.RequestException:
+            return {}
+
+        relations: dict[str, list[str]] = {}
+        for rel in (data.get("relationships") or []):
+            # Only manga-to-manga relationships that carry a relation type
+            # and embedded attributes are usable.
+            if rel.get("type") != "manga":
+                continue
+            rel_type = rel.get("related")
+            if not rel_type:
+                continue
+            attrs = rel.get("attributes") or {}
+            if not attrs:
+                continue
+            titles: dict = attrs.get("title") or {}
+            # Prefer English, then romanized Japanese, then any available
+            title = (titles.get("en")
+                     or titles.get("ja-ro")
+                     or next(iter(titles.values()), None))
+            if title:
+                relations.setdefault(rel_type, []).append(title)
+
+        self._relations_cache[manga_id] = relations
+        return relations
+
+    # ----------------------------------------------------------------------
+    def clear_cache(self) -> None:
+        """Clears all internal caches."""
+        self._cache.clear()
+        self._relations_cache.clear()
+
+
+# --------------------------------------------------------------------------
+# Usage example
+# --------------------------------------------------------------------------
+if __name__ == "__main__":
+    # Demo run against the live MangaDex API (requires network access).
+    resolver = MangaDexVolumeResolver()
+    # Single source for both the label and the lookup so they cannot drift.
+    demo_chapter = 66
+
+    mid = resolver.find_manga_id("Yofukashi no Uta")
+    print("MangaDex ID  :", mid)
+
+    if mid:
+        print(f"Volume for ch. {demo_chapter} :",
+              resolver.volume_for_chapter(mid, demo_chapter))
+        print("Full chapter map  :", resolver.get_chapter_volume_map(mid))
+        print("Relations         :", resolver.get_series_relations(mid))