# manga-mover-and-metadata-co…/src/manga/MangadexVolumeResolver.py

"""
mangadex_volume_resolver.py
===========================

Resolves chapter numbers to their corresponding volumes (tankobon) using
the public MangaDex API.

Background
----------
The MangaBaka API only provides series-level data. MangaDex, however,
stores a volume attribute per chapter. The endpoint

    GET /manga/{id}/aggregate

returns a chapter overview grouped by volume. This class encapsulates
that lookup so that `ComicInfoBuilder._determine_volume()` stays clean.

All available translations are queried (no language filter on the
aggregate endpoint) so that chapters only published in non-English
languages are still covered.

Chapter estimation
------------------
When a chapter is not present in the MangaDex aggregate at all (e.g.
because it has never been uploaded to MangaDex in any language), the
`estimate_volume_for_chapter()` method infers the most likely volume by
examining the known chapter-to-volume boundaries on both sides of the
target chapter. If MangaBaka page-count data is supplied, the average
page count per chapter is used to estimate where a volume boundary falls
within the gap; otherwise the boundary is estimated from the average
number of known chapters per volume.

Series relations
----------------
`get_series_relations()` returns related manga titles keyed by MangaDex
relationship type ("main_story", "spin_off", "sequel", …).  This is used
by `ComicInfoBuilder` to populate the `<SeriesGroup>` element.

Dependencies
------------
    requests        ->  pip install requests
"""

from __future__ import annotations

import difflib
import math

import requests


def _normalise_chapter(value) -> str:
    """
    Converts a chapter number into a canonical comparison string.

    Examples:  1    -> "1"    |  1.0  -> "1"   |  "01"   -> "1"
               1.5  -> "1.5"  |  "1.50" -> "1.5"
    """
    text = str(value).strip()
    try:
        number = float(text)
    except ValueError:
        return text.lower()
    if number.is_integer():
        return str(int(number))
    return ("%f" % number).rstrip("0").rstrip(".")


class MangaDexVolumeResolver:
    """
    Resolves chapter numbers to their volume numbers via the MangaDex API.

    Chapter maps and series relations are cached per manga ID for the
    lifetime of the instance; `clear_cache()` drops both caches.

    Typical usage
    -------------
        resolver = MangaDexVolumeResolver()
        manga_id = resolver.find_manga_id("Yofukashi no Uta")
        volume   = resolver.volume_for_chapter(manga_id, 1)
    """

    # User-Agent this client identifies itself with.
    _USER_AGENT = "MangaDexVolumeResolver/1.0"

    def __init__(self, *,
                 base_url: str = "https://api.mangadex.org",
                 request_timeout: int = 30,
                 session: "requests.Session | None" = None):
        """
        base_url        : Base URL of the MangaDex API (a trailing "/" is
                          stripped).
        request_timeout : HTTP request timeout in seconds.
        session         : Optional reusable requests.Session. A User-Agent
                          already configured on it is left untouched.
        """
        self.base_url = base_url.rstrip("/")
        self.request_timeout = request_timeout
        if session is None:
            # A fresh requests.Session already carries a default
            # "python-requests/x.y" User-Agent, so setdefault() would never
            # apply ours; assign it explicitly on sessions we own.
            session = requests.Session()
            session.headers["User-Agent"] = self._USER_AGENT
        else:
            # Caller-supplied session: only fill in a missing User-Agent.
            session.headers.setdefault("User-Agent", self._USER_AGENT)
        self._session = session
        # Cache: manga_id -> {chapter_number: volume}
        self._cache: dict[str, dict] = {}
        # Cache: manga_id -> {relation_type: [title, ...]}
        self._relations_cache: dict[str, dict] = {}

    # ----------------------------------------------------------------------
    # Internal helpers
    # ----------------------------------------------------------------------
    @staticmethod
    def _chapter_number(value) -> "float | None":
        """
        Parses a chapter identifier as a finite float.

        Returns None for anything that is not a usable number (e.g. "none",
        "Extra", "nan"), so callers can skip it instead of crashing.
        """
        try:
            number = float(str(value).strip())
        except ValueError:
            return None
        return number if math.isfinite(number) else None

    @staticmethod
    def _keyed_entries(container, field: str):
        """
        Yields (key, entry) pairs from an API container.

        A dict yields its own items. Any other iterable is treated as a
        list of entry dicts, and the key is read from `entry[field]`.
        This tolerates a container arriving as a JSON array instead of a
        JSON object. Empty / missing containers yield nothing.
        """
        if isinstance(container, dict):
            yield from container.items()
            return
        for entry in container or []:
            if isinstance(entry, dict):
                yield entry.get(field), entry

    # ----------------------------------------------------------------------
    # Locate the manga ID
    # ----------------------------------------------------------------------
    def find_manga_id(self, title: str) -> "str | None":
        """
        Searches MangaDex for `title` and returns the best matching manga
        ID, or None if `title` is blank or no result is found.

        The first five search hits are ranked by the highest
        difflib similarity between `title` and any of the hit's main or
        alternative titles (case-insensitive); ties keep the API order.

        Raises requests.HTTPError (via raise_for_status) on HTTP errors.
        """
        query = title.strip() if title else ""
        if not query:
            return None

        resp = self._session.get(
            f"{self.base_url}/manga",
            params={"title": query, "limit": 5,
                    "contentRating[]": ["safe", "suggestive",
                                        "erotica", "pornographic"]},
            timeout=self.request_timeout)
        resp.raise_for_status()
        results = resp.json().get("data") or []
        if not results:
            return None

        wanted = query.lower()

        def score(entry) -> float:
            # `or {}` also covers an explicit null for "attributes"/"title".
            attrs = entry.get("attributes") or {}
            names = [str(v) for v in (attrs.get("title") or {}).values()]
            for alt in (attrs.get("altTitles") or []):
                names.extend(str(v) for v in alt.values())
            return max(
                (difflib.SequenceMatcher(None, wanted, name.lower()).ratio()
                 for name in names),
                default=0.0)

        # max() returns the first best-scoring hit and leaves `results`
        # unsorted, which is all that is needed here.
        return max(results, key=score).get("id")

    # ----------------------------------------------------------------------
    # Main function: retrieve and return volume / chapter data
    # ----------------------------------------------------------------------
    def get_chapter_volume_map(self, manga_id: str, *,
                               use_cache: bool = True) -> dict:
        """
        Retrieves the complete chapter-to-volume mapping for a series.

        No language filter is sent, so the aggregate is not restricted to
        a particular translation.

        Returns: dict  { chapter_number (str) : volume (str) or None }
        Example:       { "1": "1", "2": "1", "11": "2", "57": None }

        Chapters without a volume assignment are mapped to None. If the
        same chapter is listed both with and without a volume, the known
        volume is kept.

        With use_cache=True the cached dict object itself is returned and
        a fresh result is stored; with use_cache=False the cache is neither
        read nor written. An empty manga_id yields {}.

        Raises requests.HTTPError (via raise_for_status) on HTTP errors.
        """
        if not manga_id:
            return {}
        if use_cache and manga_id in self._cache:
            return self._cache[manga_id]

        # No language filter: query all available translations so that every
        # chapter appears in the aggregate, regardless of translation status.
        resp = self._session.get(
            f"{self.base_url}/manga/{manga_id}/aggregate",
            timeout=self.request_timeout)
        resp.raise_for_status()
        volumes = resp.json().get("volumes") or {}

        chapter_map: dict[str, "str | None"] = {}
        for volume_key, volume_data in self._keyed_entries(volumes, "volume"):
            if str(volume_key).lower() in ("none", ""):
                volume_value = None
            else:
                volume_value = str(volume_data.get("volume") or volume_key)

            chapters = volume_data.get("chapters")
            for chapter_key, _ in self._keyed_entries(chapters, "chapter"):
                key = _normalise_chapter(chapter_key)
                # Never let an unassigned duplicate erase a known volume.
                if volume_value is None and chapter_map.get(key) is not None:
                    continue
                chapter_map[key] = volume_value

        if use_cache:
            self._cache[manga_id] = chapter_map
        return chapter_map

    # ----------------------------------------------------------------------
    # Convenience: look up the volume for a single chapter number
    # ----------------------------------------------------------------------
    def volume_for_chapter(self, manga_id: str, chapter,
                           *, use_cache: bool = True,
                           volume_page_counts: "dict | None" = None) -> "str | None":
        """
        Returns the volume for the given chapter number.

        Falls back to `estimate_volume_for_chapter` when the chapter map is
        non-empty but holds no volume for the chapter (chapter missing, or
        present without a volume assignment).

        volume_page_counts : optional {volume_str: page_count} dict from
                             MangaBakaWorksResolver.get_page_counts().
                             Improves estimation accuracy when provided.
        """
        chapter_map = self.get_chapter_volume_map(manga_id, use_cache=use_cache)
        result = chapter_map.get(_normalise_chapter(chapter))
        if result is None and chapter_map:
            result = self.estimate_volume_for_chapter(
                chapter_map, chapter, volume_page_counts)
        return result

    # ----------------------------------------------------------------------
    # Chapter estimation for unmapped chapters
    # ----------------------------------------------------------------------
    def estimate_volume_for_chapter(self, chapter_map: dict, chapter,
                                    volume_page_counts: "dict | None" = None,
                                    ) -> "str | None":
        """
        Estimates the volume for a chapter that is absent from chapter_map.

        Algorithm
        ---------
        1. Sort all numeric chapters that have a known volume assignment;
           non-numeric keys (e.g. "none") are ignored.
        2. Find the nearest mapped chapters before and after the target.
           If the target lies before the first / after the last known
           chapter, that chapter's volume is returned.
        3. If both neighbors belong to the same volume -> return that volume.
        4. If they differ (volume boundary somewhere in the gap):
           a. If page-count data is provided for the left volume, estimate
              how many chapters it should hold from the average pages per
              known chapter, and place the boundary after the chapters
              still "missing" from it.
           b. Otherwise place the boundary using the average number of
              known chapters per volume (at least one chapter past the
              left neighbor).
           Targets up to and including the boundary go to the left volume.

        Returns None if `chapter` is not numeric or no chapter in
        chapter_map has both a numeric key and a known volume.
        """
        # Parsed directly rather than through the canonical string form:
        # only the numeric value matters for ordering.
        target = self._chapter_number(chapter)
        if target is None:
            return None

        known = []
        for key, vol in chapter_map.items():
            number = self._chapter_number(key)
            if vol is not None and number is not None:
                known.append((number, vol))
        if not known:
            return None
        known.sort(key=lambda pair: pair[0])

        # Insertion point: first index where known[i][0] > target
        pos = next((i for i, (c, _) in enumerate(known) if c > target),
                   len(known))

        if pos == 0:
            return known[0][1]
        if pos == len(known):
            return known[-1][1]

        ch_left, vol_left = known[pos - 1]
        _, vol_right = known[pos]

        if vol_left == vol_right:
            return vol_left

        # Volume boundary lies somewhere in (ch_left, ch_right)
        vol_left_chapters = [c for c, v in known if v == vol_left]

        if volume_page_counts:
            # Average pages per known chapter; missing (None / 0) page
            # counts are skipped instead of breaking the sum.
            total_pages = sum(p for p in volume_page_counts.values() if p)
            avg_pages = total_pages / len(known)

            left_vol_pages = volume_page_counts.get(vol_left)
            if left_vol_pages and avg_pages > 0:
                expected_chaps = max(len(vol_left_chapters),
                                     round(left_vol_pages / avg_pages))
                remaining_slots = expected_chaps - len(vol_left_chapters)
                boundary = max(vol_left_chapters) + max(0, remaining_slots)
                return vol_left if target <= boundary else vol_right

        # Fallback: use average volume size to estimate the boundary.
        vol_sizes: dict[str, int] = {}
        for _, vol in known:
            if vol:
                vol_sizes[vol] = vol_sizes.get(vol, 0) + 1
        avg_size = sum(vol_sizes.values()) / len(vol_sizes) if vol_sizes else 10.0
        boundary = ch_left + max(1.0, avg_size - len(vol_left_chapters))
        return vol_left if target <= boundary else vol_right

    # ----------------------------------------------------------------------
    # Related series (for SeriesGroup)
    # ----------------------------------------------------------------------
    def get_series_relations(self, manga_id: str) -> "dict[str, list[str]]":
        """
        Returns related manga titles grouped by relationship type.

        Example return value:
            {"main_story": ["Call of the Night"], "spin_off": ["Side Story A"]}

        The MangaDex `?includes[]=manga` parameter is used to embed
        related manga attributes so their titles are available without
        additional requests. Relations without embedded attributes or
        without any title are skipped.

        This lookup is best-effort: a failed request or an undecodable
        response yields {} and is not cached, so a later call retries.
        Successful results are cached per manga ID.
        """
        if not manga_id:
            return {}

        if manga_id in self._relations_cache:
            return self._relations_cache[manga_id]

        try:
            resp = self._session.get(
                f"{self.base_url}/manga/{manga_id}",
                params={"includes[]": "manga"},
                timeout=self.request_timeout,
            )
            resp.raise_for_status()
            data = resp.json().get("data") or {}
        except (requests.RequestException, ValueError):
            # ValueError covers JSON decoding failures that are not raised
            # as a RequestException.
            return {}

        relations: dict[str, list[str]] = {}
        for rel in (data.get("relationships") or []):
            if rel.get("type") != "manga":
                continue
            rel_type = rel.get("related")
            if not rel_type:
                continue
            attrs = rel.get("attributes") or {}
            titles: dict = attrs.get("title") or {}
            # Prefer English, then romanized Japanese, then any available
            title = (titles.get("en")
                     or titles.get("ja-ro")
                     or next(iter(titles.values()), None))
            if title:
                relations.setdefault(rel_type, []).append(title)

        self._relations_cache[manga_id] = relations
        return relations

    # ----------------------------------------------------------------------
    def clear_cache(self) -> None:
        """Clears all internal caches."""
        self._cache.clear()
        self._relations_cache.clear()


# --------------------------------------------------------------------------
# Usage example
# --------------------------------------------------------------------------
if __name__ == "__main__":
    # Small live demo against the public API (requires network access).
    resolver = MangaDexVolumeResolver()

    mid = resolver.find_manga_id("Yofukashi no Uta")
    print("MangaDex ID  :", mid)

    if mid:
        # Keep the queried chapter in one place so the printed label always
        # matches the lookup that is actually performed.
        example_chapter = 66
        print(f"Volume for ch. {example_chapter} :",
              resolver.volume_for_chapter(mid, example_chapter))
        print("Full chapter map  :", resolver.get_chapter_volume_map(mid))
        print("Relations         :", resolver.get_series_relations(mid))