""" light_novel_metadata_builder.py =============================== Fetches series-level metadata for a light novel from MangaBaka, enriches it with MyAnimeList / AniList tracker statistics and character data, and returns a structured dict ready to be diffed against Kavita's SeriesMetadataDto. Differences vs. the manga project's ComicInfoBuilder: - No chapter / page handling — Kavita reads volumes from the files. - No XML output — produces a plain dict. - No MangaDex resolver — light novels don't have a chapter→volume mapping problem. - MangaBaka search type is fixed to ``novel`` so only light/web novels are returned. """ from __future__ import annotations import re import requests from MangaBakaRateLimit import apply_to_session as _apply_mangabaka_rate_limit from MALResolver import MALResolver from AniListResolver import AniListResolver from MatchesCache import MatchesCache from TextUtils import paragraphs_to_html, person_name_with_id # MangaBaka series type for the search endpoint. _SEARCH_TYPES = ["novel"] # MangaBaka content_rating -> Kavita AgeRating enum # Kavita AgeRating values (from openapi.json): # 0=Unknown, 3=Everyone, 8=Teen, 10=Mature17Plus, 13=AdultsOnly _AGE_RATING_MAP = { "safe": 3, # Everyone "suggestive": 8, # Teen "erotica": 10, # Mature17Plus "pornographic": 13, # AdultsOnly } # MangaBaka status -> Kavita PublicationStatus enum # Kavita PublicationStatus (from openapi.json): # 0=OnGoing, 1=Hiatus, 2=Completed, 3=Cancelled, 4=Ended _PUB_STATUS_MAP = { "ongoing": 0, "hiatus": 1, "completed": 2, "cancelled": 3, "ended": 4, } # External-tracker URL templates used to enrich the web-links list. _TRACKER_URL_TEMPLATES = { "anilist": "https://anilist.co/manga/{id}", "myanimelist": "https://myanimelist.net/manga/{id}", "mal": "https://myanimelist.net/manga/{id}", "mangaupdates": "https://www.mangaupdates.com/series.html?id={id}", "kitsu": "https://kitsu.app/manga/{id}", "animenewsnetwork": "https://www.animenewsnetwork.com/encyclopedia/manga.php?id={id}", "ann": "https://www.animenewsnetwork.com/encyclopedia/manga.php?id={id}", "animeplanet": "https://www.anime-planet.com/manga/{id}", "shikimori": "https://shikimori.one/mangas/{id}", "bookwalker": "https://bookwalker.jp/{id}", } _MD_ESCAPE_RE = re.compile(r'\\([\\`*_{}\[\]()\#+\-.!|~])') # -------------------------------------------------------------------------- # Helpers # -------------------------------------------------------------------------- def _normalise_key(key) -> str: return re.sub(r"[^a-z0-9]", "", str(key).lower()) def _format_term(value: str) -> str: return str(value).replace("_", " ").strip().title() if value else "" def _md_to_html(text: str) -> str: """Converts the subset of Markdown produced by MangaBaka to compact HTML.""" if not text: return "" text = _MD_ESCAPE_RE.sub(r'\1', text) text = re.sub( r'\[([^\]]+)\]\(([^)]+)\)', lambda m: f'{m.group(1)}', text, ) text = re.sub(r'\*\*(.+?)\*\*', r'\1', text, flags=re.DOTALL) text = re.sub(r'\*(.+?)\*', r'\1', text, flags=re.DOTALL) return paragraphs_to_html(text) def pick_cover_url(cover) -> "str | None": """Selects the best cover URL from a MangaBaka cover object.""" if not cover: return None if isinstance(cover, str): return cover if not isinstance(cover, dict): return None raw = cover.get("raw") if isinstance(raw, dict): url = raw.get("url") if isinstance(url, str) and url: return url elif isinstance(raw, str) and raw: return raw for size_key in ("x350", "x250", "x150"): variant = cover.get(size_key) if isinstance(variant, dict): for density in ("x3", "x2", "x1"): url = variant.get(density) if isinstance(url, str) and url: return url elif isinstance(variant, str) and variant: return variant for val in cover.values(): if isinstance(val, str) and val.startswith("http"): return val if isinstance(val, dict): for sub in val.values(): if isinstance(sub, str) and sub.startswith("http"): return sub return None def pick_thumbnail_url(cover) -> "str | None": """Picks a small cover variant suitable for a UI thumbnail.""" if not cover: return None if isinstance(cover, str): return cover if not isinstance(cover, dict): return None for size_key in ("x150", "x250", "x350"): variant = cover.get(size_key) if isinstance(variant, dict): for density in ("x2", "x1", "x3"): url = variant.get(density) if isinstance(url, str) and url: return url elif isinstance(variant, str) and variant: return variant return pick_cover_url(cover) def _id_from_source(md: dict, *names: str) -> "int | None": target = {_normalise_key(n) for n in names} for raw_key, info in (md.get("source") or {}).items(): if _normalise_key(raw_key) in target and isinstance(info, dict): mid = info.get("id") if mid is not None: try: return int(mid) except (TypeError, ValueError): pass return None # -------------------------------------------------------------------------- # Builder # -------------------------------------------------------------------------- class LightNovelMetadataBuilder: """ Resolves a light-novel series on MangaBaka and produces a structured metadata dict ready to be merged into Kavita. """ def __init__(self, *, api_base_url: str = "https://api.mangabaka.dev/v1", language: str = "en", request_timeout: int = 30, session: "requests.Session | None" = None, mal_resolver: "MALResolver | None" = None, al_resolver: "AniListResolver | None" = None, matches_cache: "MatchesCache | None" = None): self.api_base_url = api_base_url.rstrip("/") self.language = language self.request_timeout = request_timeout self._session = session or requests.Session() self._session.headers.setdefault("User-Agent", "LightNovelMetadataBuilder/1.0") _apply_mangabaka_rate_limit(self._session) self._mal = mal_resolver or MALResolver( request_timeout=request_timeout, search_type="lightnovel") self._al = al_resolver or AniListResolver( request_timeout=request_timeout, media_format="novel") self._matches_cache = matches_cache # ------------------------------------------------------------------ # MangaBaka search / fetch # ------------------------------------------------------------------ def search_series(self, title: str) -> "dict | None": """Returns the top MangaBaka novel hit for `title`, or None.""" if not title or not title.strip(): return None url = f"{self.api_base_url}/series/search" try: resp = self._session.get( url, params={"q": title, "type": _SEARCH_TYPES, "page": 1, "limit": 1}, timeout=self.request_timeout) resp.raise_for_status() except requests.RequestException: return None data = resp.json().get("data") or [] return data[0] if data else None def fetch_series(self, series_id) -> "dict | None": """ Returns the full MangaBaka series dict for the given id, following ``merged_with`` redirects. A seen-set guards against merge cycles. """ if series_id is None or str(series_id).strip() == "": return None seen: set[str] = set() current = series_id while str(current) not in seen: seen.add(str(current)) url = f"{self.api_base_url}/series/{current}" resp = self._session.get(url, timeout=self.request_timeout) resp.raise_for_status() data = resp.json().get("data") if data and data.get("state") == "merged" and data.get("merged_with"): current = data["merged_with"] continue return data return None # ------------------------------------------------------------------ # Resolve title -> MangaBaka series (caches the match) # ------------------------------------------------------------------ def resolve(self, title: str) -> "dict | None": """ Returns the MangaBaka series for `title`. Lookup order: 1. MatchesCache (uses stored mangabakaId, skips the search). 2. Fresh MangaBaka search — top hit. Result is persisted to the cache so it survives a crash. """ if self._matches_cache is not None: cached = self._matches_cache.get(title) if cached and cached.get("mangabakaId"): try: series = self.fetch_series(cached["mangabakaId"]) if series: return series except Exception: pass series = self.search_series(title) if series and self._matches_cache is not None: self._matches_cache.upsert( title, mangabaka_id=series.get("id"), mangabaka_name=series.get("title") or "", image_url=pick_thumbnail_url(series.get("cover")), ) return series # ------------------------------------------------------------------ # Main entry point # ------------------------------------------------------------------ def build(self, *, title: str = "", mangabaka_id=None) -> "dict | None": """ Fetches and enriches metadata for one series, returning the normalised dict described in the module docstring. Pass either `title` (will resolve via cache/search) or `mangabaka_id` (direct fetch). """ if mangabaka_id is not None and str(mangabaka_id).strip(): md = self.fetch_series(mangabaka_id) else: md = self.resolve(title) if not md: return None return self._assemble(md) # ------------------------------------------------------------------ # Internal: assemble the result dict # ------------------------------------------------------------------ def _assemble(self, md: dict) -> dict: mal_id = _id_from_source(md, "myanimelist", "mal") al_id = _id_from_source(md, "anilist") # Fall back to a title-based MAL lookup when the source map does # not carry an id — Jikan is the only tracker that ships staff # data we can use to enrich author / artist person records. if mal_id is None: mal_id = self._mal.find_mal_id(md.get("title") or "") mal_stats = self._mal.get_stats(mal_id) if mal_id else None characters_detailed = self._mal.get_characters_detailed(mal_id) if mal_id else [] if not characters_detailed and al_id: characters_detailed = self._al.get_characters_detailed(al_id) staff_detailed = self._mal.get_staff_detailed(mal_id) if mal_id else [] if not staff_detailed and al_id: staff_detailed = self._al.get_staff_detailed(al_id) # Character names for SeriesMetadata, disambiguated with the # tracker character id ("Rem (MAL 118737)") because Kavita person # records are global and keyed by name only. character_names = [ person_name_with_id(c["name"], mal_id=c.get("mal_id"), al_id=c.get("al_id")) for c in characters_detailed if c.get("name") ] # Writers come from MangaBaka first (authoritative for novels) writers = list(md.get("authors") or []) # Illustrators / artists -> CoverArtists (Kavita has no dedicated # illustrator field, and Pencillers is the wrong semantic for # text-only novels). cover_artists = list(md.get("artists") or []) # Publisher: prefer English licence, else original. When both # exist, the original publisher becomes the imprint. english_pubs = self._publishers_by_type(md, "English") original_pubs = self._publishers_by_type(md, "Original") publishers = english_pubs or original_pubs imprint = original_pubs[0] if english_pubs and original_pubs else None # Release year release_year = None try: if md.get("year") is not None: release_year = int(md["year"]) except (TypeError, ValueError): pass # Score: MangaBaka rating is 0..100 -> Kavita userRating is 0..5 score = None if md.get("rating") is not None: try: score = round(float(md["rating"]) / 20.0, 1) except (TypeError, ValueError): pass # Tags / genres come back as snake_case slugs. genres = [_format_term(g) for g in (md.get("genres") or []) if g] tags = [_format_term(t) for t in (md.get("tags") or []) if t] # Web links web_links = self._collect_web_links(md) # Summary HTML summary = self._build_summary(md, mal_stats) # Cover URL cover_url = pick_cover_url(md.get("cover")) # Title variants all_alt = self._collect_all_alt_titles(md) return { "mangabakaId": str(md.get("id") or ""), "mangabakaTitle": md.get("title") or "", "originalName": md.get("native_title") or "", "localizedName": md.get("romanized_title") or "", "sortName": self._sort_title(md), "altTitles": all_alt, "summary": summary, "genres": genres, "tags": tags, "characters": character_names, "writers": writers, "coverArtists": cover_artists, "publishers": publishers, "imprint": imprint, "releaseYear": release_year, "ageRating": _AGE_RATING_MAP.get(md.get("content_rating"), 0), "publicationStatus": _PUB_STATUS_MAP.get( (md.get("status") or "").lower(), 0), "language": self.language, "webLinks": web_links, "score": score, "coverUrl": cover_url, "malId": mal_id, "anilistId": al_id, "relationships": list(md.get("relationships_v2") or []), "charactersDetailed": characters_detailed, "staffDetailed": staff_detailed, "raw": md, } # ------------------------------------------------------------------ # Internal helpers # ------------------------------------------------------------------ @staticmethod def _publishers_by_type(md: dict, ptype: str) -> list[str]: return [p.get("name") for p in (md.get("publishers") or []) if p.get("type") == ptype and p.get("name")] def _sort_title(self, md: dict) -> str: lang = self.language.lower() alts = self._collect_alt_titles(md) return alts.get(lang) or md.get("title") or "" def _collect_alt_titles(self, md: dict) -> "dict[str, str]": """Returns one best title per language code (en/de/jp/romaji).""" titles = md.get("titles") or md.get("alt_titles") or [] def pick(language_codes: tuple, prefer_trait: "str | None" = None ) -> "str | None": best_score = -1 best_title: "str | None" = None for entry in titles: if not isinstance(entry, dict): continue lang = (entry.get("language") or entry.get("lang") or "").lower() if lang not in language_codes: continue title = entry.get("title") if not title: continue traits = entry.get("traits") or [] score = 0 if prefer_trait and prefer_trait in traits: score += 4 if "official" in traits: score += 2 if entry.get("is_primary"): score += 1 if score > best_score: best_score, best_title = score, title return best_title result: dict[str, str] = {} kanji = pick(("ja",), prefer_trait="native") or md.get("native_title") if kanji: result["jp"] = kanji romaji = pick(("ja-latn", "ja-romaji")) if not romaji: rt = md.get("romanized_title") or "" if rt and all(ord(c) < 128 for c in rt): romaji = rt if romaji: result["romaji"] = romaji en = pick(("en",)) or md.get("title") if en: result["en"] = en de = pick(("de",)) if de: result["de"] = de return result @staticmethod def _collect_all_alt_titles(md: dict) -> "dict[str, list[str]]": _GROUPS = { "en": ("en",), "de": ("de",), "ja": ("ja",), "ja-romaji": ("ja-latn", "ja-romaji"), "ko": ("ko",), "ko-romaji": ("ko-latn", "ko-romaji"), "zh": ("zh", "zh-hk", "zh-tw", "zh-hans", "zh-hant"), "zh-romaji": ("zh-latn",), } lang_to_group = {l: g for g, ls in _GROUPS.items() for l in ls} result: dict[str, list[str]] = {} seen: dict[str, set] = {} for entry in (md.get("titles") or md.get("alt_titles") or []): if not isinstance(entry, dict): continue lang = (entry.get("language") or entry.get("lang") or "").lower() group = lang_to_group.get(lang) if not group: continue title = (entry.get("title") or "").strip() if not title: continue result.setdefault(group, []) seen.setdefault(group, set()) if title not in seen[group]: result[group].append(title) seen[group].add(title) return result def _collect_web_links(self, md: dict) -> list[str]: links: list[str] = [l for l in (md.get("links") or []) if l] for raw_key, info in (md.get("source") or {}).items(): template = _TRACKER_URL_TEMPLATES.get(_normalise_key(raw_key)) if not template or not isinstance(info, dict): continue source_id = info.get("id") if source_id is not None: links.append(template.format(id=source_id)) seen: set[str] = set() unique: list[str] = [] for link in links: if link not in seen: seen.add(link) unique.append(link) return unique def _build_summary(self, md: dict, mal_stats: "dict | None") -> str: """Builds the HTML summary with stats table + description + alt titles.""" _TD = 'style="padding-right:1.5em"' parts: list[str] = [] if mal_stats: url = mal_stats.get("url", "") as_of = mal_stats.get("as_of", "") rows: list[str] = [] for label, key, fmt in ( ("Score", "score", "{}"), ("Ranked", "rank", "#{}"), ("Scored by", "scored_by", "{:,} users"), ("Popularity","popularity", "#{}"), ("Members", "members", "{:,}"), ("Favorites", "favorites", "{:,}"), ): v = mal_stats.get(key) if v is None: continue try: formatted = fmt.format(v) except (TypeError, ValueError): formatted = str(v) rows.append(f"{label}{formatted}") if rows: link = f'MyAnimeList' if url else "MyAnimeList" parts.append(f"

{link} stats as of {as_of}:

" f"{''.join(rows)}
") desc_raw = (md.get("description") or "").strip() if desc_raw: parts.append(_md_to_html(desc_raw)) all_alt = self._collect_all_alt_titles(md) if all_alt: label_map = { "en": "EN", "de": "DE", "ja": "JA", "ja-romaji": "JA Romaji", "ko": "KO", "ko-romaji": "KO Romaji", "zh": "ZH", "zh-romaji": "ZH Romaji", } alt_rows: list[str] = [] for group in ("en", "de", "ja", "ja-romaji", "ko", "ko-romaji", "zh", "zh-romaji"): titles = all_alt.get(group) if not titles: continue cell = "
".join(titles) alt_rows.append( f"{label_map[group]}{cell}") if alt_rows: parts.append(f"{''.join(alt_rows)}
") return "
".join(parts)