"""
light_novel_metadata_builder.py
===============================

Fetches series-level metadata for a light novel from MangaBaka, enriches
it with MyAnimeList / AniList tracker statistics and character data, and
returns a structured dict ready to be diffed against Kavita's
SeriesMetadataDto.

Differences vs. the manga project's ComicInfoBuilder:
  - No chapter / page handling — Kavita reads volumes from the files.
  - No XML output — produces a plain dict.
  - No MangaDex resolver — light novels don't have a chapter→volume
    mapping problem.
  - MangaBaka search type is fixed to ``novel`` so only light/web novels
    are returned.
"""
|
|
|
|
from __future__ import annotations

import html
import re

import requests

from MangaBakaRateLimit import apply_to_session as _apply_mangabaka_rate_limit
from MALResolver import MALResolver
from AniListResolver import AniListResolver
from MatchesCache import MatchesCache
from TextUtils import paragraphs_to_html, person_name_with_id
|
|
|
|
|
|
# MangaBaka series type for the search endpoint.
|
|
_SEARCH_TYPES = ["novel"]
|
|
|
|
# MangaBaka content_rating -> Kavita AgeRating enum
|
|
# Kavita AgeRating values (from openapi.json):
|
|
# 0=Unknown, 3=Everyone, 8=Teen, 10=Mature17Plus, 13=AdultsOnly
|
|
_AGE_RATING_MAP = {
|
|
"safe": 3, # Everyone
|
|
"suggestive": 8, # Teen
|
|
"erotica": 10, # Mature17Plus
|
|
"pornographic": 13, # AdultsOnly
|
|
}
|
|
|
|
# MangaBaka status -> Kavita PublicationStatus enum
|
|
# Kavita PublicationStatus (from openapi.json):
|
|
# 0=OnGoing, 1=Hiatus, 2=Completed, 3=Cancelled, 4=Ended
|
|
_PUB_STATUS_MAP = {
|
|
"ongoing": 0,
|
|
"hiatus": 1,
|
|
"completed": 2,
|
|
"cancelled": 3,
|
|
"ended": 4,
|
|
}
|
|
|
|
# External-tracker URL templates used to enrich the web-links list.
|
|
_TRACKER_URL_TEMPLATES = {
|
|
"anilist": "https://anilist.co/manga/{id}",
|
|
"myanimelist": "https://myanimelist.net/manga/{id}",
|
|
"mal": "https://myanimelist.net/manga/{id}",
|
|
"mangaupdates": "https://www.mangaupdates.com/series.html?id={id}",
|
|
"kitsu": "https://kitsu.app/manga/{id}",
|
|
"animenewsnetwork": "https://www.animenewsnetwork.com/encyclopedia/manga.php?id={id}",
|
|
"ann": "https://www.animenewsnetwork.com/encyclopedia/manga.php?id={id}",
|
|
"animeplanet": "https://www.anime-planet.com/manga/{id}",
|
|
"shikimori": "https://shikimori.one/mangas/{id}",
|
|
"bookwalker": "https://bookwalker.jp/{id}",
|
|
}
|
|
|
|
_MD_ESCAPE_RE = re.compile(r'\\([\\`*_{}\[\]()\#+\-.!|~])')
|
|
|
|
|
|
# --------------------------------------------------------------------------
# Helpers
# --------------------------------------------------------------------------
|
|
def _normalise_key(key) -> str:
|
|
return re.sub(r"[^a-z0-9]", "", str(key).lower())
|
|
|
|
|
|
def _format_term(value: str) -> str:
|
|
return str(value).replace("_", " ").strip().title() if value else ""
|
|
|
|
|
|
def _md_to_html(text: str) -> str:
|
|
"""Converts the subset of Markdown produced by MangaBaka to compact HTML."""
|
|
if not text:
|
|
return ""
|
|
text = _MD_ESCAPE_RE.sub(r'\1', text)
|
|
text = re.sub(
|
|
r'\[([^\]]+)\]\(([^)]+)\)',
|
|
lambda m: f'<a href="{m.group(2)}">{m.group(1)}</a>',
|
|
text,
|
|
)
|
|
text = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', text, flags=re.DOTALL)
|
|
text = re.sub(r'\*(.+?)\*', r'<em>\1</em>', text, flags=re.DOTALL)
|
|
return paragraphs_to_html(text)
|
|
|
|
|
|
def pick_cover_url(cover) -> "str | None":
    """Selects the best cover URL from a MangaBaka cover object.

    `cover` may be falsy (-> None), a plain string (returned as is) or a
    dict; any other type yields None. For a dict the lookup order is:

      1. ``raw`` — either ``raw["url"]`` or ``raw`` itself when a string.
      2. The sized variants ``x350``, ``x250``, ``x150``; a dict variant
         is probed for densities ``x3``, ``x2``, ``x1``.
      3. Any value (or value one level down) that starts with "http".
    """
    if not cover:
        return None
    if isinstance(cover, str):
        return cover
    if not isinstance(cover, dict):
        return None

    def _usable(value) -> bool:
        # A candidate counts only when it is a non-empty string.
        return isinstance(value, str) and bool(value)

    # 1. Full-size original.
    original = cover.get("raw")
    if isinstance(original, dict):
        if _usable(original.get("url")):
            return original["url"]
    elif _usable(original):
        return original

    # 2. Pre-scaled variants, largest size and highest density first.
    for size in ("x350", "x250", "x150"):
        rendition = cover.get(size)
        if isinstance(rendition, dict):
            hit = next(
                (rendition[d] for d in ("x3", "x2", "x1")
                 if _usable(rendition.get(d))),
                None,
            )
            if hit is not None:
                return hit
        elif _usable(rendition):
            return rendition

    # 3. Last resort: the first thing that looks like a URL.
    for value in cover.values():
        if isinstance(value, dict):
            nested = next(
                (v for v in value.values()
                 if isinstance(v, str) and v.startswith("http")),
                None,
            )
            if nested is not None:
                return nested
        elif isinstance(value, str) and value.startswith("http"):
            return value
    return None
|
|
|
|
|
|
def pick_thumbnail_url(cover) -> "str | None":
    """Picks a small cover variant suitable for a UI thumbnail.

    Falsy input and non-str / non-dict input yield None; a plain string
    is returned unchanged. For a dict the sized variants are tried
    smallest first (``x150``, ``x250``, ``x350``), probing densities
    ``x2``, ``x1``, ``x3`` inside a dict variant. When none of them holds
    a non-empty string the choice is delegated to ``pick_cover_url``.
    """
    if not cover:
        return None
    if isinstance(cover, str):
        return cover
    if not isinstance(cover, dict):
        return None

    for size in ("x150", "x250", "x350"):
        rendition = cover.get(size)
        if isinstance(rendition, str):
            if rendition:
                return rendition
            continue
        if not isinstance(rendition, dict):
            continue
        for density in ("x2", "x1", "x3"):
            candidate = rendition.get(density)
            if isinstance(candidate, str) and candidate:
                return candidate

    # No usable sized variant: fall back to the general cover selection.
    return pick_cover_url(cover)
|
|
|
|
|
|
def _id_from_source(md: dict, *names: str) -> "int | None":
|
|
target = {_normalise_key(n) for n in names}
|
|
for raw_key, info in (md.get("source") or {}).items():
|
|
if _normalise_key(raw_key) in target and isinstance(info, dict):
|
|
mid = info.get("id")
|
|
if mid is not None:
|
|
try:
|
|
return int(mid)
|
|
except (TypeError, ValueError):
|
|
pass
|
|
return None
|
|
|
|
|
|
# --------------------------------------------------------------------------
# Builder
# --------------------------------------------------------------------------
|
|
class LightNovelMetadataBuilder:
    """
    Resolves a light-novel series on MangaBaka and produces a structured
    metadata dict ready to be merged into Kavita.

    Entry point is ``build(title=...)`` or ``build(mangabaka_id=...)``;
    ``search_series`` / ``fetch_series`` / ``resolve`` expose the
    individual lookup steps.
    """

    # ``_collect_alt_titles`` keys its result by the short labels
    # en/de/jp/romaji, while the rest of this class spells Japanese as
    # "ja" / "ja-latn" / "ja-romaji". Translate those codes so a builder
    # configured with language="ja" finds its title instead of silently
    # falling back to the default one.
    _SORT_LANGUAGE_ALIASES = {
        "ja": "jp",
        "ja-latn": "romaji",
        "ja-romaji": "romaji",
    }

    def __init__(self, *,
                 api_base_url: str = "https://api.mangabaka.dev/v1",
                 language: str = "en",
                 request_timeout: int = 30,
                 session: "requests.Session | None" = None,
                 mal_resolver: "MALResolver | None" = None,
                 al_resolver: "AniListResolver | None" = None,
                 matches_cache: "MatchesCache | None" = None):
        """
        Args:
            api_base_url: MangaBaka API root; a trailing "/" is stripped.
            language: Copied into the result's ``language`` field and
                used to choose the sort title.
            request_timeout: Timeout passed to every MangaBaka request
                and to the resolvers created here.
            session: HTTP session to reuse; a new ``requests.Session`` is
                created when omitted.
            mal_resolver: MyAnimeList resolver; defaults to one created
                with ``search_type="lightnovel"``.
            al_resolver: AniList resolver; defaults to one created with
                ``media_format="novel"``.
            matches_cache: Optional title -> MangaBaka id cache used by
                ``resolve``.
        """
        self.api_base_url = api_base_url.rstrip("/")
        self.language = language
        self.request_timeout = request_timeout

        self._session = session or requests.Session()
        # setdefault: keep a User-Agent the caller already configured.
        self._session.headers.setdefault("User-Agent",
                                         "LightNovelMetadataBuilder/1.0")
        # NOTE(review): presumably installs MangaBaka rate limiting on the
        # session — confirm in MangaBakaRateLimit.
        _apply_mangabaka_rate_limit(self._session)

        self._mal = mal_resolver or MALResolver(
            request_timeout=request_timeout, search_type="lightnovel")
        self._al = al_resolver or AniListResolver(
            request_timeout=request_timeout, media_format="novel")
        self._matches_cache = matches_cache

    # ------------------------------------------------------------------
    # MangaBaka search / fetch
    # ------------------------------------------------------------------
    def search_series(self, title: str) -> "dict | None":
        """Returns the top MangaBaka novel hit for `title`, or None.

        None is returned for a blank title, for any request failure
        (``requests.RequestException``), for a body that is not valid
        JSON or not a JSON object, and for an empty result list.
        """
        if not title or not title.strip():
            return None
        url = f"{self.api_base_url}/series/search"
        try:
            resp = self._session.get(
                url,
                params={"q": title, "type": _SEARCH_TYPES,
                        "page": 1, "limit": 1},
                timeout=self.request_timeout)
            resp.raise_for_status()
        except requests.RequestException:
            return None

        # A 200 response can still carry a non-JSON body (proxy error
        # page, truncated reply); that must not escape as an exception
        # from a method whose contract is "hit or None".
        try:
            payload = resp.json()
        except ValueError:
            return None
        hits = payload.get("data") if isinstance(payload, dict) else None
        if not isinstance(hits, list) or not hits:
            return None
        return hits[0]

    def fetch_series(self, series_id) -> "dict | None":
        """
        Returns the full MangaBaka series dict for the given id, following
        ``merged_with`` redirects. A seen-set guards against merge cycles.

        Returns None for a None / blank id and when a merge cycle is
        detected. Errors from the request itself, including those raised
        by ``raise_for_status()``, are not caught here and propagate to
        the caller.
        """
        if series_id is None or str(series_id).strip() == "":
            return None
        seen: set[str] = set()
        current = series_id
        while str(current) not in seen:
            seen.add(str(current))
            url = f"{self.api_base_url}/series/{current}"
            resp = self._session.get(url, timeout=self.request_timeout)
            resp.raise_for_status()
            data = resp.json().get("data")
            if data and data.get("state") == "merged" and data.get("merged_with"):
                # This record was folded into another one; follow it.
                current = data["merged_with"]
                continue
            return data
        # The redirect chain led back to an id that was already visited.
        return None

    # ------------------------------------------------------------------
    # Resolve title -> MangaBaka series (caches the match)
    # ------------------------------------------------------------------
    def resolve(self, title: str) -> "dict | None":
        """
        Returns the MangaBaka series for `title`.

        Lookup order:
          1. MatchesCache (uses stored mangabakaId, skips the search).
          2. Fresh MangaBaka search — top hit. Result is persisted to the
             cache so it survives a crash.
        """
        if self._matches_cache is not None:
            cached = self._matches_cache.get(title)
            if cached and cached.get("mangabakaId"):
                try:
                    series = self.fetch_series(cached["mangabakaId"])
                    if series:
                        return series
                except Exception:
                    # Deliberate best-effort: whatever went wrong with the
                    # cached id, fall through to a fresh search below.
                    pass

        series = self.search_series(title)
        if series and self._matches_cache is not None:
            self._matches_cache.upsert(
                title,
                mangabaka_id=series.get("id"),
                mangabaka_name=series.get("title") or "",
                image_url=pick_thumbnail_url(series.get("cover")),
            )
        return series

    # ------------------------------------------------------------------
    # Main entry point
    # ------------------------------------------------------------------
    def build(self, *, title: str = "",
              mangabaka_id=None) -> "dict | None":
        """
        Fetches and enriches metadata for one series, returning the
        normalised dict produced by ``_assemble``.

        Pass either `title` (will resolve via cache/search) or
        `mangabaka_id` (direct fetch). A non-blank `mangabaka_id` takes
        precedence. Returns None when no series could be found.
        """
        if mangabaka_id is not None and str(mangabaka_id).strip():
            md = self.fetch_series(mangabaka_id)
        else:
            md = self.resolve(title)
        if not md:
            return None
        return self._assemble(md)

    # ------------------------------------------------------------------
    # Internal: assemble the result dict
    # ------------------------------------------------------------------
    @staticmethod
    def _lookup_key(value) -> str:
        """Lower-cases an enum-like API string; non-strings become ""."""
        return value.strip().lower() if isinstance(value, str) else ""

    def _assemble(self, md: dict) -> dict:
        """Turns a MangaBaka series dict into the normalised result dict.

        Tracker statistics, characters and staff are requested from the
        MAL resolver first; characters and staff fall back to the AniList
        resolver when MAL returns nothing and an AniList id is known. The
        untouched MangaBaka payload is included under ``"raw"``.
        """
        mal_id = _id_from_source(md, "myanimelist", "mal")
        al_id = _id_from_source(md, "anilist")

        # Fall back to a title-based MAL lookup when the source map does
        # not carry an id — Jikan is the only tracker that ships staff
        # data we can use to enrich author / artist person records.
        if mal_id is None:
            mal_id = self._mal.find_mal_id(md.get("title") or "")

        mal_stats = self._mal.get_stats(mal_id) if mal_id else None

        characters_detailed = self._mal.get_characters_detailed(mal_id) if mal_id else []
        if not characters_detailed and al_id:
            characters_detailed = self._al.get_characters_detailed(al_id)

        staff_detailed = self._mal.get_staff_detailed(mal_id) if mal_id else []
        if not staff_detailed and al_id:
            staff_detailed = self._al.get_staff_detailed(al_id)

        # Character names for SeriesMetadata, disambiguated with the
        # tracker character id ("Rem (MAL 118737)") because Kavita person
        # records are global and keyed by name only.
        character_names = [
            person_name_with_id(c["name"],
                                mal_id=c.get("mal_id"),
                                al_id=c.get("al_id"))
            for c in characters_detailed if c.get("name")
        ]
        # Writers come from MangaBaka first (authoritative for novels)
        writers = list(md.get("authors") or [])
        # Illustrators / artists -> CoverArtists (Kavita has no dedicated
        # illustrator field, and Pencillers is the wrong semantic for
        # text-only novels).
        cover_artists = list(md.get("artists") or [])

        # Publisher: prefer English licence, else original. When both
        # exist, the original publisher becomes the imprint.
        english_pubs = self._publishers_by_type(md, "English")
        original_pubs = self._publishers_by_type(md, "Original")
        publishers = english_pubs or original_pubs
        imprint = original_pubs[0] if english_pubs and original_pubs else None

        # Release year
        release_year = None
        try:
            if md.get("year") is not None:
                release_year = int(md["year"])
        except (TypeError, ValueError):
            pass

        # Score: MangaBaka rating is 0..100 -> Kavita userRating is 0..5
        score = None
        if md.get("rating") is not None:
            try:
                score = round(float(md["rating"]) / 20.0, 1)
            except (TypeError, ValueError):
                pass

        # Tags / genres come back as snake_case slugs.
        genres = [_format_term(g) for g in (md.get("genres") or []) if g]
        tags = [_format_term(t) for t in (md.get("tags") or []) if t]

        # Web links
        web_links = self._collect_web_links(md)

        # Summary HTML
        summary = self._build_summary(md, mal_stats)

        # Cover URL
        cover_url = pick_cover_url(md.get("cover"))

        # Title variants
        all_alt = self._collect_all_alt_titles(md)

        return {
            "mangabakaId": str(md.get("id") or ""),
            "mangabakaTitle": md.get("title") or "",
            "originalName": md.get("native_title") or "",
            "localizedName": md.get("romanized_title") or "",
            "sortName": self._sort_title(md),
            "altTitles": all_alt,
            "summary": summary,
            "genres": genres,
            "tags": tags,
            "characters": character_names,
            "writers": writers,
            "coverArtists": cover_artists,
            "publishers": publishers,
            "imprint": imprint,
            "releaseYear": release_year,
            # Both enum lookups are normalised the same way; anything
            # unknown (or not a string) maps to 0.
            "ageRating": _AGE_RATING_MAP.get(
                self._lookup_key(md.get("content_rating")), 0),
            "publicationStatus": _PUB_STATUS_MAP.get(
                self._lookup_key(md.get("status")), 0),
            "language": self.language,
            "webLinks": web_links,
            "score": score,
            "coverUrl": cover_url,
            "malId": mal_id,
            "anilistId": al_id,
            "relationships": list(md.get("relationships_v2") or []),
            "charactersDetailed": characters_detailed,
            "staffDetailed": staff_detailed,
            "raw": md,
        }

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _publishers_by_type(md: dict, ptype: str) -> list[str]:
        """Returns the names of ``md["publishers"]`` entries of type `ptype`.

        Entries that are not dicts or have no name are ignored.
        """
        return [p.get("name") for p in (md.get("publishers") or [])
                if isinstance(p, dict)
                and p.get("type") == ptype and p.get("name")]

    def _sort_title(self, md: dict) -> str:
        """Returns the title for ``self.language``, else the default title."""
        lang = (self.language or "").lower()
        key = self._SORT_LANGUAGE_ALIASES.get(lang, lang)
        alts = self._collect_alt_titles(md)
        return alts.get(key) or md.get("title") or ""

    def _collect_alt_titles(self, md: dict) -> "dict[str, str]":
        """Returns one best title per language code (en/de/jp/romaji).

        Candidates come from ``md["titles"]`` (or ``md["alt_titles"]``).
        Within a language the entry with the highest score wins:
        +4 for the preferred trait, +2 for "official", +1 for
        ``is_primary``; ties keep the earliest entry. Missing languages
        are filled from ``native_title`` / ``romanized_title`` / ``title``
        where available and otherwise left out of the result.
        """
        titles = md.get("titles") or md.get("alt_titles") or []

        def pick(language_codes: tuple, prefer_trait: "str | None" = None
                 ) -> "str | None":
            best_score = -1
            best_title: "str | None" = None
            for entry in titles:
                if not isinstance(entry, dict):
                    continue
                lang = (entry.get("language") or entry.get("lang") or "").lower()
                if lang not in language_codes:
                    continue
                title = entry.get("title")
                if not title:
                    continue
                traits = entry.get("traits") or []
                score = 0
                if prefer_trait and prefer_trait in traits:
                    score += 4
                if "official" in traits:
                    score += 2
                if entry.get("is_primary"):
                    score += 1
                # Strict ">" keeps the first of equally scored entries.
                if score > best_score:
                    best_score, best_title = score, title
            return best_title

        result: dict[str, str] = {}
        kanji = pick(("ja",), prefer_trait="native") or md.get("native_title")
        if kanji:
            result["jp"] = kanji
        romaji = pick(("ja-latn", "ja-romaji"))
        if not romaji:
            rt = md.get("romanized_title") or ""
            # Only accept the fallback when it is pure ASCII, i.e. it
            # really is a romanisation and not native script.
            if rt and all(ord(c) < 128 for c in rt):
                romaji = rt
        if romaji:
            result["romaji"] = romaji
        en = pick(("en",)) or md.get("title")
        if en:
            result["en"] = en
        de = pick(("de",))
        if de:
            result["de"] = de
        return result

    @staticmethod
    def _collect_all_alt_titles(md: dict) -> "dict[str, list[str]]":
        """Groups every known title by language group, without duplicates.

        Language codes are folded into the groups listed in ``_GROUPS``
        (e.g. all ``zh-*`` variants into "zh"); titles in other languages
        are dropped. Order of first appearance is preserved per group.
        """
        _GROUPS = {
            "en": ("en",),
            "de": ("de",),
            "ja": ("ja",),
            "ja-romaji": ("ja-latn", "ja-romaji"),
            "ko": ("ko",),
            "ko-romaji": ("ko-latn", "ko-romaji"),
            "zh": ("zh", "zh-hk", "zh-tw", "zh-hans", "zh-hant"),
            "zh-romaji": ("zh-latn",),
        }
        lang_to_group = {l: g for g, ls in _GROUPS.items() for l in ls}
        result: dict[str, list[str]] = {}
        seen: dict[str, set] = {}
        for entry in (md.get("titles") or md.get("alt_titles") or []):
            if not isinstance(entry, dict):
                continue
            lang = (entry.get("language") or entry.get("lang") or "").lower()
            group = lang_to_group.get(lang)
            if not group:
                continue
            title = (entry.get("title") or "").strip()
            if not title:
                continue
            result.setdefault(group, [])
            seen.setdefault(group, set())
            if title not in seen[group]:
                result[group].append(title)
                seen[group].add(title)
        return result

    def _collect_web_links(self, md: dict) -> list[str]:
        """Returns the de-duplicated web links for the series.

        Starts with the string entries of ``md["links"]`` and appends one
        URL per ``md["source"]`` tracker that has both a known template
        and an id. Non-string link entries are skipped: they cannot be
        used as URLs and unhashable ones would break de-duplication.
        """
        links: list[str] = [l for l in (md.get("links") or [])
                            if isinstance(l, str) and l]
        for raw_key, info in (md.get("source") or {}).items():
            template = _TRACKER_URL_TEMPLATES.get(_normalise_key(raw_key))
            if not template or not isinstance(info, dict):
                continue
            source_id = info.get("id")
            if source_id is not None:
                links.append(template.format(id=source_id))
        # dict.fromkeys drops repeats while keeping first-seen order.
        return list(dict.fromkeys(links))

    def _build_summary(self, md: dict,
                       mal_stats: "dict | None") -> str:
        """Builds the HTML summary with stats table + description + alt titles.

        The three sections are each optional and are joined with
        ``<br>``. Values that originate from the remote APIs (stats, the
        MAL URL, titles) are HTML-escaped before being placed into the
        markup so that "&" or "<" in a title cannot corrupt the table.
        The description is converted by ``_md_to_html`` and inserted
        as returned.
        """
        _TD = 'style="padding-right:1.5em"'
        parts: list[str] = []

        if mal_stats:
            url = mal_stats.get("url", "")
            as_of = mal_stats.get("as_of", "")
            rows: list[str] = []
            for label, key, fmt in (
                ("Score",      "score",      "{}"),
                ("Ranked",     "rank",       "#{}"),
                ("Scored by",  "scored_by",  "{:,} users"),
                ("Popularity", "popularity", "#{}"),
                ("Members",    "members",    "{:,}"),
                ("Favorites",  "favorites",  "{:,}"),
            ):
                v = mal_stats.get(key)
                if v is None:
                    continue
                try:
                    formatted = fmt.format(v)
                except (TypeError, ValueError):
                    # e.g. "{:,}" applied to a string value.
                    formatted = str(v)
                cell = html.escape(formatted, quote=False)
                rows.append(f"<tr><td {_TD}>{label}</td><td>{cell}</td></tr>")
            if rows:
                if url:
                    # quote=True (the default): the value sits inside an
                    # attribute.
                    href = html.escape(str(url))
                    link = f'<a href="{href}" target="_blank">MyAnimeList</a>'
                else:
                    link = "MyAnimeList"
                stamp = html.escape(str(as_of), quote=False)
                parts.append(f"<p>{link} stats as of {stamp}:</p>"
                             f"<table>{''.join(rows)}</table>")

        desc_raw = (md.get("description") or "").strip()
        if desc_raw:
            parts.append(_md_to_html(desc_raw))

        all_alt = self._collect_all_alt_titles(md)
        if all_alt:
            label_map = {
                "en": "EN",
                "de": "DE",
                "ja": "JA",
                "ja-romaji": "JA Romaji",
                "ko": "KO",
                "ko-romaji": "KO Romaji",
                "zh": "ZH",
                "zh-romaji": "ZH Romaji",
            }
            alt_rows: list[str] = []
            for group in ("en", "de", "ja", "ja-romaji",
                          "ko", "ko-romaji", "zh", "zh-romaji"):
                titles = all_alt.get(group)
                if not titles:
                    continue
                cell = "<br>".join(html.escape(t, quote=False) for t in titles)
                alt_rows.append(
                    f"<tr><td {_TD}>{label_map[group]}</td><td>{cell}</td></tr>")
            if alt_rows:
                parts.append(f"<table>{''.join(alt_rows)}</table>")

        return "<br>".join(parts)
|