Files
manga-mover-and-metadata-co…/src/ln/LightNovelMetadataBuilder.py
T
johannesbot 216771f709
Build and Deploy / build (push) Successful in 59s
Build and Deploy / deploy (push) Successful in 24s
merged ln metadata into manga mover
2026-06-14 10:47:47 +02:00

572 lines
22 KiB
Python

"""
LightNovelMetadataBuilder.py
============================
Fetches series-level metadata for a light novel from MangaBaka, enriches
it with MyAnimeList / AniList tracker statistics and character data, and
returns a structured dict ready to be diffed against Kavita's
SeriesMetadataDto.

Differences vs. the manga project's ComicInfoBuilder:

- No chapter / page handling — Kavita reads volumes from the files.
- No XML output — produces a plain dict.
- No MangaDex resolver — light novels don't have a chapter→volume
  mapping problem.
- MangaBaka search type is fixed to ``novel`` so only light/web novels
  are returned.
"""
from __future__ import annotations
import re
import requests
from MangaBakaRateLimit import apply_to_session as _apply_mangabaka_rate_limit
from MALResolver import MALResolver
from AniListResolver import AniListResolver
from MatchesCache import MatchesCache
from TextUtils import paragraphs_to_html, person_name_with_id
# MangaBaka series type(s) sent as the ``type`` query parameter of the
# search endpoint. Fixed to novels so that manga entries with the same
# title are not returned.
_SEARCH_TYPES = ["novel"]
# MangaBaka content_rating -> Kavita AgeRating enum
# Kavita AgeRating values (from openapi.json):
# 0=Unknown, 3=Everyone, 8=Teen, 10=Mature17Plus, 13=AdultsOnly
# Ratings missing from this map are reported as 0 (Unknown) by the builder.
_AGE_RATING_MAP = {
"safe": 3, # Everyone
"suggestive": 8, # Teen
"erotica": 10, # Mature17Plus
"pornographic": 13, # AdultsOnly
}
# MangaBaka status -> Kavita PublicationStatus enum
# Kavita PublicationStatus (from openapi.json):
# 0=OnGoing, 1=Hiatus, 2=Completed, 3=Cancelled, 4=Ended
# The builder lower-cases the status before the lookup and falls back to
# 0 (OnGoing) for anything not listed here.
_PUB_STATUS_MAP = {
"ongoing": 0,
"hiatus": 1,
"completed": 2,
"cancelled": 3,
"ended": 4,
}
# External-tracker URL templates used to enrich the web-links list.
# Keys are matched against the names in a series' ``source`` map after
# those names have been passed through ``_normalise_key`` (lower-case,
# alphanumerics only), so they must be written in that normalised form.
# "myanimelist"/"mal" and "animenewsnetwork"/"ann" are aliases that
# expand to the same URL. ``{id}`` is replaced with the tracker's id.
_TRACKER_URL_TEMPLATES = {
"anilist": "https://anilist.co/manga/{id}",
"myanimelist": "https://myanimelist.net/manga/{id}",
"mal": "https://myanimelist.net/manga/{id}",
"mangaupdates": "https://www.mangaupdates.com/series.html?id={id}",
"kitsu": "https://kitsu.app/manga/{id}",
"animenewsnetwork": "https://www.animenewsnetwork.com/encyclopedia/manga.php?id={id}",
"ann": "https://www.animenewsnetwork.com/encyclopedia/manga.php?id={id}",
"animeplanet": "https://www.anime-planet.com/manga/{id}",
"shikimori": "https://shikimori.one/mangas/{id}",
"bookwalker": "https://bookwalker.jp/{id}",
}
# Matches a backslash followed by one Markdown punctuation character;
# group 1 captures the escaped character without its backslash.
_MD_ESCAPE_RE = re.compile(r'\\([\\`*_{}\[\]()\#+\-.!|~])')
# --------------------------------------------------------------------------
# Helpers
# --------------------------------------------------------------------------
def _normalise_key(key) -> str:
return re.sub(r"[^a-z0-9]", "", str(key).lower())
def _format_term(value: str) -> str:
return str(value).replace("_", " ").strip().title() if value else ""
def _md_to_html(text: str) -> str:
"""Converts the subset of Markdown produced by MangaBaka to compact HTML."""
if not text:
return ""
text = _MD_ESCAPE_RE.sub(r'\1', text)
text = re.sub(
r'\[([^\]]+)\]\(([^)]+)\)',
lambda m: f'<a href="{m.group(2)}">{m.group(1)}</a>',
text,
)
text = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', text, flags=re.DOTALL)
text = re.sub(r'\*(.+?)\*', r'<em>\1</em>', text, flags=re.DOTALL)
return paragraphs_to_html(text)
def pick_cover_url(cover) -> "str | None":
    """Return the preferred cover URL from a MangaBaka cover value.

    A plain string is returned unchanged. For a dict the lookup order is:

    1. ``raw`` (either ``raw["url"]`` or ``raw`` itself when it is a string),
    2. the sized variants ``x350``, ``x250``, ``x150`` — each either a
       string or a dict of densities tried as ``x3``, ``x2``, ``x1``,
    3. any string value (top level or one level deep) starting with
       ``"http"``.

    Returns ``None`` when nothing usable is found or the input is neither
    a string nor a dict.
    """

    def is_filled_string(candidate) -> bool:
        return isinstance(candidate, str) and candidate != ""

    if not cover:
        return None
    if isinstance(cover, str):
        return cover
    if not isinstance(cover, dict):
        return None

    # 1. Original-resolution image.
    original = cover.get("raw")
    if isinstance(original, dict):
        original_url = original.get("url")
        if is_filled_string(original_url):
            return original_url
    elif is_filled_string(original):
        return original

    # 2. Sized variants, largest first, highest density first.
    for size in ("x350", "x250", "x150"):
        entry = cover.get(size)
        if isinstance(entry, dict):
            for density in ("x3", "x2", "x1"):
                candidate = entry.get(density)
                if is_filled_string(candidate):
                    return candidate
        elif is_filled_string(entry):
            return entry

    # 3. Last resort: anything that looks like a URL.
    for value in cover.values():
        if isinstance(value, str):
            if value.startswith("http"):
                return value
        elif isinstance(value, dict):
            for nested in value.values():
                if isinstance(nested, str) and nested.startswith("http"):
                    return nested
    return None
def pick_thumbnail_url(cover) -> "str | None":
    """Return a small cover variant suitable for a UI thumbnail.

    A plain string is returned unchanged. For a dict the sized variants
    are tried smallest first (``x150``, ``x250``, ``x350``); a variant may
    be a string or a dict of densities, tried as ``x2``, ``x1``, ``x3``.
    When no sized variant yields a URL the choice is delegated to
    ``pick_cover_url``.
    """
    if not cover:
        return None
    if isinstance(cover, str):
        return cover
    if not isinstance(cover, dict):
        return None

    for size in ("x150", "x250", "x350"):
        entry = cover.get(size)
        if isinstance(entry, str):
            if entry:
                return entry
            continue
        if not isinstance(entry, dict):
            continue
        for density in ("x2", "x1", "x3"):
            candidate = entry.get(density)
            if isinstance(candidate, str) and candidate:
                return candidate

    # Nothing small available: fall back to the general cover selection.
    return pick_cover_url(cover)
def _id_from_source(md: dict, *names: str) -> "int | None":
target = {_normalise_key(n) for n in names}
for raw_key, info in (md.get("source") or {}).items():
if _normalise_key(raw_key) in target and isinstance(info, dict):
mid = info.get("id")
if mid is not None:
try:
return int(mid)
except (TypeError, ValueError):
pass
return None
# --------------------------------------------------------------------------
# Builder
# --------------------------------------------------------------------------
class LightNovelMetadataBuilder:
    """
    Resolves a light-novel series on MangaBaka and produces a structured
    metadata dict ready to be merged into Kavita.

    The public entry points are :meth:`build` (title or id -> metadata
    dict), :meth:`resolve` (title -> MangaBaka series, cache-aware),
    :meth:`search_series` and :meth:`fetch_series`.
    """

    def __init__(self, *,
                 api_base_url: str = "https://api.mangabaka.dev/v1",
                 language: str = "en",
                 request_timeout: int = 30,
                 session: "requests.Session | None" = None,
                 mal_resolver: "MALResolver | None" = None,
                 al_resolver: "AniListResolver | None" = None,
                 matches_cache: "MatchesCache | None" = None):
        """
        Args:
            api_base_url: MangaBaka API root; a trailing slash is removed.
            language: Language code stored in the result and used to pick
                the sort title.
            request_timeout: Timeout passed to every HTTP request and to
                the default resolvers.
            session: Session to use for MangaBaka calls; a new one is
                created when omitted. A default ``User-Agent`` is set on
                it if none is present and the MangaBaka rate limit is
                applied to it.
            mal_resolver: MyAnimeList resolver; a light-novel one is
                created when omitted.
            al_resolver: AniList resolver; a novel one is created when
                omitted.
            matches_cache: Optional title -> MangaBaka match cache.
        """
        self.api_base_url = api_base_url.rstrip("/")
        self.language = language
        self.request_timeout = request_timeout
        self._session = session or requests.Session()
        self._session.headers.setdefault("User-Agent",
                                         "LightNovelMetadataBuilder/1.0")
        _apply_mangabaka_rate_limit(self._session)
        self._mal = mal_resolver or MALResolver(
            request_timeout=request_timeout, search_type="lightnovel")
        self._al = al_resolver or AniListResolver(
            request_timeout=request_timeout, media_format="novel")
        self._matches_cache = matches_cache

    # ------------------------------------------------------------------
    # MangaBaka search / fetch
    # ------------------------------------------------------------------
    def search_series(self, title: str) -> "dict | None":
        """Return the top MangaBaka novel hit for `title`, or None.

        This is a best-effort lookup: a blank title, any transport / HTTP
        error, an unparsable response body or an empty result list all
        yield ``None`` instead of raising.
        """
        if not title or not title.strip():
            return None
        url = f"{self.api_base_url}/series/search"
        try:
            resp = self._session.get(
                url,
                params={"q": title, "type": _SEARCH_TYPES,
                        "page": 1, "limit": 1},
                timeout=self.request_timeout)
            resp.raise_for_status()
        except requests.RequestException:
            return None
        # A 200 response with a non-JSON body must not escape the
        # "or None" contract; JSON decode errors are ValueError subclasses.
        try:
            payload = resp.json()
        except ValueError:
            return None
        hits = payload.get("data") if isinstance(payload, dict) else None
        if not isinstance(hits, list) or not hits:
            return None
        return hits[0]

    def fetch_series(self, series_id) -> "dict | None":
        """
        Returns the full MangaBaka series dict for the given id, following
        ``merged_with`` redirects. A seen-set guards against merge cycles.

        Returns ``None`` for a blank id, for a response without a usable
        ``data`` object, and when the redirects form a cycle.

        Raises:
            requests.RequestException: on transport or HTTP errors
                (``raise_for_status``); unlike :meth:`search_series` these
                are propagated to the caller.
            ValueError: when the response body is not valid JSON.
        """
        if series_id is None or str(series_id).strip() == "":
            return None
        seen: "set[str]" = set()
        current = series_id
        while str(current) not in seen:
            seen.add(str(current))
            url = f"{self.api_base_url}/series/{current}"
            resp = self._session.get(url, timeout=self.request_timeout)
            resp.raise_for_status()
            payload = resp.json()
            data = payload.get("data") if isinstance(payload, dict) else None
            if not isinstance(data, dict):
                # Missing or malformed payload: nothing to return or follow.
                return None
            if data.get("state") == "merged" and data.get("merged_with"):
                current = data["merged_with"]
                continue
            return data
        # Left the loop because an id repeated: merge cycle.
        return None

    # ------------------------------------------------------------------
    # Resolve title -> MangaBaka series (caches the match)
    # ------------------------------------------------------------------
    def resolve(self, title: str) -> "dict | None":
        """
        Returns the MangaBaka series for `title`.
        Lookup order:
        1. MatchesCache (uses stored mangabakaId, skips the search).
        2. Fresh MangaBaka search — top hit. Result is persisted to the
           cache so it survives a crash.
        """
        if self._matches_cache is not None:
            cached = self._matches_cache.get(title)
            if cached and cached.get("mangabakaId"):
                try:
                    series = self.fetch_series(cached["mangabakaId"])
                except Exception:
                    # Deliberately best-effort: a stale or unreachable
                    # cached id must not block the search fallback below.
                    series = None
                if series:
                    return series
        series = self.search_series(title)
        if series and self._matches_cache is not None:
            self._matches_cache.upsert(
                title,
                mangabaka_id=series.get("id"),
                mangabaka_name=series.get("title") or "",
                image_url=pick_thumbnail_url(series.get("cover")),
            )
        return series

    # ------------------------------------------------------------------
    # Main entry point
    # ------------------------------------------------------------------
    def build(self, *, title: str = "",
              mangabaka_id=None) -> "dict | None":
        """
        Fetches and enriches metadata for one series, returning the
        normalised dict assembled by :meth:`_assemble`.
        Pass either `title` (will resolve via cache/search) or
        `mangabaka_id` (direct fetch). A non-blank `mangabaka_id` takes
        precedence over `title`.

        Returns ``None`` when the series cannot be found. Errors raised by
        :meth:`fetch_series` propagate when `mangabaka_id` is used.
        """
        if mangabaka_id is not None and str(mangabaka_id).strip():
            md = self.fetch_series(mangabaka_id)
        else:
            md = self.resolve(title)
        if not md:
            return None
        return self._assemble(md)

    # ------------------------------------------------------------------
    # Internal: assemble the result dict
    # ------------------------------------------------------------------
    def _assemble(self, md: dict) -> dict:
        """Combine a MangaBaka series dict with tracker data.

        Tracker ids are read from ``md["source"]``; MyAnimeList data is
        preferred for stats, characters and staff, with AniList used as
        the fallback for characters and staff. The returned dict carries
        the original ``md`` under ``"raw"``.
        """
        mal_id = _id_from_source(md, "myanimelist", "mal")
        al_id = _id_from_source(md, "anilist")
        # Fall back to a title-based MAL lookup when the source map does
        # not carry an id — Jikan is the only tracker that ships staff
        # data we can use to enrich author / artist person records.
        if mal_id is None:
            mal_id = self._mal.find_mal_id(md.get("title") or "")
        mal_stats = self._mal.get_stats(mal_id) if mal_id else None
        characters_detailed = (
            self._mal.get_characters_detailed(mal_id) if mal_id else [])
        if not characters_detailed and al_id:
            characters_detailed = self._al.get_characters_detailed(al_id)
        staff_detailed = self._mal.get_staff_detailed(mal_id) if mal_id else []
        if not staff_detailed and al_id:
            staff_detailed = self._al.get_staff_detailed(al_id)
        # Character names for SeriesMetadata, disambiguated with the
        # tracker character id ("Rem (MAL 118737)") because Kavita person
        # records are global and keyed by name only.
        character_names = [
            person_name_with_id(c["name"],
                                mal_id=c.get("mal_id"),
                                al_id=c.get("al_id"))
            for c in characters_detailed if c.get("name")
        ]
        # Writers come from MangaBaka first (authoritative for novels)
        writers = list(md.get("authors") or [])
        # Illustrators / artists -> CoverArtists (Kavita has no dedicated
        # illustrator field, and Pencillers is the wrong semantic for
        # text-only novels).
        cover_artists = list(md.get("artists") or [])
        # Publisher: prefer English licence, else original. When both
        # exist, the original publisher becomes the imprint.
        english_pubs = self._publishers_by_type(md, "English")
        original_pubs = self._publishers_by_type(md, "Original")
        publishers = english_pubs or original_pubs
        imprint = original_pubs[0] if english_pubs and original_pubs else None
        # Release year
        release_year = None
        try:
            if md.get("year") is not None:
                release_year = int(md["year"])
        except (TypeError, ValueError):
            pass
        # Score: MangaBaka rating is 0..100 -> Kavita userRating is 0..5
        score = None
        if md.get("rating") is not None:
            try:
                score = round(float(md["rating"]) / 20.0, 1)
            except (TypeError, ValueError):
                pass
        # Tags / genres come back as snake_case slugs.
        genres = [_format_term(g) for g in (md.get("genres") or []) if g]
        tags = [_format_term(t) for t in (md.get("tags") or []) if t]
        # Web links
        web_links = self._collect_web_links(md)
        # Summary HTML
        summary = self._build_summary(md, mal_stats)
        # Cover URL
        cover_url = pick_cover_url(md.get("cover"))
        # Title variants
        all_alt = self._collect_all_alt_titles(md)
        # Both enum lookups are normalised the same way so that casing
        # differences in the API payload do not silently map to 0.
        content_rating = str(md.get("content_rating") or "").lower()
        status = str(md.get("status") or "").lower()
        return {
            "mangabakaId": str(md.get("id") or ""),
            "mangabakaTitle": md.get("title") or "",
            "originalName": md.get("native_title") or "",
            "localizedName": md.get("romanized_title") or "",
            "sortName": self._sort_title(md),
            "altTitles": all_alt,
            "summary": summary,
            "genres": genres,
            "tags": tags,
            "characters": character_names,
            "writers": writers,
            "coverArtists": cover_artists,
            "publishers": publishers,
            "imprint": imprint,
            "releaseYear": release_year,
            "ageRating": _AGE_RATING_MAP.get(content_rating, 0),
            "publicationStatus": _PUB_STATUS_MAP.get(status, 0),
            "language": self.language,
            "webLinks": web_links,
            "score": score,
            "coverUrl": cover_url,
            "malId": mal_id,
            "anilistId": al_id,
            "relationships": list(md.get("relationships_v2") or []),
            "charactersDetailed": characters_detailed,
            "staffDetailed": staff_detailed,
            "raw": md,
        }

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _publishers_by_type(md: dict, ptype: str) -> "list[str]":
        """Return the names of all publishers in ``md`` whose type is `ptype`.

        Entries that are not dicts or have no name are ignored.
        """
        return [
            entry["name"]
            for entry in (md.get("publishers") or [])
            if isinstance(entry, dict)
            and entry.get("type") == ptype
            and entry.get("name")
        ]

    def _sort_title(self, md: dict) -> str:
        """Return the title for the configured language, else ``md["title"]``.

        The lookup uses the keys produced by :meth:`_collect_alt_titles`
        (``en`` / ``de`` / ``jp`` / ``romaji``); any other language code
        falls back to the main title.
        """
        lang = self.language.lower()
        alts = self._collect_alt_titles(md)
        return alts.get(lang) or md.get("title") or ""

    def _collect_alt_titles(self, md: dict) -> "dict[str, str]":
        """Returns one best title per language code (en/de/jp/romaji)."""
        titles = md.get("titles") or md.get("alt_titles") or []

        def pick(language_codes: tuple, prefer_trait: "str | None" = None
                 ) -> "str | None":
            # Highest score wins; on a tie the earliest entry is kept.
            best_score = -1
            best_title: "str | None" = None
            for entry in titles:
                if not isinstance(entry, dict):
                    continue
                lang = (entry.get("language") or entry.get("lang") or "").lower()
                if lang not in language_codes:
                    continue
                title = entry.get("title")
                if not title:
                    continue
                traits = entry.get("traits") or []
                score = 0
                if prefer_trait and prefer_trait in traits:
                    score += 4
                if "official" in traits:
                    score += 2
                if entry.get("is_primary"):
                    score += 1
                if score > best_score:
                    best_score, best_title = score, title
            return best_title

        result: "dict[str, str]" = {}
        kanji = pick(("ja",), prefer_trait="native") or md.get("native_title")
        if kanji:
            result["jp"] = kanji
        romaji = pick(("ja-latn", "ja-romaji"))
        if not romaji:
            # Only trust romanized_title when it is pure ASCII.
            rt = md.get("romanized_title") or ""
            if rt and all(ord(c) < 128 for c in rt):
                romaji = rt
        if romaji:
            result["romaji"] = romaji
        en = pick(("en",)) or md.get("title")
        if en:
            result["en"] = en
        de = pick(("de",))
        if de:
            result["de"] = de
        return result

    @staticmethod
    def _collect_all_alt_titles(md: dict) -> "dict[str, list[str]]":
        """Group every known alternative title by language group.

        Language codes are matched case-insensitively and folded into the
        groups ``en``, ``de``, ``ja``, ``ja-romaji``, ``ko``, ``ko-romaji``,
        ``zh`` and ``zh-romaji``; other languages are dropped. Titles are
        stripped and de-duplicated per group, keeping first-seen order.
        """
        groups = {
            "en": ("en",),
            "de": ("de",),
            "ja": ("ja",),
            "ja-romaji": ("ja-latn", "ja-romaji"),
            "ko": ("ko",),
            "ko-romaji": ("ko-latn", "ko-romaji"),
            "zh": ("zh", "zh-hk", "zh-tw", "zh-hans", "zh-hant"),
            "zh-romaji": ("zh-latn",),
        }
        lang_to_group = {
            code: group for group, codes in groups.items() for code in codes
        }
        result: "dict[str, list[str]]" = {}
        for entry in (md.get("titles") or md.get("alt_titles") or []):
            if not isinstance(entry, dict):
                continue
            lang = (entry.get("language") or entry.get("lang") or "").lower()
            group = lang_to_group.get(lang)
            if not group:
                continue
            title = (entry.get("title") or "").strip()
            if not title:
                continue
            bucket = result.setdefault(group, [])
            if title not in bucket:
                bucket.append(title)
        return result

    def _collect_web_links(self, md: dict) -> "list[str]":
        """Return the de-duplicated list of web links for a series.

        Starts with the non-empty string entries of ``md["links"]`` and
        appends one URL per ``md["source"]`` entry that has a known tracker
        template and an ``id``. First-seen order is preserved.
        """
        # Non-string entries are skipped: they are not URLs and an
        # unhashable one would break the de-duplication below.
        links: "list[str]" = [
            link for link in (md.get("links") or [])
            if isinstance(link, str) and link
        ]
        for raw_key, info in (md.get("source") or {}).items():
            template = _TRACKER_URL_TEMPLATES.get(_normalise_key(raw_key))
            if not template or not isinstance(info, dict):
                continue
            source_id = info.get("id")
            if source_id is not None:
                links.append(template.format(id=source_id))
        # dict keys keep insertion order, giving an ordered de-duplication.
        return list(dict.fromkeys(links))

    def _build_summary(self, md: dict,
                       mal_stats: "dict | None") -> str:
        """Builds the HTML summary with stats table + description + alt titles.

        The sections (each optional) are joined with ``<br>``:

        1. a MyAnimeList statistics table built from `mal_stats`,
        2. the series description converted by ``_md_to_html``,
        3. a table of alternative titles grouped by language.

        Titles, the stats URL and stat values originate from external
        services and are HTML-escaped before being embedded.
        """
        # Function-scope import keeps this method self-contained.
        from html import escape

        cell_style = 'style="padding-right:1.5em"'
        parts: "list[str]" = []

        if mal_stats:
            url = mal_stats.get("url", "")
            as_of = mal_stats.get("as_of", "")
            rows: "list[str]" = []
            for label, key, fmt in (
                ("Score", "score", "{}"),
                ("Ranked", "rank", "#{}"),
                ("Scored by", "scored_by", "{:,} users"),
                ("Popularity", "popularity", "#{}"),
                ("Members", "members", "{:,}"),
                ("Favorites", "favorites", "{:,}"),
            ):
                value = mal_stats.get(key)
                if value is None:
                    continue
                try:
                    formatted = fmt.format(value)
                except (TypeError, ValueError):
                    # e.g. "{:,}" applied to a non-numeric value
                    formatted = str(value)
                rows.append(
                    f"<tr><td {cell_style}>{label}</td>"
                    f"<td>{escape(formatted)}</td></tr>")
            if rows:
                if url:
                    link = (f'<a href="{escape(str(url), quote=True)}" '
                            f'target="_blank">MyAnimeList</a>')
                else:
                    link = "MyAnimeList"
                parts.append(
                    f"<p>{link} stats as of {escape(str(as_of))}:</p>"
                    f"<table>{''.join(rows)}</table>")

        desc_raw = (md.get("description") or "").strip()
        if desc_raw:
            parts.append(_md_to_html(desc_raw))

        all_alt = self._collect_all_alt_titles(md)
        if all_alt:
            # Display order and labels of the language groups.
            labelled_groups = (
                ("en", "EN"),
                ("de", "DE"),
                ("ja", "JA"),
                ("ja-romaji", "JA Romaji"),
                ("ko", "KO"),
                ("ko-romaji", "KO Romaji"),
                ("zh", "ZH"),
                ("zh-romaji", "ZH Romaji"),
            )
            alt_rows: "list[str]" = []
            for group, label in labelled_groups:
                titles = all_alt.get(group)
                if not titles:
                    continue
                cell = "<br>".join(escape(title) for title in titles)
                alt_rows.append(
                    f"<tr><td {cell_style}>{label}</td><td>{cell}</td></tr>")
            if alt_rows:
                parts.append(f"<table>{''.join(alt_rows)}</table>")

        return "<br>".join(parts)