Files
manga-mover-and-metadata-co…/src/ComicInfoBuilder.py
T
2026-05-26 21:03:37 +02:00

1083 lines
43 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
ComicInfoBuilder.py
===================
Generates a ComicInfo.xml (compatible with Kavita v0.9.0.2 / ComicInfo v2.1)
from series metadata provided by the MangaBaka API, enriched with data from
MangaDex (volume mapping), MangaBaka works (volume covers / ISBN / dates),
and MyAnimeList / Jikan (statistics and characters).
Dependencies
------------
requests (required -> API calls / cover download)
Pillow (PIL) (optional -> image dimensions for <Page> entries)
pip install requests pillow
The modules MangadexVolumeResolver, MangaBakaWorksResolver, MALResolver,
AniListResolver, MatchesCache and MangaBakaRateLimit must reside in the
same directory.
API address note
----------------
The official MangaBaka API is hosted at https://api.mangabaka.dev/v1
(domain ".dev", not ".org"). Use the `api_base_url` constructor parameter
to override this if needed.
Data source notes
-----------------
* Volume assignment per chapter is resolved via MangaDex
(MangaDexVolumeResolver). Chapters missing from MangaDex are estimated
from neighbouring volume boundaries and MangaBaka page-count data.
* Volume-specific covers, ISBNs and publication dates come from MangaBaka
works (MangaBakaWorksResolver). If no volume is assigned the series
cover is used instead.
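* MAL statistics and character names are fetched via the Jikan API
(MALResolver). When MAL yields no characters and the series has an
AniList ID, character names are taken from AniListResolver instead.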
"""
from __future__ import annotations
import difflib
import re
import xml.etree.ElementTree as ET
from pathlib import Path
import requests
from MangadexVolumeResolver import MangaDexVolumeResolver
from MangaBakaWorksResolver import MangaBakaWorksResolver
from MALResolver import MALResolver
from AniListResolver import AniListResolver
from MatchesCache import MatchesCache
from MangaBakaRateLimit import apply_to_session as _apply_mangabaka_rate_limit
try:
from PIL import Image
_HAS_PIL = True
except ImportError:
_HAS_PIL = False
# --------------------------------------------------------------------------
# Constants
# --------------------------------------------------------------------------
# File suffixes (lowercase, with leading dot) that count as page images when
# scanning a chapter folder and when guessing a downloaded cover's extension.
_IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".avif"}
# Series types accepted by the MangaBaka search endpoint. Light/web novels
# are filtered out because this pipeline only handles image-based manga.
# Passed to `requests` as a list so each value becomes its own `&type=...`
# query parameter (MangaBaka's API expects repeated keys, not a CSV list).
_SEARCH_TYPES = ["manga", "manhwa", "manhua"]
# Maps the series' `content_rating` value to the text written to <AgeRating>.
# Values missing from this map are emitted as "Unknown" by the builder.
_AGE_RATING_MAP = {
    "safe": "Everyone",
    "suggestive": "Teen",
    "erotica": "Mature 17+",
    "pornographic": "Adults Only 18+",
}
# URL templates used to turn an entry of the series' `source` map into a
# <Web> link; `{id}` is replaced by the tracker-specific series ID.
_TRACKER_URL_TEMPLATES = {
    # Keys are normalised via _normalise_key (alphanumeric only, lowercase),
    # so e.g. the source key "anime_news_network" matches "animenewsnetwork".
    "anilist": "https://anilist.co/manga/{id}",
    "myanimelist": "https://myanimelist.net/manga/{id}",
    "mal": "https://myanimelist.net/manga/{id}",
    "mangaupdates": "https://www.mangaupdates.com/series.html?id={id}",
    "mangadex": "https://mangadex.org/title/{id}",
    "kitsu": "https://kitsu.app/manga/{id}",
    "animenewsnetwork": "https://www.animenewsnetwork.com/encyclopedia/manga.php?id={id}",
    "ann": "https://www.animenewsnetwork.com/encyclopedia/manga.php?id={id}",
    "animeplanet": "https://www.anime-planet.com/manga/{id}",
    "shikimori": "https://shikimori.one/mangas/{id}",
}
# MangaDex relationship types that indicate child works (spin-offs, sequels …)
# NOTE(review): not referenced anywhere in the visible part of this module —
# confirm whether it is still needed before relying on it.
_CHILD_RELATION_TYPES = {"side_story", "spin_off", "sequel", "prequel",
                         "doujinshi", "adapted_from", "alternative_story",
                         "alternative_version"}
# --------------------------------------------------------------------------
# Module helpers
# --------------------------------------------------------------------------
def _natural_key(name: str):
return [int(p) if p.isdigit() else p.lower()
for p in re.split(r"(\d+)", name)]
def _normalise_key(key) -> str:
return re.sub(r"[^a-z0-9]", "", str(key).lower())
def _format_term(value: str) -> str:
"""Converts a MangaBaka genre slug ('slice_of_life') to display form."""
return str(value).replace("_", " ").strip().title() if value else ""
# Markdown backslash escape sequences recognised by CommonMark (e.g. \- → -)
_MD_ESCAPE_RE = re.compile(r'\\([\\`*_{}\[\]()\#+\-.!|~])')
def _md_to_html(text: str) -> str:
"""
Converts a subset of Markdown (as produced by MangaBaka) to HTML.
Handles: backslash escapes, [text](url) links, **bold**, *italic*,
blank-line paragraph splits, and single-newline line breaks.
Produces compact HTML with no raw newline characters — Kavita renders
every bare \\n as a <br>, so all line-breaks must be explicit.
"""
if not text:
return ""
# Unescape Markdown backslash sequences (\- → -, \* → *, …)
text = _MD_ESCAPE_RE.sub(r'\1', text)
# [text](url) → <a href="url">text</a>
text = re.sub(
r'\[([^\]]+)\]\(([^)]+)\)',
lambda m: f'<a href="{m.group(2)}">{m.group(1)}</a>',
text,
)
# **bold** before *italic* so ** is not mistaken for two *
text = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', text, flags=re.DOTALL)
text = re.sub(r'\*(.+?)\*', r'<em>\1</em>', text, flags=re.DOTALL)
# Split on blank lines → <p> blocks; single newlines → <br>
parts: list[str] = []
for para in re.split(r'\n{2,}', text.strip()):
para = para.strip()
if para:
parts.append(f"<p>{para.replace(chr(10), '<br>')}</p>")
return "".join(parts) # no raw \n — every \n becomes a <br> in Kavita
# --------------------------------------------------------------------------
# Main class
# --------------------------------------------------------------------------
class ComicInfoBuilder:
    """
    Builds a ComicInfo.xml for a single manga chapter.

    Constructor arguments
    ---------------------
    manga_title : Title of the manga (used for the API search).
    chapter : Chapter number (int, float, or str — e.g. "10.5").

    Setter behaviour
    ----------------
    * Changing `manga_title` discards both the cached API metadata
      AND the current results (pages / cover).
    * Changing `chapter` discards only the current results;
      the API metadata is kept.
    """

    # Language codes whose titles _collect_alt_titles() files under a
    # different key: Japanese kanji titles are stored as "jp", while the
    # ISO code a caller passes as `language` is "ja".
    _SORT_LANGUAGE_ALIASES = {"ja": "jp"}

    def __init__(self, manga_title, chapter, *,
                 api_base_url: str = "https://api.mangabaka.dev/v1",
                 language: str = "en",
                 request_timeout: int = 30,
                 session: "requests.Session | None" = None,
                 volume_resolver: "MangaDexVolumeResolver | None" = None,
                 works_resolver: "MangaBakaWorksResolver | None" = None,
                 mal_resolver: "MALResolver | None" = None,
                 al_resolver: "AniListResolver | None" = None,
                 matches_cache: "MatchesCache | None" = None):
        """
        Stores the configuration and creates any collaborator not supplied.

        Raises ValueError when `manga_title` is empty or whitespace only.
        """
        if not manga_title or not str(manga_title).strip():
            raise ValueError("manga_title must not be empty.")
        self._manga_title = str(manga_title).strip()
        self._chapter = chapter
        self.api_base_url = api_base_url.rstrip("/")
        self.language = language
        self.request_timeout = request_timeout
        self._session = session or requests.Session()
        self._session.headers.setdefault("User-Agent", "ComicInfoBuilder/1.0")
        # Throttle every call to api.mangabaka.dev (idempotent — safe even
        # when the session was already prepared by a parent class).
        _apply_mangabaka_rate_limit(self._session)
        self._volume_resolver = (volume_resolver
                                 or MangaDexVolumeResolver(
                                     request_timeout=request_timeout,
                                     session=self._session))
        # Hand over the normalised base URL (trailing "/" removed) so the
        # resolver builds its URLs from the same base as this class.
        self._works_resolver = (works_resolver
                                or MangaBakaWorksResolver(
                                    api_base_url=self.api_base_url,
                                    request_timeout=request_timeout,
                                    session=self._session))
        # Both resolvers are Singletons — they manage their own sessions/caches.
        self._mal_resolver = mal_resolver or MALResolver(
            request_timeout=request_timeout)
        self._al_resolver = al_resolver or AniListResolver(
            request_timeout=request_timeout)
        self._matches_cache = matches_cache
        self._metadata: "dict | None" = None
        self._pages: list[dict] = []
        self._cover_path: "Path | None" = None
        self._suwayomi_data: dict = {}

    # ----- Repr -----------------------------------------------------------
    def __repr__(self) -> str:
        return (f"ComicInfoBuilder(manga_title={self._manga_title!r}, "
                f"chapter={self._chapter!r})")

    # ======================================================================
    # Properties / setters
    # ======================================================================
    @property
    def manga_title(self) -> str:
        """Title used for the MangaBaka lookup."""
        return self._manga_title

    @manga_title.setter
    def manga_title(self, value):
        value = str(value).strip()
        if not value:
            raise ValueError("manga_title must not be empty.")
        if value == self._manga_title:
            return
        # A different series invalidates the metadata as well as the results.
        self._manga_title = value
        self._metadata = None
        self._clear_results()

    @property
    def chapter(self):
        """Chapter number the ComicInfo.xml is built for."""
        return self._chapter

    @chapter.setter
    def chapter(self, value):
        if value == self._chapter:
            return
        self._chapter = value
        self._clear_results()

    def _clear_results(self) -> None:
        """Drops the per-chapter results (pages, cover, Suwayomi fields)."""
        self._pages = []
        self._cover_path = None
        self._suwayomi_data = {}

    # ======================================================================
    # Public XML functions
    # ======================================================================
    def to_xml_string(self, *, pretty: bool = True) -> str:
        """Returns the ComicInfo.xml as a string."""
        tree = self._build_tree()
        if pretty:
            try:
                ET.indent(tree, space=" ")
            except AttributeError:
                # ET.indent is missing on older interpreters; emit unindented.
                pass
        body = ET.tostring(tree.getroot(), encoding="unicode")
        return '<?xml version="1.0" encoding="UTF-8"?>\n' + body

    def save_xml(self, path) -> Path:
        """
        Writes the ComicInfo.xml to `path`.

        If a directory is passed, ComicInfo.xml is created inside it.
        Returns the actual file path used.
        """
        path = Path(path)
        if path.is_dir():
            path = path / "ComicInfo.xml"
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(self.to_xml_string(), encoding="utf-8")
        return path

    # ======================================================================
    # Optional: analyse an image folder
    # ======================================================================
    def add_pages_from_folder(self, folder, *,
                              download_cover: bool = True,
                              cover_filename: str = "cover") -> dict:
        """
        Scans a chapter image folder and populates <Page> entries.

        Reads an existing Suwayomi ComicInfo.xml for supplementary fields.
        Downloads the cover (volume-specific if a volume is found, otherwise
        the series default cover).

        Returns a dict with `page_count`, `cover` (path string or None) and
        `suwayomi_fields`. Raises NotADirectoryError if `folder` is not a
        directory.
        """
        folder = Path(folder)
        if not folder.is_dir():
            raise NotADirectoryError(f"Folder not found: {folder}")
        self._suwayomi_data = self._read_existing_comicinfo(folder)
        self._cover_path = None
        if download_cover:
            self._cover_path = self._download_cover(folder, cover_filename)
        cover_resolved = self._cover_path.resolve() if self._cover_path else None

        story_images: list[Path] = []
        for entry in folder.iterdir():
            if not entry.is_file():
                continue
            if entry.suffix.lower() not in _IMAGE_EXTS:
                continue
            # The cover lives in the same folder; it must not be listed twice.
            if cover_resolved and entry.resolve() == cover_resolved:
                continue
            story_images.append(entry)
        story_images.sort(key=lambda p: _natural_key(p.name))

        ordered: list[tuple[Path, str]] = []
        if self._cover_path:
            ordered.append((self._cover_path, "FrontCover"))
        ordered.extend((img, "Story") for img in story_images)

        self._pages = []
        for index, (img_path, page_type) in enumerate(ordered):
            width, height = self._image_dimensions(img_path)
            try:
                size = img_path.stat().st_size
            except OSError:
                size = None
            self._pages.append({
                "image": index,
                "type": page_type,
                "width": width,
                "height": height,
                "size": size,
                # Landscape images are flagged as double-page spreads.
                "double": bool(width and height and width > height),
            })
        return {
            "page_count": len(self._pages),
            "cover": str(self._cover_path) if self._cover_path else None,
            "suwayomi_fields": dict(self._suwayomi_data),
        }

    # ======================================================================
    # Metadata retrieval (MangaBaka API)
    # ======================================================================
    def fetch_metadata(self, *, force: bool = False) -> dict:
        """Fetches (and caches) the series metadata. Pass force=True to refresh."""
        return self._get_metadata(force=force)

    def _get_metadata(self, *, force: bool = False) -> dict:
        """
        Returns the cached series metadata, fetching it on first use.

        Follows a `merged_with` redirect when the found series is marked as
        merged. Raises RuntimeError when no series matches the title.
        """
        if self._metadata is not None and not force:
            return self._metadata
        series = self._search_best_series(self._manga_title)
        if series is None:
            raise RuntimeError(
                f"No series found for '{self._manga_title}' on MangaBaka.")
        if series.get("state") == "merged" and series.get("merged_with"):
            series = self._fetch_series_by_id(series["merged_with"])
        self._metadata = series
        return series

    def _search_best_series(self, title: str):
        """
        Resolves `title` to a MangaBaka series.

        Lookup order:
        1. matches.json cache (if attached) — uses the stored series ID
           to fetch the full series, skipping the search step entirely.
        2. Fresh MangaBaka search — top hit. The match is persisted to
           matches.json before being returned so it survives a crash.

        Returns the series dict, or None when the search has no result.
        """
        if self._matches_cache is not None:
            cached = self._matches_cache.get(title)
            if cached and cached.get("mangabakaId"):
                try:
                    return self._fetch_series_by_id(cached["mangabakaId"])
                except Exception as exc:
                    # A stale cache entry must not block the lookup.
                    print(f"[ComicInfoBuilder] cached id "
                          f"{cached['mangabakaId']} for {title!r} failed "
                          f"({exc}); falling back to fresh search",
                          flush=True)
        url = f"{self.api_base_url}/series/search"
        resp = self._session.get(
            url, params={"q": title, "type": _SEARCH_TYPES,
                         "page": 1, "limit": 1},
            timeout=self.request_timeout)
        resp.raise_for_status()
        data = resp.json().get("data") or []
        series = data[0] if data else None
        if series and self._matches_cache is not None:
            self._matches_cache.add(
                title,
                mangabaka_id=series.get("id"),
                mangabaka_name=series.get("title") or "",
                image_url=_pick_cover_url(series.get("cover")),
            )
        return series

    def _fetch_series_by_id(self, series_id) -> dict:
        """Fetches one series by ID; raises RuntimeError if the payload is empty."""
        url = f"{self.api_base_url}/series/{series_id}"
        resp = self._session.get(url, timeout=self.request_timeout)
        resp.raise_for_status()
        data = resp.json().get("data")
        if not data:
            raise RuntimeError(f"Series with ID {series_id} not found.")
        return data

    # ======================================================================
    # XML construction
    # ======================================================================
    def _build_tree(self) -> "ET.ElementTree":
        """Assembles the complete <ComicInfo> element tree for the chapter."""
        md = self._get_metadata()
        sd = self._suwayomi_data
        volume = self._determine_volume()
        work = self._get_work_for_volume(md, volume) if volume else None
        root = ET.Element("ComicInfo", {
            "xmlns:xsd": "http://www.w3.org/2001/XMLSchema",
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
        })

        def add(tag: str, value) -> None:
            # Empty / None values are skipped so no empty elements are written.
            if value is None:
                return
            text = str(value).strip()
            if text:
                ET.SubElement(root, tag).text = text

        # ----- Title / Series -----------------------------------------------
        add("Title", sd.get("Title") or f"Chapter {self._chapter}")
        add("Series", md.get("title") or self._manga_title)
        add("LocalizedSeries",
            md.get("native_title") or md.get("romanized_title"))
        add("SeriesSort", self._get_sort_title(md))
        add("Number", sd.get("Number") or self._chapter)
        add("Count", md.get("total_chapters"))
        add("Volume", volume)
        # ----- Description with MAL stats -----------------------------------
        # Prefer the MAL ID from MangaBaka's source map — avoids an extra
        # Jikan title-search request and is more reliable than fuzzy matching.
        mal_id = (self._mal_id_from_source(md)
                  or self._mal_resolver.find_mal_id(
                      md.get("title") or self._manga_title))
        al_id = self._al_id_from_source(md)
        mal_stats = self._mal_resolver.get_stats(mal_id)
        add("Summary", self._build_summary(md, sd, mal_stats))
        # ----- Release date -------------------------------------------------
        # Volume publication date takes precedence over the chapter date.
        vol_year, vol_month, vol_day = self._parse_work_date(work)
        add("Year", vol_year or sd.get("Year") or md.get("year"))
        add("Month", vol_month or sd.get("Month"))
        add("Day", vol_day or sd.get("Day"))
        # ----- Contributors -------------------------------------------------
        add("Writer", ", ".join(md.get("authors") or []))
        add("Penciller", ", ".join(md.get("artists") or []))
        add("Translator", sd.get("Translator"))
        # ----- Publisher ----------------------------------------------------
        eng_pub = self._publishers_by_type(md, "English")
        orig_pub = self._publishers_by_type(md, "Original")
        add("Publisher", eng_pub or orig_pub)
        if eng_pub and orig_pub:
            add("Imprint", orig_pub)
        # ----- Genres / Tags ------------------------------------------------
        # Genres come back as lowercase snake_case ("slice_of_life"); convert
        # to display form ("Slice Of Life") so Kavita / readers show them
        # consistently with the (already-titled-cased) Tags field.
        add("Genre", ", ".join(_format_term(g) for g in (md.get("genres") or [])))
        add("Tags", ", ".join(_format_term(t) for t in (md.get("tags") or [])))
        # ----- Characters — MAL first, AniList fallback ---------------------
        characters = self._mal_resolver.get_characters(mal_id)
        if not characters and al_id:
            characters = self._al_resolver.get_characters(al_id)
        add("Characters", ", ".join(characters) if characters else None)
        # ----- Web links ----------------------------------------------------
        add("Web", " ".join(self._collect_web_links(md, sd)))
        # ----- Miscellaneous ------------------------------------------------
        add("LanguageISO", self.language)
        add("Manga", self._manga_flag(md))
        add("AgeRating", _AGE_RATING_MAP.get(md.get("content_rating"), "Unknown"))
        if md.get("rating") is not None:
            try:
                # MangaBaka rating is on a 0..100 scale -> ComicInfo
                # CommunityRating uses 0..5.
                add("CommunityRating", round(float(md["rating"]) / 20, 1))
            except (TypeError, ValueError):
                pass
        # ----- ISBN (GTIN) from volume work ---------------------------------
        identifiers = (work or {}).get("identifiers") or []
        isbn = identifiers[0].get("id") if identifiers else None
        add("GTIN", isbn)
        # ----- SeriesGroup from related works -------------------------------
        add("SeriesGroup", self._determine_series_group(md))
        # ----- Alternate title notes ----------------------------------------
        add("Notes", self._build_notes(md))
        # ----- Pages --------------------------------------------------------
        if self._pages:
            add("PageCount", len(self._pages))
            pages_el = ET.SubElement(root, "Pages")
            for page in self._pages:
                attrs = {"Image": str(page["image"]), "Type": page["type"]}
                if page.get("size") is not None:
                    attrs["ImageSize"] = str(page["size"])
                if page.get("width"):
                    attrs["ImageWidth"] = str(page["width"])
                if page.get("height"):
                    attrs["ImageHeight"] = str(page["height"])
                if page.get("double"):
                    attrs["DoublePage"] = "true"
                ET.SubElement(pages_el, "Page", attrs)
        return ET.ElementTree(root)

    # ======================================================================
    # Volume determination
    # ======================================================================
    def _determine_volume(self) -> "str | None":
        """
        Resolves the volume for the current chapter via MangaDex.

        Falls back to estimation when the chapter is absent from MangaDex.
        Returns None if no volume can be determined (including when any
        resolver call raises — volume lookup is best effort).
        """
        md = self._get_metadata()
        try:
            manga_id = self._mangadex_id_from_source(md)
            if not manga_id:
                manga_id = self._volume_resolver.find_manga_id(
                    md.get("native_title") or self._manga_title)
            if not manga_id:
                return None
            series_id = str(md.get("id") or "")
            page_counts = {}
            if series_id:
                page_counts = self._works_resolver.get_page_counts(series_id)
            return self._volume_resolver.volume_for_chapter(
                manga_id, self._chapter,
                volume_page_counts=page_counts or None)
        except Exception:
            return None

    def _get_work_for_volume(self, md: dict,
                             volume: "str | None") -> "dict | None":
        """Returns the MangaBaka work dict for the current volume, or None."""
        if not volume:
            return None
        series_id = str(md.get("id") or "")
        if not series_id:
            return None
        try:
            return self._works_resolver.get_work_for_volume(series_id, volume)
        except Exception:
            return None

    # ======================================================================
    # Cover download
    # ======================================================================
    def _download_cover(self, folder: Path, cover_filename: str) -> "Path | None":
        """
        Downloads the cover for the current chapter/volume.

        If a volume is known and a volume-specific cover exists in MangaBaka
        works, that cover is used. Otherwise the series default cover is
        downloaded (raw variant preferred).

        Returns the written file path, or None when no cover URL is known or
        the download fails.
        """
        md = self._get_metadata()
        volume = self._determine_volume()
        cover_url: "str | None" = None
        if volume:
            series_id = str(md.get("id") or "")
            if series_id:
                try:
                    cover_url = self._works_resolver.get_cover_for_volume(
                        series_id, volume)
                except Exception:
                    # Fall through to the series cover below.
                    pass
        if not cover_url:
            cover_url = _pick_cover_url(md.get("cover"))
        if not cover_url:
            return None
        try:
            resp = self._session.get(cover_url, timeout=self.request_timeout)
            resp.raise_for_status()
        except requests.RequestException:
            return None
        ext = _guess_extension(cover_url, resp.headers.get("Content-Type", ""))
        target = folder / f"{cover_filename}{ext}"
        target.write_bytes(resp.content)
        return target

    # ======================================================================
    # Series group
    # ======================================================================
    def _determine_series_group(self, md: dict) -> "str | None":
        """
        Determines SeriesGroup from MangaBaka's relationships_v2 field.

        - If the series has a 'parent' relationship entry → fetch the parent
          series and return its MangaBaka title (so arcs/sequels appear under
          the root series in Kavita).
        - Otherwise → return the series' own title (it is the root, or a
          standalone series with no parent).

        Only the first 'parent' entry is considered.
        """
        for rel in (md.get("relationships_v2") or []):
            if rel.get("relation_type") == "parent":
                parent_id = rel.get("to_series_id")
                if parent_id is not None:
                    try:
                        parent_md = self._fetch_series_by_id(parent_id)
                        parent_title = parent_md.get("title")
                        if parent_title:
                            return parent_title
                    except Exception:
                        pass
                break
        return md.get("title") or self._manga_title

    # ======================================================================
    # Title helpers
    # ======================================================================
    def _get_sort_title(self, md: dict) -> "str | None":
        """
        Returns the SeriesSort title in the configured language.

        Looks for an alt-title with matching language code first (also
        trying the key alias from _SORT_LANGUAGE_ALIASES, so language "ja"
        finds the kanji title stored under "jp"); falls back to the primary
        title.
        """
        lang = self.language.lower()
        alt_titles = self._collect_alt_titles(md)
        for key in (lang, self._SORT_LANGUAGE_ALIASES.get(lang)):
            if key and key in alt_titles:
                return alt_titles[key]
        # For 'en' the primary MangaBaka title is usually already English
        return md.get("title") or self._manga_title

    def _collect_alt_titles(self, md: dict) -> "dict[str, str]":
        """
        Returns {key: title} for EN ("en"), DE ("de"), JP kanji ("jp") and
        JP romaji ("romaji").

        MangaBaka stores alt-titles in the `titles` list, where each entry is
        a dict {language, title, traits, is_primary, note}.

        Important caveats observed against the real API:
        * `romanized_title` is the romanization of whatever the series'
          native script is — for a Japanese manga with a Korean licence it
          can hold the Korean romanization, NOT the Japanese romaji.
          Always prefer `titles[language="ja-Latn"]` for romaji instead.
        * `native_title` holds the kanji form for Japanese manga, but
          `titles[language="ja", traits contains "native"]` is more
          reliable when present.
        * Each language can have several entries; primary + official
          traits win over generic ones.
        """
        titles = md.get("titles") or md.get("alt_titles") or []

        def pick(language_codes: tuple, prefer_trait: "str | None" = None
                 ) -> "str | None":
            """Picks the best title entry for any of the given language codes."""
            if not isinstance(titles, list):
                return None
            best_score = -1
            best_title: "str | None" = None
            for entry in titles:
                if not isinstance(entry, dict):
                    continue
                lang = (entry.get("language") or entry.get("lang") or "").lower()
                if lang not in language_codes:
                    continue
                title = entry.get("title")
                if not title:
                    continue
                traits = entry.get("traits") or []
                score = 0
                if prefer_trait and prefer_trait in traits:
                    score += 4
                if "official" in traits:
                    score += 2
                if entry.get("is_primary"):
                    score += 1
                # Strict ">" keeps the earliest entry on a tie.
                if score > best_score:
                    best_score, best_title = score, title
            return best_title

        result: dict[str, str] = {}
        # JP kanji (prefer entry with "native" trait, fall back to native_title)
        kanji = pick(("ja",), prefer_trait="native") or md.get("native_title")
        if kanji:
            result["jp"] = kanji
        # JP romaji — explicitly from "ja-Latn" entries. Do NOT fall back to
        # `romanized_title` blindly; that field can hold a non-Japanese
        # romanization (e.g. Korean) for the same series.
        romaji = pick(("ja-latn", "ja-romaji"))
        if not romaji:
            # Heuristic fallback only when romanized_title looks Latin
            rt = md.get("romanized_title") or ""
            if rt and all(ord(c) < 128 for c in rt):
                romaji = rt
        if romaji:
            result["romaji"] = romaji
        # English (prefer official + primary)
        en = pick(("en",))
        if not en:
            en = md.get("title") if md.get("title") else None
        if en:
            result["en"] = en
        # German
        de = pick(("de",))
        if de:
            result["de"] = de
        return result

    def _collect_all_alt_titles(self, md: dict) -> "dict[str, list[str]]":
        """
        Returns all known title variants grouped by language/script.

        Groups collected (skipped when empty):
        "en"         English (language = "en")
        "de"         German (language = "de")
        "ja"         Japanese native kanji (language = "ja")
        "ja-romaji"  Japanese romanized (language = "ja-Latn" / "ja-romaji")
        "ko"         Korean native (language = "ko")
        "ko-romaji"  Korean romanized (language = "ko-Latn" / "ko-romaji")
        "zh"         Chinese native (language = "zh" / "zh-hk" / "zh-tw" / …)
        "zh-romaji"  Chinese romanized (language = "zh-Latn")

        All variants are included (not just primary), preserving API order.
        Duplicates within a group are removed.
        """
        _GROUPS: "dict[str, tuple]" = {
            "en": ("en",),
            "de": ("de",),
            "ja": ("ja",),
            "ja-romaji": ("ja-latn", "ja-romaji"),
            "ko": ("ko",),
            "ko-romaji": ("ko-latn", "ko-romaji"),
            "zh": ("zh", "zh-hk", "zh-tw", "zh-hans", "zh-hant"),
            "zh-romaji": ("zh-latn",),
        }
        # Pre-build a flat lang → group mapping for O(1) lookup
        lang_to_group: "dict[str, str]" = {
            lang: group
            for group, langs in _GROUPS.items()
            for lang in langs
        }
        result: "dict[str, list[str]]" = {}
        seen: "dict[str, set[str]]" = {}
        for entry in (md.get("titles") or md.get("alt_titles") or []):
            if not isinstance(entry, dict):
                continue
            lang = (entry.get("language") or entry.get("lang") or "").lower()
            group = lang_to_group.get(lang)
            if not group:
                continue
            title = (entry.get("title") or "").strip()
            if not title:
                continue
            if group not in result:
                result[group] = []
                seen[group] = set()
            if title not in seen[group]:
                result[group].append(title)
                seen[group].add(title)
        return result

    # ======================================================================
    # Summary / notes
    # ======================================================================
    @staticmethod
    def _format_count(value) -> str:
        """
        Formats a statistic with thousands separators.

        Numbers are formatted directly; numeric strings are converted first;
        anything else is returned as plain text instead of raising.
        """
        if isinstance(value, (int, float)):
            return f"{value:,}"
        try:
            return f"{int(str(value).strip()):,}"
        except ValueError:
            return str(value)

    def _build_summary(self, md: dict, sd: dict,
                       mal_stats: "dict | None") -> "str | None":
        """
        Builds <Summary> as HTML (Kavita supports HTML in this field).

        Structure (top → bottom):
        1. MAL statistics — HTML link + table with padded columns
        2. Series description — Markdown converted to HTML
        3. Alternate titles — HTML table

        Returns None when none of the three parts has content.
        """
        import html  # local import: only this method needs it

        # Inline style applied to label cells for readable column spacing.
        _TD = 'style="padding-right:1.5em"'
        fmt = self._format_count
        parts: list[str] = []
        # 1. MAL stats table (top) ----------------------------------------
        if mal_stats:
            url = mal_stats.get("url", "")
            as_of = mal_stats.get("as_of", "")
            score = mal_stats.get("score")
            rank = mal_stats.get("rank")
            scored = mal_stats.get("scored_by")
            pop = mal_stats.get("popularity")
            members = mal_stats.get("members")
            favs = mal_stats.get("favorites")
            rows: list[str] = []
            if score is not None:
                rows.append(f"<tr><td {_TD}>Score</td><td>{score}</td></tr>")
            if rank is not None:
                rows.append(f"<tr><td {_TD}>Ranked</td><td>#{rank}</td></tr>")
            if scored is not None:
                rows.append(f"<tr><td {_TD}>Scored by</td><td>{fmt(scored)} users</td></tr>")
            if pop is not None:
                rows.append(f"<tr><td {_TD}>Popularity</td><td>#{pop}</td></tr>")
            if members is not None:
                rows.append(f"<tr><td {_TD}>Members</td><td>{fmt(members)}</td></tr>")
            if favs is not None:
                rows.append(f"<tr><td {_TD}>Favorites</td><td>{fmt(favs)}</td></tr>")
            if rows:
                link = f'<a href="{url}" target="_blank">MyAnimeList</a>' if url else "MyAnimeList"
                parts.append(f"<p>{link} stats as of {as_of}:</p><table>{''.join(rows)}</table>")
        # 2. Description — Markdown → HTML (middle) -----------------------
        desc_raw = (md.get("description") or sd.get("Summary") or "").strip()
        if desc_raw:
            parts.append(_md_to_html(desc_raw))
        # 3. Alternate titles table (bottom) — all variants per language ------
        all_alt = self._collect_all_alt_titles(md)
        if all_alt:
            label_map = {
                "en": "EN",
                "de": "DE",
                "ja": "JA",
                "ja-romaji": "JA Romaji",
                "ko": "KO",
                "ko-romaji": "KO Romaji",
                "zh": "ZH",
                "zh-romaji": "ZH Romaji",
            }
            alt_rows: list[str] = []
            for group in ("en", "de", "ja", "ja-romaji",
                          "ko", "ko-romaji", "zh", "zh-romaji"):
                titles = all_alt.get(group)
                if not titles:
                    continue
                label = label_map[group]
                # Titles are plain text from the API; escape them so "<", ">"
                # and "&" show up literally instead of being read as markup.
                cell = "<br>".join(html.escape(t, quote=False) for t in titles)
                alt_rows.append(f"<tr><td {_TD}>{label}</td><td>{cell}</td></tr>")
            if alt_rows:
                parts.append(f"<table>{''.join(alt_rows)}</table>")
        return "<br>".join(parts) if parts else None

    def _build_notes(self, md: dict) -> "str | None":
        """Builds the <Notes> field with the MangaBaka metadata source URL."""
        series_id = str(md.get("id") or "")
        return f"Metadata source: https://mangabaka.org/{series_id}" if series_id else None

    # ======================================================================
    # Static helpers
    # ======================================================================
    @staticmethod
    def _parse_work_date(work: "dict | None") -> tuple:
        """
        Returns (year, month, day) strings from a MangaBaka work dict.

        Reads `release_date`, then `publication_date`. Components that are
        missing come back as None. Only the calendar-date part is used: a
        time suffix such as "T00:00:00Z" is cut off so it cannot end up in
        the day component.
        """
        if not work:
            return (None, None, None)
        raw = (work.get("release_date") or work.get("publication_date") or "")
        if not raw:
            return (None, None, None)
        date_part = re.split(r"[T\s]", str(raw).strip(), maxsplit=1)[0]
        parts = date_part.split("-")
        year = parts[0] if len(parts) > 0 and parts[0] else None
        month = parts[1] if len(parts) > 1 and parts[1] else None
        day = parts[2] if len(parts) > 2 and parts[2] else None
        return (year, month, day)

    @staticmethod
    def _iter_source_ids(md: dict, aliases: tuple):
        """
        Yields the non-None `id` of every `source` entry whose normalised
        key is one of `aliases`, in the order the entries appear.
        """
        for raw_key, info in (md.get("source") or {}).items():
            if _normalise_key(raw_key) not in aliases:
                continue
            if isinstance(info, dict) and info.get("id") is not None:
                yield info["id"]

    @staticmethod
    def _mangadex_id_from_source(md: dict) -> "str | None":
        """Returns the MangaDex ID from the series' source map as str, or None."""
        for source_id in ComicInfoBuilder._iter_source_ids(
                md, ("mangadex", "mangadexorg", "md")):
            return str(source_id)
        return None

    @staticmethod
    def _mal_id_from_source(md: dict) -> "int | None":
        """Returns the MyAnimeList ID from the series' source map as int, or None."""
        for source_id in ComicInfoBuilder._iter_source_ids(
                md, ("myanimelist", "mal")):
            try:
                return int(source_id)
            except (TypeError, ValueError):
                # Not numeric — try the next matching entry, if any.
                continue
        return None

    @staticmethod
    def _al_id_from_source(md: dict) -> "int | None":
        """Returns the AniList ID from the series' source map as int, or None."""
        for source_id in ComicInfoBuilder._iter_source_ids(md, ("anilist",)):
            try:
                return int(source_id)
            except (TypeError, ValueError):
                continue
        return None

    @staticmethod
    def _publishers_by_type(md: dict, ptype: str) -> "str | None":
        """Joins the names of all publishers whose `type` equals `ptype`."""
        names = [p.get("name") for p in (md.get("publishers") or [])
                 if p.get("type") == ptype and p.get("name")]
        return ", ".join(names) if names else None

    @staticmethod
    def _manga_flag(md: dict) -> str:
        """Maps the series `type` to the ComicInfo <Manga> value."""
        mtype = (md.get("type") or "").lower()
        if mtype == "manga":
            return "YesAndRightToLeft"
        if mtype in ("manhwa", "manhua", "oel"):
            return "Yes"
        return "Unknown"

    def _collect_web_links(self, md: dict, sd: dict) -> list[str]:
        """
        Collects the <Web> links: MangaBaka `links`, tracker URLs derived
        from the `source` map, and links from the existing ComicInfo.xml.
        Duplicates are dropped while keeping first-seen order.
        """
        links: list[str] = []
        links.extend(l for l in (md.get("links") or []) if l)
        for raw_key, info in (md.get("source") or {}).items():
            template = _TRACKER_URL_TEMPLATES.get(_normalise_key(raw_key))
            if not template or not isinstance(info, dict):
                continue
            source_id = info.get("id")
            if source_id is not None:
                links.append(template.format(id=source_id))
        if sd.get("Web"):
            links.extend(str(sd["Web"]).split())
        seen: set[str] = set()
        unique: list[str] = []
        for link in links:
            if link not in seen:
                seen.add(link)
                unique.append(link)
        return unique

    @staticmethod
    def _read_existing_comicinfo(folder: Path) -> dict:
        """
        Reads selected fields from an existing ComicInfo.xml in `folder`.

        Returns {} when the file is missing or cannot be parsed as XML.
        """
        xml_path = folder / "ComicInfo.xml"
        if not xml_path.is_file():
            return {}
        try:
            root = ET.parse(xml_path).getroot()
        except ET.ParseError:
            return {}
        wanted = {"Title", "Series", "Number", "Summary", "Writer",
                  "Penciller", "Translator", "Genre", "Web",
                  "Year", "Month", "Day"}
        data: dict = {}
        for child in root:
            # Drop a "{namespace}" prefix, if the element carries one.
            tag = child.tag.split("}")[-1]
            if tag in wanted and child.text and child.text.strip():
                data[tag] = child.text.strip()
        return data

    @staticmethod
    def _image_dimensions(path: Path):
        """Returns (width, height) via Pillow, or (None, None) if unavailable."""
        if not _HAS_PIL:
            return (None, None)
        try:
            with Image.open(path) as im:
                return im.size
        except Exception:
            return (None, None)
# --------------------------------------------------------------------------
# Module-level helpers (shared with MangaBakaWorksResolver logic)
# --------------------------------------------------------------------------
def _pick_cover_url(cover) -> "str | None":
"""
Selects the best cover URL from a MangaBaka cover object.
Real API shape (from `GET /v1/series/{id}` and `/works`):
{
"raw": {"url": "...", "size": ..., "height": ..., "width": ...},
"x150": {"x1": "...", "x2": "...", "x3": "..."},
"x250": {"x1": "...", "x2": "...", "x3": "..."},
"x350": {"x1": "...", "x2": "...", "x3": "..."}
}
Order of preference: raw original > x350@x3 > x250@x3 > x150@x3
(falling through to lower densities and sizes as needed).
"""
if not cover:
return None
if isinstance(cover, str):
return cover
if not isinstance(cover, dict):
return None
# 1) Preferred: the unscaled "raw" image
raw = cover.get("raw")
if isinstance(raw, dict):
url = raw.get("url")
if isinstance(url, str) and url:
return url
elif isinstance(raw, str) and raw:
return raw
# 2) Fallback: size-keyed variants, largest first, highest density first
for size_key in ("x350", "x250", "x150"):
variant = cover.get(size_key)
if isinstance(variant, dict):
for density in ("x3", "x2", "x1"):
url = variant.get(density)
if isinstance(url, str) and url:
return url
elif isinstance(variant, str) and variant:
return variant
# 3) Last-ditch fallback: any http URL anywhere in the structure
for val in cover.values():
if isinstance(val, str) and val.startswith("http"):
return val
if isinstance(val, dict):
for sub in val.values():
if isinstance(sub, str) and sub.startswith("http"):
return sub
return None
def _guess_extension(url: str, content_type: str) -> str:
    """
    Returns the file extension (with leading dot) for a downloaded cover.

    The URL's own suffix wins when it is a known image extension; the query
    string and fragment are ignored for that check. Otherwise the
    Content-Type header decides, and ".jpg" is the final fallback.
    """
    path_part = url.split("?")[0].split("#")[0]
    url_ext = Path(path_part).suffix.lower()
    if url_ext in _IMAGE_EXTS:
        return url_ext
    ct = (content_type or "").lower()
    # Substring checks, so "image/png; charset=..." style values match too.
    for marker in ("png", "webp", "gif", "avif", "bmp"):
        if marker in ct:
            return f".{marker}"
    return ".jpg"
# --------------------------------------------------------------------------
# Usage example
# --------------------------------------------------------------------------
if __name__ == "__main__":
    # Chapter folder holding the page images; the XML is written next to them.
    chapter_dir = (r"\\192.168.2.2\root\Temp\managdl\mangas\ComicK Fanmade (EN)"
                   r"\Yofukashi no Uta\Official_Chapter 66")
    builder = ComicInfoBuilder("Yofukashi no Uta", 66)
    builder.add_pages_from_folder(chapter_dir)
    builder.save_xml(chapter_dir + r"\ComicInfo.xml")
    # Setter behaviour:
    # builder.chapter = 2          # only results discarded, metadata is kept
    # builder.manga_title = "X"    # metadata + results discarded