From 06d6354d29ae42aebb57d53ae91e8da273c4f9601eaff42d9ba9b6ebcca3b18b Mon Sep 17 00:00:00 2001 From: JohannesBOT Date: Fri, 12 Jun 2026 09:53:25 +0200 Subject: [PATCH] improvements --- .gitea/workflows/release.yml | 34 ++++++ main.py | 11 +- src/AniListResolver.py | 19 +-- src/KavitaClient.py | 61 ++++++++-- src/KavitaPersonUpdater.py | 201 +++++-------------------------- src/LightNovelMetadataBuilder.py | 47 ++++---- src/LightNovelOrchestrator.py | 3 +- src/MALResolver.py | 19 +-- src/MatchesCache.py | 28 ++--- src/MatchesWebApp.py | 25 ++-- src/TextUtils.py | 45 +++++++ 11 files changed, 234 insertions(+), 259 deletions(-) create mode 100644 .gitea/workflows/release.yml create mode 100644 src/TextUtils.py diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml new file mode 100644 index 0000000..83c6d07 --- /dev/null +++ b/.gitea/workflows/release.yml @@ -0,0 +1,34 @@ +name: Release + +on: + push: + tags: + - "v[0-9]+.[0-9]+.[0-9]+" + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + run: | + git clone ${{ github.server_url }}/${{ github.repository }}.git . + git checkout ${{ github.sha }} + + - name: Login to Gitea Registry + run: | + echo "${{ secrets.REGISTRY_PASSWORD }}" | \ + docker login https://gitea.johannesbot.de -u ${{ secrets.REGISTRY_USER }} --password-stdin + + - name: Build Image + run: | + VERSION="${GITHUB_REF_NAME#v}" + docker build \ + -t gitea.johannesbot.de/johannesbot/kavita-lightnovel-metadata-fetcher:${VERSION} \ + -t gitea.johannesbot.de/johannesbot/kavita-lightnovel-metadata-fetcher:${GITHUB_REF_NAME} \ + . + + - name: Push Image + run: | + VERSION="${GITHUB_REF_NAME#v}" + docker push gitea.johannesbot.de/johannesbot/kavita-lightnovel-metadata-fetcher:${VERSION} + docker push gitea.johannesbot.de/johannesbot/kavita-lightnovel-metadata-fetcher:${GITHUB_REF_NAME} diff --git a/main.py b/main.py index 5fdf5d0..584c53d 100644 --- a/main.py +++ b/main.py @@ -38,12 +38,15 @@ try: except ImportError: pass -# Make src/ importable when running as `python main.py`. +# Make src/ importable when running as `python main.py`. Import the +# modules by their plain names (not `src.X`) so they are the same module +# objects the src-internal imports resolve to — `src.X` would load every +# module twice under two names. sys.path.insert(0, str(Path(__file__).resolve().parent / "src")) -from src.MatchesCache import MatchesCache # noqa: E402 -from src.LightNovelOrchestrator import LightNovelOrchestrator # noqa: E402 -from src.MatchesWebApp import MatchesWebApp # noqa: E402 +from MatchesCache import MatchesCache # noqa: E402 +from LightNovelOrchestrator import LightNovelOrchestrator # noqa: E402 +from MatchesWebApp import MatchesWebApp # noqa: E402 def _env_str(name: str, default: "str | None" = None, diff --git a/src/AniListResolver.py b/src/AniListResolver.py index 10591c8..9725e60 100644 --- a/src/AniListResolver.py +++ b/src/AniListResolver.py @@ -32,12 +32,12 @@ Dependencies from __future__ import annotations import datetime -import difflib import time import requests from MediaResolver import MediaResolver +from TextUtils import best_similarity # -------------------------------------------------------------------------- @@ -469,18 +469,11 @@ class AniListResolver(MediaResolver): def _score_title(query: str, entry: dict) -> float: """Returns the best title-similarity score for an AniList media entry.""" title_obj = entry.get("title") or {} - candidates = [ - title_obj.get("romaji") or "", - title_obj.get("english") or "", - title_obj.get("native") or "", - ] - best = 0.0 - q = query.lower() - for t in candidates: - if t: - ratio = difflib.SequenceMatcher(None, q, t.lower()).ratio() - best = max(best, ratio) - return best + return best_similarity(query, ( + title_obj.get("romaji"), + title_obj.get("english"), + title_obj.get("native"), + )) # -------------------------------------------------------------------------- diff --git a/src/KavitaClient.py b/src/KavitaClient.py index fd0e77a..11463a1 100644 --- a/src/KavitaClient.py +++ b/src/KavitaClient.py @@ -194,25 +194,56 @@ class KavitaClient: return {} # ------------------------------------------------------------------ - # Series cover upload + # Persons + # ------------------------------------------------------------------ + def search_persons(self, name: str) -> list[dict]: + """Returns PersonDto entries matching `name` (Kavita's own search).""" + r = self._session.get( + f"{self._base}/api/Person/search", + params={"queryString": name}, timeout=self._timeout) + r.raise_for_status() + return r.json() or [] + + def update_person(self, payload: dict) -> None: + """Writes a person record (malId, aniListId, description, …).""" + r = self._session.post(f"{self._base}/api/Person/update", + json=payload, timeout=self._timeout) + r.raise_for_status() + + # ------------------------------------------------------------------ + # Cover uploads # ------------------------------------------------------------------ def upload_series_cover(self, series_id: int, image_url: str, *, lock: bool = False) -> None: - """ - Downloads an external image and uploads it as the series cover. + """Downloads an external image and uploads it as the series cover.""" + self._upload_cover("/api/Upload/series", series_id, image_url, lock) - Mirrors the cover-upload trick used in KavitaPersonUpdater: - Kavita's `/api/Upload/series` accepts a raw base64 blob (no - ``data:`` prefix) in the ``url`` field. + def upload_person_cover(self, person_id: int, image_url: str, *, + lock: bool = False) -> None: + """Downloads an external image and uploads it as a person cover.""" + self._upload_cover("/api/Upload/person", person_id, image_url, lock) + + def _upload_cover(self, endpoint: str, entity_id: int, + image_url: str, lock: bool) -> None: + """ + Shared cover-upload path. Kavita's upload endpoints accept a raw + base64 blob (no ``data:`` prefix) in the ``url`` field — a data + URI or the two-step upload-by-url flow are rejected with HTTP 400 + (verified against Kavita 0.9.0.2). """ img = self._image_session.get(image_url, timeout=self._timeout) img.raise_for_status() b64 = base64.b64encode(img.content).decode() r = self._session.post( - f"{self._base}/api/Upload/series", - json={"id": series_id, "url": b64, "lockCover": lock}, + f"{self._base}{endpoint}", + json={"id": entity_id, "url": b64, "lockCover": lock}, timeout=self._timeout) - r.raise_for_status() + if r.status_code >= 400: + # Include the body excerpt — Kavita's upload errors carry the + # actual reason there, not in the status line. + raise requests.HTTPError( + f"{endpoint} HTTP {r.status_code}: {_short_body(r)}", + response=r) # ------------------------------------------------------------------ # Generic GET helper (used by callers that need a response object) @@ -227,3 +258,15 @@ class KavitaClient: return self._session.post(f"{self._base}{path}", json=json, params=params, timeout=self._timeout) + + +def _short_body(resp: requests.Response, limit: int = 400) -> str: + """Returns the response body trimmed to `limit` chars for error messages.""" + try: + text = resp.text or "" + except Exception: + return "" + text = text.strip().replace("\n", " ").replace("\r", " ") + if len(text) > limit: + text = text[:limit] + "…" + return text or "" diff --git a/src/KavitaPersonUpdater.py b/src/KavitaPersonUpdater.py index a1fbee1..1a65692 100644 --- a/src/KavitaPersonUpdater.py +++ b/src/KavitaPersonUpdater.py @@ -15,46 +15,22 @@ the updater: an 'about' text (requires an extra Jikan request per character; only performed when update_descriptions=True). -Kavita API version ------------------- +All HTTP traffic to Kavita goes through the shared :class:`KavitaClient` +(`/api/Person/search`, `/api/Person/update`, `/api/Upload/person`). + Tested against Kavita 0.9.0.2. - -Authentication --------------- -Uses the `x-api-key` header (API key from Kavita user settings). -No JWT login is required. - -Relevant endpoints (Kavita 0.9.0.2) -------------------------------------- - GET /api/Person/search find persons by name / alias - POST /api/Person/update write metadata (malId, description, …) - POST /api/Upload/person set cover image (base64 data URI) - POST /api/Upload/upload-by-url download an external URL to temp storage - (used as an alternative upload path) - -Cover upload flow ------------------ -The image is downloaded locally, base64-encoded, and sent as a data URI -to POST /api/Upload/person. This is more reliable than the -upload-by-url → upload/person two-step because it avoids Kavita's temp -file handling (which had known issues in 0.8.x – 0.9.x, GitHub #3900). - -Dependencies ------------- - requests -> pip install requests """ from __future__ import annotations -import base64 import datetime -import difflib -import re import requests +from KavitaClient import KavitaClient from MALResolver import MALResolver from AniListResolver import AniListResolver +from TextUtils import best_similarity, paragraphs_to_html class KavitaPersonUpdater: @@ -63,41 +39,22 @@ class KavitaPersonUpdater: Parameters ---------- - kavita_base_url : Base URL of the Kavita server, e.g. "http://192.168.2.2:5000" - api_key : Kavita API key (Settings → User → API key) + client : Shared KavitaClient (session, auth, cover uploads) mal_resolver : Shared MALResolver singleton (created automatically if omitted) - request_timeout : HTTP timeout in seconds for both Kavita and image requests + al_resolver : Shared AniListResolver singleton (created automatically if omitted) min_name_score : Minimum difflib similarity ratio (0–1) required to accept a Kavita person as a match for a MAL name. Default 0.80. """ - def __init__(self, kavita_base_url: str, api_key: str, *, + def __init__(self, client: KavitaClient, *, mal_resolver: "MALResolver | None" = None, al_resolver: "AniListResolver | None" = None, - request_timeout: int = 30, min_name_score: float = 0.80): - self._base = kavita_base_url.rstrip("/") - self._timeout = request_timeout + self._client = client self._min_score = min_name_score self._mal = mal_resolver or MALResolver() self._al = al_resolver or AniListResolver() - # Session used for Kavita API calls. - self._session = requests.Session() - self._session.headers.update({ - "x-api-key": api_key, - "Content-Type": "application/json", - "Accept": "application/json", - }) - - # Plain session used to download external images (MAL CDN etc.). - # Must NOT carry the Kavita API headers — Accept: application/json - # would prevent MAL CDN from returning the image bytes. - self._image_session = requests.Session() - self._image_session.headers.update({ - "User-Agent": "KavitaPersonUpdater/1.0", - }) - # Cache: normalised name -> list of PersonDto dicts (best matches first) self._person_search_cache: dict[str, list[dict]] = {} @@ -230,29 +187,17 @@ class KavitaPersonUpdater: return self._person_search_cache[key] try: - resp = self._session.get( - f"{self._base}/api/Person/search", - params={"queryString": name}, - timeout=self._timeout, - ) - resp.raise_for_status() - persons: list[dict] = resp.json() or [] + persons = self._client.search_persons(name) except requests.RequestException: self._person_search_cache[key] = [] return [] - def score(p: dict) -> float: - candidates = [p.get("name") or ""] - candidates += [a for a in (p.get("aliases") or []) if a] - best = 0.0 - q = key - for c in candidates: - r = difflib.SequenceMatcher(None, q, c.lower()).ratio() - best = max(best, r) - return best - - ranked = sorted(persons, key=score, reverse=True) - filtered = [p for p in ranked if score(p) >= self._min_score] + scored = [] + for p in persons: + candidates = [p.get("name")] + list(p.get("aliases") or []) + scored.append((best_similarity(key, candidates), p)) + scored.sort(key=lambda pair: pair[0], reverse=True) + filtered = [p for score, p in scored if score >= self._min_score] self._person_search_cache[key] = filtered return filtered @@ -323,12 +268,7 @@ class KavitaPersonUpdater: "aniListId": al_id if needs_al_id else (current_al_id or None), } try: - resp = self._session.post( - f"{self._base}/api/Person/update", - json=payload, - timeout=self._timeout, - ) - resp.raise_for_status() + self._client.update_person(payload) changed = True except requests.RequestException as e: if errors is not None: @@ -350,88 +290,21 @@ class KavitaPersonUpdater: and bool(person.get("coverImage")) ) if image_url and not already_uploaded: - if self._upload_cover(person_id, image_url, - person_name=person_name, - errors=errors): + try: + self._client.upload_person_cover(person_id, image_url) changed = True + except requests.RequestException as e: + if errors is not None: + errors.append( + f"cover upload failed for #{person_id} " + f"'{person_name}' ({image_url}): {e}") return changed - # ------------------------------------------------------------------ - # Internal: cover upload - # ------------------------------------------------------------------ - def _upload_cover(self, person_id: int, image_url: str, - lock: bool = False, *, - person_name: str = "", - errors: "list | None" = None) -> bool: - """ - Uploads a cover image to a Kavita person. - - The image is downloaded with the plain (header-less) image session - and posted to `POST /api/Upload/person` as a raw base64 string in - the `url` field. - - Notes on protocol quirks discovered against Kavita 0.9.0.2: - - The two-step `upload-by-url` -> `Upload/person` flow returns - "Unable to save cover image to Person" (HTTP 400). - - A `data:image/jpeg;base64,...` data URI is rejected with the - same error. - - Only the raw base64 blob (no prefix) is accepted. - """ - label = (f"#{person_id} '{person_name}'" - if person_name else f"#{person_id}") - - # 1) Download the image with a clean session — the Kavita session's - # `Accept: application/json` header makes some CDNs refuse to - # return image bytes. - try: - img_resp = self._image_session.get(image_url, - timeout=self._timeout) - img_resp.raise_for_status() - except requests.RequestException as e: - if errors is not None: - errors.append( - f"image download failed for {label} ({image_url}): {e}") - return False - - b64 = base64.b64encode(img_resp.content).decode() - - # 2) POST the raw base64 blob. - try: - resp = self._session.post( - f"{self._base}/api/Upload/person", - json={"id": person_id, "url": b64, "lockCover": lock}, - timeout=self._timeout, - ) - if resp.status_code >= 400: - if errors is not None: - errors.append( - f"Upload/person HTTP {resp.status_code} for {label}: " - f"{_short_body(resp)}") - return False - return True - except requests.RequestException as e: - if errors is not None: - errors.append( - f"Upload/person failed for {label}: {e}") - return False - # -------------------------------------------------------------------------- # Module helpers: description builders # -------------------------------------------------------------------------- -def _plain_to_html(text: str) -> str: - """Converts plain text with paragraph breaks to compact HTML (no raw \\n).""" - if not text: - return "" - parts: list[str] = [] - for para in re.split(r"\n{2,}", text.strip()): - para = para.strip() - if para: - parts.append(f"

{para.replace(chr(10), '
')}

") - return "".join(parts) - - def _format_birthday(birthday: str) -> str: """Converts an ISO 8601 birthday string to "D Month YYYY".""" if not birthday: @@ -457,7 +330,7 @@ def _build_character_description(details: dict) -> str: parts.append(f'

Favorites: {favorites:,}

') about = (details.get("about") or "").strip() if about: - parts.append(_plain_to_html(about)) + parts.append(paragraphs_to_html(about)) return "
".join(parts) @@ -501,33 +374,19 @@ def _build_person_description(details: dict) -> str: parts.append(f'{"".join(rows)}
') about = (details.get("about") or "").strip() if about: - parts.append(_plain_to_html(about)) + parts.append(paragraphs_to_html(about)) return "
".join(parts) -# -------------------------------------------------------------------------- -# Module helper -# -------------------------------------------------------------------------- -def _short_body(resp: requests.Response, limit: int = 400) -> str: - """Returns the response body trimmed to `limit` chars for error logging.""" - try: - text = resp.text or "" - except Exception: - return "" - text = text.strip().replace("\n", " ").replace("\r", " ") - if len(text) > limit: - text = text[:limit] + "…" - return text or "" - - # -------------------------------------------------------------------------- # Usage example # -------------------------------------------------------------------------- if __name__ == "__main__": - KAVITA_URL = "http://192.168.2.2:5000" - KAVITA_KEY = "Sq4a3hcV171dn3gzCl0K4eN7hZNk4sOA" + import os - updater = KavitaPersonUpdater(KAVITA_URL, KAVITA_KEY) + client = KavitaClient(os.environ["KAVITA_URL"], + os.environ["KAVITA_API_KEY"]) + updater = KavitaPersonUpdater(client) mal = MALResolver() mal_id = mal.find_mal_id("よふかしのうた") diff --git a/src/LightNovelMetadataBuilder.py b/src/LightNovelMetadataBuilder.py index 9ba677f..3a89e02 100644 --- a/src/LightNovelMetadataBuilder.py +++ b/src/LightNovelMetadataBuilder.py @@ -26,6 +26,7 @@ from MangaBakaRateLimit import apply_to_session as _apply_mangabaka_rate_limit from MALResolver import MALResolver from AniListResolver import AniListResolver from MatchesCache import MatchesCache +from TextUtils import paragraphs_to_html # MangaBaka series type for the search endpoint. @@ -92,12 +93,7 @@ def _md_to_html(text: str) -> str: ) text = re.sub(r'\*\*(.+?)\*\*', r'\1', text, flags=re.DOTALL) text = re.sub(r'\*(.+?)\*', r'\1', text, flags=re.DOTALL) - parts: list[str] = [] - for para in re.split(r'\n{2,}', text.strip()): - para = para.strip() - if para: - parts.append(f"

{para.replace(chr(10), '
')}

") - return "".join(parts) + return paragraphs_to_html(text) def pick_cover_url(cover) -> "str | None": @@ -220,16 +216,25 @@ class LightNovelMetadataBuilder: return data[0] if data else None def fetch_series(self, series_id) -> "dict | None": - """Returns the full MangaBaka series dict for the given id.""" + """ + Returns the full MangaBaka series dict for the given id, following + ``merged_with`` redirects. A seen-set guards against merge cycles. + """ if series_id is None or str(series_id).strip() == "": return None - url = f"{self.api_base_url}/series/{series_id}" - resp = self._session.get(url, timeout=self.request_timeout) - resp.raise_for_status() - data = resp.json().get("data") - if data and data.get("state") == "merged" and data.get("merged_with"): - return self.fetch_series(data["merged_with"]) - return data + seen: set[str] = set() + current = series_id + while str(current) not in seen: + seen.add(str(current)) + url = f"{self.api_base_url}/series/{current}" + resp = self._session.get(url, timeout=self.request_timeout) + resp.raise_for_status() + data = resp.json().get("data") + if data and data.get("state") == "merged" and data.get("merged_with"): + current = data["merged_with"] + continue + return data + return None # ------------------------------------------------------------------ # Resolve title -> MangaBaka series (caches the match) @@ -316,14 +321,12 @@ class LightNovelMetadataBuilder: # text-only novels). cover_artists = list(md.get("artists") or []) - # Publisher: prefer English licence, else original - publishers = self._publishers_by_type(md, "English") \ - or self._publishers_by_type(md, "Original") - imprint = None - if self._publishers_by_type(md, "English") and \ - self._publishers_by_type(md, "Original"): - imprint = self._publishers_by_type(md, "Original")[0] if \ - self._publishers_by_type(md, "Original") else None + # Publisher: prefer English licence, else original. When both + # exist, the original publisher becomes the imprint. + english_pubs = self._publishers_by_type(md, "English") + original_pubs = self._publishers_by_type(md, "Original") + publishers = english_pubs or original_pubs + imprint = original_pubs[0] if english_pubs and original_pubs else None # Release year release_year = None diff --git a/src/LightNovelOrchestrator.py b/src/LightNovelOrchestrator.py index 376c4c4..bbb10e3 100644 --- a/src/LightNovelOrchestrator.py +++ b/src/LightNovelOrchestrator.py @@ -71,10 +71,9 @@ class LightNovelOrchestrator: ) self._series_updater = KavitaSeriesUpdater(self._client) self._person_updater = KavitaPersonUpdater( - kavita_url, kavita_api_key, + self._client, mal_resolver=self._mal, al_resolver=self._al, - request_timeout=request_timeout, ) self._relation_sync = RelationshipSync( self._client, matches_cache, builder=self._builder) diff --git a/src/MALResolver.py b/src/MALResolver.py index b038704..5934823 100644 --- a/src/MALResolver.py +++ b/src/MALResolver.py @@ -30,12 +30,12 @@ Dependencies from __future__ import annotations import datetime -import difflib import time import requests from MediaResolver import MediaResolver +from TextUtils import best_similarity class MALResolver(MediaResolver): @@ -404,19 +404,12 @@ def _clean_mal_name(name: str) -> str: def _score_title(query: str, entry: dict) -> float: """Returns the best title-similarity score for a Jikan manga entry.""" candidates = [ - entry.get("title") or "", - entry.get("title_english") or "", - entry.get("title_japanese") or "", + entry.get("title"), + entry.get("title_english"), + entry.get("title_japanese"), ] - for alt in (entry.get("titles") or []): - candidates.append(alt.get("title") or "") - best = 0.0 - q = query.lower() - for t in candidates: - if t: - ratio = difflib.SequenceMatcher(None, q, t.lower()).ratio() - best = max(best, ratio) - return best + candidates += [alt.get("title") for alt in (entry.get("titles") or [])] + return best_similarity(query, candidates) # -------------------------------------------------------------------------- diff --git a/src/MatchesCache.py b/src/MatchesCache.py index ff79cae..c7f677f 100644 --- a/src/MatchesCache.py +++ b/src/MatchesCache.py @@ -36,6 +36,14 @@ import time from pathlib import Path +def _set_int(entry: dict, key: str, value) -> None: + """Sets entry[key] = int(value); ignores values that don't coerce.""" + try: + entry[key] = int(value) + except (TypeError, ValueError): + pass + + class MatchesCache: def __init__(self, path): self._path = Path(path) @@ -100,25 +108,13 @@ class MatchesCache: if image_url is not None: entry["imageUrl"] = image_url if kavita_series_id is not None: - try: - entry["kavitaSeriesId"] = int(kavita_series_id) - except (TypeError, ValueError): - pass + _set_int(entry, "kavitaSeriesId", kavita_series_id) if library_id is not None: - try: - entry["libraryId"] = int(library_id) - except (TypeError, ValueError): - pass + _set_int(entry, "libraryId", library_id) if first_match_time is not None: - try: - entry["firstMatchTime"] = int(first_match_time) - except (TypeError, ValueError): - pass + _set_int(entry, "firstMatchTime", first_match_time) if last_update_time is not None: - try: - entry["lastUpdateTime"] = int(last_update_time) - except (TypeError, ValueError): - pass + _set_int(entry, "lastUpdateTime", last_update_time) self._save_unlocked() return dict(entry) diff --git a/src/MatchesWebApp.py b/src/MatchesWebApp.py index 49cbcab..cb70ef1 100644 --- a/src/MatchesWebApp.py +++ b/src/MatchesWebApp.py @@ -39,6 +39,19 @@ from MatchesCache import MatchesCache from LightNovelMetadataBuilder import pick_thumbnail_url +def _int_list(values) -> list[int]: + """Coerces an iterable of mixed values to a list of positive ints.""" + out: list[int] = [] + for v in (values or []): + try: + n = int(v) + except (TypeError, ValueError): + continue + if n > 0: + out.append(n) + return out + + _INDEX_HTML = r""" @@ -628,7 +641,7 @@ class MatchesWebApp: @app.get("/api/matches") def api_list(): raw = request.args.get("libraryIds") or "" - lib_ids = [int(p) for p in raw.split(",") if p.strip().isdigit()] + lib_ids = _int_list(raw.split(",")) if lib_ids: return jsonify(cache.all_in_libraries(lib_ids)) return jsonify(cache.all()) @@ -680,8 +693,7 @@ class MatchesWebApp: if self._orchestrator is None: return Response("no orchestrator configured", status=503) body = request.get_json(silent=True) or {} - library_ids = [int(i) for i in (body.get("libraryIds") or []) - if str(i).strip().lstrip("-").isdigit()] + library_ids = _int_list(body.get("libraryIds")) if not library_ids: return Response("libraryIds required", status=400) @@ -720,12 +732,7 @@ class MatchesWebApp: return Response("no orchestrator configured", status=503) body = request.get_json(silent=True) or {} raw = body.get("libraryIds") - library_ids: "list[int] | None" - if raw is None: - library_ids = None - else: - library_ids = [int(i) for i in raw - if str(i).strip().lstrip("-").isdigit()] + library_ids = None if raw is None else _int_list(raw) label = ("update all (every library)" if library_ids is None else f"update all in libraries {library_ids}") diff --git a/src/TextUtils.py b/src/TextUtils.py new file mode 100644 index 0000000..1924b71 --- /dev/null +++ b/src/TextUtils.py @@ -0,0 +1,45 @@ +""" +text_utils.py +============= + +Small text helpers shared across modules: + +* ``paragraphs_to_html`` — converts plain text with blank-line paragraph + breaks into compact HTML (used for Kavita summary / description fields, + which must not contain raw newlines). +* ``best_similarity`` — best difflib ratio between a query string and a + list of candidate strings (used for title / person-name matching). +""" + +from __future__ import annotations + +import difflib +import re +from typing import Iterable + + +def paragraphs_to_html(text: str) -> str: + """Converts plain text with paragraph breaks to compact HTML (no raw \\n).""" + if not text: + return "" + parts: list[str] = [] + for para in re.split(r"\n{2,}", text.strip()): + para = para.strip() + if para: + parts.append(f"

{para.replace(chr(10), '
')}

") + return "".join(parts) + + +def best_similarity(query: str, candidates: Iterable[str]) -> float: + """ + Returns the best case-insensitive difflib similarity ratio (0..1) + between `query` and any non-empty candidate. + """ + q = (query or "").lower() + best = 0.0 + for candidate in candidates: + if candidate: + ratio = difflib.SequenceMatcher( + None, q, str(candidate).lower()).ratio() + best = max(best, ratio) + return best