def make_session(accept_json: bool = True) -> requests.Session:
    """Build a ``requests.Session`` that identifies us on every request.

    The session's default headers always carry the shared ``User-Agent``
    (``HTTP_USER_AGENT``), so callers never have to set it themselves.

    Args:
        accept_json: When true (the default), also send
            ``Accept: application/json``. Pass False for HTML/binary fetches.

    Returns:
        requests.Session: A new session with the default headers applied.
    """
    sess = requests.Session()
    # Session headers are a case-insensitive mapping; assigning a key
    # overrides the library default for that header.
    sess.headers['User-Agent'] = HTTP_USER_AGENT
    if accept_json:
        sess.headers['Accept'] = 'application/json'
    return sess
def parse_wikipedia_url(wikipedia_url: str):
    """Split a Wikipedia article URL into its API endpoint and page title.

    The API endpoint is built from the URL's own host, so language
    subdomains are honored (a de.wikipedia.org URL yields the
    de.wikipedia.org API).

    Args:
        wikipedia_url: Article URL of the form ``<scheme>://<host>/wiki/<Title>``.
            Protocol-relative URLs (``//host/wiki/Title``) are accepted and
            treated as https.

    Returns:
        tuple: ``(api_url, page_title)`` with the title percent-decoded, or
        ``(None, None)`` when the input is not a string, has no host, has no
        ``/wiki/`` path segment, has an empty title, or cannot be parsed.
    """
    if not isinstance(wikipedia_url, str):
        return None, None

    try:
        parsed = urlparse(wikipedia_url.strip())
    except ValueError:
        # urlparse raises ValueError for malformed hosts (e.g. a bad IPv6 literal).
        return None, None

    if not parsed.netloc or '/wiki/' not in parsed.path:
        return None, None

    # urlparse has already moved any '#fragment' and '?query' out of .path,
    # so the only clean-up left is percent-decoding.
    title = unquote(parsed.path.split('/wiki/', 1)[1])
    if not title:
        return None, None

    # A protocol-relative URL carries no scheme; without a default the
    # endpoint would come out as '://host/w/api.php'.
    scheme = parsed.scheme or 'https'
    return f"{scheme}://{parsed.netloc}/w/api.php", title
def fetch_wikipedia_intro(page_title: str, api_url: str,
                          sentences: int = DEFAULT_SENTENCES,
                          session: requests.Session = None):
    """Fetch the lead-section plain-text extract for a Wikipedia page.

    Issues one MediaWiki ``action=query`` / ``prop=extracts`` request
    (intro only, plain text, redirects followed) against ``api_url``.

    Args:
        page_title: Page title to look up.
        api_url: MediaWiki ``api.php`` endpoint to query.
        sentences: Value sent as ``exsentences``.
        session: Session to issue the request on. When omitted, a session
            from ``make_session()`` is created for this call and closed
            before returning; a caller-supplied session is left open.

    Returns:
        The stripped extract string, or None when the response is not HTTP
        200, the body is not JSON, no page is returned, the page is marked
        missing, or the extract is empty.

    Raises:
        Whatever ``session.get`` raises for network-level failures; those
        are not caught here and propagate to the caller.
    """
    owns_session = session is None
    sess = make_session() if owns_session else session
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        'titles': page_title,
        'redirects': 1,
        'exintro': 1,
        'explaintext': 1,
        'exsentences': sentences,
    }
    try:
        resp = sess.get(api_url, params=params, timeout=REQUEST_TIMEOUT)
        if resp.status_code != 200:
            logger.warning("Wikipedia returned status %s for %s", resp.status_code, page_title)
            return None
        try:
            payload = resp.json()
        except ValueError:
            # A 200 with a non-JSON body (e.g. an HTML error page) is treated
            # like any other "no extract" outcome instead of raising.
            logger.warning("Wikipedia returned a non-JSON body for %s", page_title)
            return None
    finally:
        # Only close what this call created; never a caller's session.
        if owns_session:
            sess.close()

    if not isinstance(payload, dict):
        return None
    pages = (payload.get('query') or {}).get('pages') or {}
    if not pages:
        return None

    # A single title was requested, so only the first returned page matters.
    page = next(iter(pages.values()))
    if 'missing' in page:
        logger.warning("Wikipedia page missing: %s", page_title)
        return None

    extract = (page.get('extract') or '').strip()
    return extract or None
def update_song_wikipedia_intro(song_id: str,
                                sentences: int = DEFAULT_SENTENCES,
                                force_refresh: bool = False,
                                dry_run: bool = False) -> bool:
    """Populate songs.structure with the song's Wikipedia intro.

    Reads ``wikipedia_url``, ``structure`` and ``title`` for the song,
    fetches the lead-section extract for that URL, and stores it in
    ``songs.structure`` (also bumping ``updated_at``).

    A song that already has non-blank structure text is skipped unless
    ``force_refresh`` is set. On a forced refresh, nothing is written when
    the fetched intro matches the stored text.

    Args:
        song_id: UUID of the song.
        sentences: Number of intro sentences to request from MediaWiki.
        force_refresh: Overwrite existing structure text if True.
        dry_run: Log what would happen without writing to the DB.

    Returns:
        bool: True if structure was updated (or would be, in dry-run);
        False when the song is missing, has no usable Wikipedia URL, already
        has text, no intro was returned, the text is unchanged, or an error
        occurred (errors are logged, never raised).
    """
    from db_utils import get_db_connection

    try:
        with get_db_connection() as conn:
            with conn.cursor() as cur:
                cur.execute(
                    "SELECT wikipedia_url, structure, title FROM songs WHERE id = %s",
                    (song_id,),
                )
                row = cur.fetchone()

        if not row:
            return False

        wikipedia_url = row['wikipedia_url']
        existing_text = (row['structure'] or '').strip()
        song_title = row['title']

        if not wikipedia_url:
            logger.debug("Song has no Wikipedia URL, skipping intro update")
            return False

        # Don't clobber an existing intro unless explicitly refreshing.
        if existing_text and not force_refresh:
            logger.debug("Song '%s' already has intro text, skipping", song_title)
            return False

        api_url, page_title = parse_wikipedia_url(wikipedia_url)
        if not api_url:
            logger.warning("Could not parse Wikipedia URL for '%s': %s",
                           song_title, wikipedia_url)
            return False

        # The HTTP call runs after the read connection is released, so no DB
        # connection is held open while waiting on Wikipedia.
        intro = fetch_wikipedia_intro(page_title, api_url, sentences=sentences)
        if not intro:
            logger.debug("No Wikipedia intro returned for '%s'", song_title)
            return False

        # Only reachable on a forced refresh: skip the write (and the
        # updated_at bump) when Wikipedia's text matches what is stored.
        if intro == existing_text:
            logger.debug("Wikipedia intro for '%s' is unchanged, skipping", song_title)
            return False

        if dry_run:
            logger.info("[DRY RUN] Would update intro for '%s' (%d chars)",
                        song_title, len(intro))
            return True

        with get_db_connection() as conn:
            with conn.cursor() as cur:
                cur.execute(
                    "UPDATE songs SET structure = %s, updated_at = CURRENT_TIMESTAMP "
                    "WHERE id = %s",
                    (intro, song_id),
                )
            conn.commit()

        logger.info("✓ Updated Wikipedia intro for '%s' (%d chars)",
                    song_title, len(intro))
        return True

    except requests.RequestException as e:
        logger.error("Wikipedia request error updating intro for song %s: %s", song_id, e)
        return False
    except Exception as e:
        # Best-effort step: never raise into the research pipeline, but keep
        # the traceback so unexpected failures can be diagnosed.
        logger.exception("Error updating Wikipedia intro for song %s: %s", song_id, e)
        return False
def _is_uuid(value):
    """Return True if ``str(value)`` parses as a UUID (MB work IDs are UUIDs)."""
    import uuid as _uuid
    try:
        _uuid.UUID(str(value))
    except (ValueError, AttributeError, TypeError):
        return False
    return True


@admin_bp.route('/musicbrainz/work/<work_id>/lookup', methods=['GET'])
def musicbrainz_work_lookup(work_id):
    """Look up a MusicBrainz work by ID and return its title + creators.

    Backs the inline MB Work ID editor on the song detail page: the admin
    types an ID, the work is fetched through MusicBrainzSearcher, and the
    title and composer/writer/lyricist credits are echoed back so the
    change can be checked before saving.

    Args:
        work_id: MusicBrainz work ID taken from the URL path.

    Returns:
        JSON ``{id, title, composers: [{name, type}, ...]}`` on success;
        400 if ``work_id`` is not a UUID, 502 if the lookup raises, 404 if
        the lookup returns nothing.
    """
    work_id = (work_id or '').strip()
    if not _is_uuid(work_id):
        return jsonify({'error': 'Not a valid MusicBrainz work ID (expected a UUID).'}), 400

    try:
        work_data = MusicBrainzSearcher().get_work_recordings(work_id)
    except Exception as e:
        logger.error("MB work lookup failed for %s: %s", work_id, e)
        return jsonify({'error': 'MusicBrainz lookup failed. Try again.'}), 502

    if not work_data:
        return jsonify({'error': 'No MusicBrainz work found for that ID.'}), 404

    # Collect composer/writer/lyricist credits from the work's relations,
    # keeping first-seen order and de-duplicating by artist name.
    creators = []
    seen = set()
    # `or []` covers a present-but-null 'relations' value as well as a missing key.
    for relation in work_data.get('relations') or []:
        rel_type = relation.get('type')
        if rel_type not in ('composer', 'writer', 'lyricist'):
            continue
        name = (relation.get('artist') or {}).get('name')
        if name and name not in seen:
            seen.add(name)
            creators.append({'name': name, 'type': rel_type})

    return jsonify({
        'id': work_data.get('id') or work_id,
        'title': work_data.get('title'),
        'composers': creators,
    })
@admin_bp.route('/songs/<song_id>/mb-id', methods=['POST'])
def songs_update_mb_id(song_id):
    """Set or clear a song's primary or secondary MusicBrainz work ID.

    Body (JSON):
        slot:  'primary' | 'second'   (required)
        mb_id: UUID string, or '' / null to clear the slot.

    Args:
        song_id: Song ID taken from the URL path.

    Returns:
        JSON ``{success, slot, mb_id}`` on success; 400 for an unknown slot
        or a value that is not a UUID string; 404 when the song ID is not a
        UUID or no such song exists.
    """
    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        # Valid JSON that is not an object (list, string, number) has no fields.
        body = {}

    slot = body.get('slot')
    slot = slot.strip() if isinstance(slot, str) else ''
    # Whitelist mapping: the column name below is only ever one of these
    # two literals, never request text.
    column = {'primary': 'musicbrainz_id', 'second': 'second_mb_id'}.get(slot)
    if not column:
        return jsonify({'error': "slot must be 'primary' or 'second'"}), 400

    raw = body.get('mb_id')
    if raw and not isinstance(raw, str):
        # Truthy non-strings (numbers, lists, objects) cannot be a work ID.
        return jsonify({'error': 'Not a valid MusicBrainz work ID (expected a UUID).'}), 400
    mb_id = (raw or '').strip() or None
    if mb_id is not None and not _is_uuid(mb_id):
        return jsonify({'error': 'Not a valid MusicBrainz work ID (expected a UUID).'}), 400

    # A malformed song ID cannot match any row; answer 404 without sending
    # it to the database.
    if not _is_uuid(song_id):
        return jsonify({'error': 'Song not found'}), 404

    with get_db_connection() as conn:
        with conn.cursor() as cur:
            cur.execute(
                f"UPDATE songs SET {column} = %s, updated_at = CURRENT_TIMESTAMP "
                "WHERE id = %s RETURNING id",
                (mb_id, song_id),
            )
            if cur.fetchone() is None:
                conn.rollback()
                return jsonify({'error': 'Song not found'}), 404
        conn.commit()

    logger.info("admin set %s=%s on song %s", column, mb_id, song_id)
    return jsonify({'success': True, 'slot': slot, 'mb_id': mb_id})
@admin_bp.route('/songs/<song_id>/alt-titles', methods=['POST'])
def songs_update_alt_titles(song_id):
    """Replace a song's alternate-title list (songs.alt_titles).

    Body (JSON): { alt_titles: ["...", ...] }. Entries are trimmed, blanks
    and non-string items dropped, duplicates removed (case-insensitive,
    first spelling wins). An empty list clears the column to NULL.

    Args:
        song_id: Song ID taken from the URL path.

    Returns:
        JSON ``{success, alt_titles}`` with the cleaned list on success;
        400 when ``alt_titles`` is not a list; 404 when the song ID is not a
        UUID or no such song exists.
    """
    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        # Valid JSON that is not an object (list, string, number) has no fields.
        body = {}

    raw = body.get('alt_titles')
    if not isinstance(raw, list):
        return jsonify({'error': 'alt_titles must be a list of strings'}), 400

    cleaned = []
    seen = set()
    for item in raw:
        title = item.strip() if isinstance(item, str) else ''
        if not title:
            continue
        key = title.lower()
        if key in seen:
            continue
        seen.add(key)
        cleaned.append(title)

    stored = cleaned or None  # empty list -> NULL

    # A malformed song ID cannot match any row; answer 404 without sending
    # it to the database.
    if not _is_uuid(song_id):
        return jsonify({'error': 'Song not found'}), 404

    with get_db_connection() as conn:
        with conn.cursor() as cur:
            cur.execute(
                "UPDATE songs SET alt_titles = %s, updated_at = CURRENT_TIMESTAMP "
                "WHERE id = %s RETURNING id",
                (stored, song_id),
            )
            if cur.fetchone() is None:
                conn.rollback()
                return jsonify({'error': 'Song not found'}), 404
        conn.commit()

    logger.info("admin set alt_titles=%s on song %s", cleaned, song_id)
    return jsonify({'success': True, 'alt_titles': cleaned})
diff --git a/backend/templates/admin/browse_song_detail.html b/backend/templates/admin/browse_song_detail.html index 8b140a7..27f8e7b 100644 --- a/backend/templates/admin/browse_song_detail.html +++ b/backend/templates/admin/browse_song_detail.html @@ -163,6 +163,51 @@ margin-left: 4px; } .empty { color: #888; padding: 24px; text-align: center; background: #fff; border: 1px solid #e0e0e0; border-radius: 8px; } + + /* Inline editors (MB IDs + alt titles) */ + .link-btn { + background: none; border: none; color: #0066cc; cursor: pointer; + font-size: 12px; padding: 0; margin-left: 8px; font-family: inherit; + } + .link-btn:hover { text-decoration: underline; } + .btn { + font-size: 12px; padding: 4px 10px; border-radius: 6px; + border: 1px solid #d0d0d0; background: #fff; cursor: pointer; font-family: inherit; + } + .btn:hover { background: #f5f5f7; } + .btn-primary { background: #0066cc; border-color: #0066cc; color: #fff; } + .btn-primary:hover { background: #0052a3; } + .btn-primary:disabled { background: #b0c4de; border-color: #b0c4de; cursor: not-allowed; } + .btn-danger { color: #cc0000; border-color: #e0b4b4; } + .btn-danger:hover { background: #fdf2f2; } + .mb-input, .alt-input { + font-family: 'SF Mono', Monaco, monospace; font-size: 12px; + padding: 5px 8px; border: 1px solid #ccc; border-radius: 6px; + width: 340px; max-width: 100%; + } + .alt-input { font-family: inherit; } + .mb-editor { margin-top: 8px; } + .editor-row { display: flex; gap: 8px; align-items: center; flex-wrap: wrap; } + .mb-result { margin-top: 8px; font-size: 12px; min-height: 1px; } + .mb-result .ok { color: #1a7f37; font-weight: 600; } + .mb-result .err { color: #cc0000; } + .mb-result dl { display: grid; grid-template-columns: max-content 1fr; gap: 2px 10px; margin-top: 4px; } + .mb-result dt { color: #888; } + .editor-actions { display: flex; gap: 8px; margin-top: 10px; align-items: center; } + + .alt-titles { margin: 2px 0 12px; display: flex; align-items: center; flex-wrap: wrap; 
gap: 6px; } + .alt-label { color: #888; font-size: 12px; } + .alt-chip { + display: inline-flex; align-items: center; gap: 4px; + background: #eef1f4; border-radius: 12px; padding: 3px 4px 3px 10px; + font-size: 12px; color: #333; + } + .alt-chip button { + background: none; border: none; cursor: pointer; color: #999; + font-size: 15px; line-height: 1; padding: 0 4px; font-family: inherit; + } + .alt-chip button:hover { color: #cc0000; } + [hidden] { display: none !important; } @@ -174,39 +219,74 @@ Song -
+

{{ song.title }}

{% if song.composer %}
{{ song.composer }}
{% endif %} + + {# Alternate titles, editable. JS owns the chip rendering from the + initial array below so display and editor stay in sync. #} +
+ Alt titles: + + +
+ +
Song ID
{{ song.id }}
-
MB Work ID
-
- {% if song.musicbrainz_id %} - {{ song.musicbrainz_id }} - {% else %}{% endif %} -
- - {% if song.second_mb_id %} -
Secondary MB Work ID
+ {# Primary + secondary MB work IDs, each with an inline, + verify-before-save editor (see JS at the bottom). #} + {% for slot, label, value in [ + ('primary', 'MB Work ID', song.musicbrainz_id), + ('second', 'Secondary MB Work ID', song.second_mb_id) + ] %} +
{{ label }}
- {{ song.second_mb_id }} + + {% if value %} + {{ value }} + {% else %}{% endif %} + + +
- {% endif %} + {% endfor %} {% if song.composed_year or song.composed_key %}
Composed
{{ song.composed_year or '' }}{% if song.composed_key %} · key {{ song.composed_key }}{% endif %}
{% endif %} - {% if song.alt_titles %} -
Alt Titles
-
{{ song.alt_titles | join(' · ') }}
- {% endif %} - {% if song.wikipedia_url %}
Wikipedia
{{ song.wikipedia_url }}
@@ -214,6 +294,8 @@

{{ song.title }}

+ +

Recordings ({{ recordings | length }})

Click a column header to sort. Click a row to open the recording. @@ -338,5 +420,202 @@

Recordings ({{ recordings | length }})

}); })(); + +