dprodger · dprodger · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/backend/core/http_client.py b/backend/core/http_client.py
@@ -0,0 +1,34 @@
+"""Shared HTTP client configuration.
+
+A single home for the outbound User-Agent and a `requests.Session` factory,
+so every crawler/integration identifies us the same way and a version bump
+is a one-line change rather than a sweep across ~20 files.
+
+Most external services (MusicBrainz, Wikipedia/MediaWiki, Cover Art Archive,
+Wikimedia Commons, etc.) expect — and in some cases require — a descriptive
+User-Agent. Use `make_session()` to get a session that already carries it.
+
+Note: this only handles identification/headers, not per-service rate
+limiting. Clients that must throttle (e.g. MusicBrainz) keep their own
+rate-limit logic on top of the session.
+"""
+
+import requests
+
+# Outbound identity sent on every API/crawl request. Bump the version here.
+HTTP_USER_AGENT = "ApproachNote/1.0 (+support@approachnote.com)"
+
+
+def make_session(accept_json: bool = True) -> requests.Session:
+    """Build a new ``requests.Session`` that sends HTTP_USER_AGENT.
+
+    Args:
+        accept_json: When true (the default), also send
+            ``Accept: application/json``; pass False for HTML/binary fetches.
+    """
+    sess = requests.Session()
+    sess.headers['User-Agent'] = HTTP_USER_AGENT
+    if accept_json:
+        sess.headers['Accept'] = 'application/json'
+    # The caller owns the returned session and should close it when done.
+    return sess
diff --git a/backend/core/song_research.py b/backend/core/song_research.py
@@ -23,6 +23,7 @@
 from integrations.musicbrainz.release_importer import MBReleaseImporter
 from db_utils import get_db_connection, execute_query
 from integrations.musicbrainz.utils import MusicBrainzSearcher, update_song_composer, update_song_wikipedia_url, update_song_composed_year
+from integrations.wikipedia.song_intro import update_song_wikipedia_intro
 from core import research_queue, research_jobs
 from core import performer_reference_verification
 logger = logging.getLogger(__name__)
@@ -217,6 +218,14 @@ def progress_callback(phase: str, current: int, total: int):
         if not composed_year_updated:
             logger.debug("Composed year not updated (already set or not found)")
 
+        # Step 1.8: Pull the Wikipedia intro into songs.structure. Runs after
+        # Step 1.6 so it can consume the wikipedia_url just resolved off the
+        # MB work. Idempotent unless force_refresh — see update_song_wikipedia_intro.
+        logger.info("Checking for Wikipedia intro update...")
+        intro_updated = update_song_wikipedia_intro(str(song_id), force_refresh=force_refresh)
+        if not intro_updated:
+            logger.debug("Wikipedia intro not updated (already set, no URL, or not found)")
+
         # Spotify, Apple Music, and YouTube matching all run on the
         # durable research queue (research_worker/handlers/*). Their
         # per-job stats live on the research_jobs row's `result` field —

diff --git a/backend/integrations/wikipedia/song_intro.py b/backend/integrations/wikipedia/song_intro.py
@@ -0,0 +1,171 @@
+"""Wikipedia song-intro fetcher + updater.
+
+Pulls the lead-section extract for a song's Wikipedia article — the
+plain-text intro shown in the app's song detail — and writes it into
+songs.structure.
+
+This is the reusable, pipeline-wired version of the one-time backfill in
+scripts/onetime_scripts/one_time_song_wiki_intro.py. The backfill populated
+existing rows once; this module is called from core.song_research so every
+newly imported / refreshed song with a wikipedia_url gets its intro pulled
+in too (the import path previously set wikipedia_url but never the intro).
+
+It lives under integrations/wikipedia (not the MusicBrainz updaters in
+integrations/musicbrainz/song_updates.py) because it talks to the MediaWiki
+extracts API, not MusicBrainz — it consumes the wikipedia_url that the MB
+updater has already resolved onto the song.
+"""
+
+import logging
+from urllib.parse import unquote, urlparse
+
+import requests
+
+from core.http_client import make_session
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_SENTENCES = 4
+REQUEST_TIMEOUT = 15
+
+
+def parse_wikipedia_url(wikipedia_url: str):
+    """Return (api_url, page_title) for a Wikipedia article URL, or (None, None).
+
+    Keeps the host, so de.wikipedia.org URLs hit the German API. Scheme-relative
+    URLs (``//host/wiki/X``) default to https; non-http(s) schemes are rejected.
+    """
+    try:
+        parsed = urlparse(wikipedia_url)
+        scheme = parsed.scheme or 'https'
+        if scheme not in ('http', 'https') or not parsed.netloc:
+            return None, None
+        # urlparse already moved any '#fragment' out of .path; decode %XX escapes.
+        title = unquote(parsed.path.partition('/wiki/')[2])
+    except (AttributeError, TypeError, ValueError):
+        return None, None  # non-string input or a malformed netloc
+    if not title:
+        return None, None
+    return f"{scheme}://{parsed.netloc}/w/api.php", title
+
+
+def fetch_wikipedia_intro(page_title: str, api_url: str,
+                          sentences: int = DEFAULT_SENTENCES,
+                          session: requests.Session = None):
+    """Fetch the lead-section plain-text extract for a Wikipedia page.
+
+    Returns the stripped extract, or None when the response is non-200
+    (logged), the page is missing (logged), or the extract is empty.
+    Network errors and undecodable JSON bodies propagate to the caller.
+    A session created here (none passed in) is closed before returning.
+    """
+    params = {
+        'action': 'query', 'format': 'json', 'prop': 'extracts',
+        'titles': page_title, 'redirects': 1,
+        'exintro': 1, 'explaintext': 1, 'exsentences': sentences,
+    }
+    sess = session if session is not None else make_session()
+    try:
+        resp = sess.get(api_url, params=params, timeout=REQUEST_TIMEOUT)
+    finally:
+        if session is None:
+            sess.close()  # only close what we opened; the body is already read
+    if resp.status_code != 200:
+        logger.warning("Wikipedia returned status %s for %s", resp.status_code, page_title)
+        return None
+    pages = resp.json().get('query', {}).get('pages', {})
+    if not pages:
+        return None
+    page = next(iter(pages.values()))
+    if 'missing' in page:
+        logger.warning("Wikipedia page missing: %s", page_title)
+        return None
+    extract = (page.get('extract') or '').strip()
+    return extract or None
+
+
+def update_song_wikipedia_intro(song_id: str,
+                                sentences: int = DEFAULT_SENTENCES,
+                                force_refresh: bool = False,
+                                dry_run: bool = False) -> bool:
+    """Populate songs.structure with the song's Wikipedia intro.
+
+    Reads the wikipedia_url already on the song (set earlier in the research
+    pipeline by update_song_wikipedia_url), fetches the lead-section extract,
+    and stores it in songs.structure. Ordinary failures never raise: they
+    are logged and reported as False.
+
+    Skips a song that already has structure text UNLESS force_refresh is
+    set — a deep refresh re-pulls the intro so Wikipedia edits flow through.
+
+    Args:
+        song_id: UUID of the song
+        sentences: Number of intro sentences to request from MediaWiki
+        force_refresh: Overwrite existing structure text if True
+        dry_run: Log what would happen without writing to the DB
+
+    Returns:
+        bool: True if structure was updated (or would be, in dry-run).
+    """
+    from db_utils import get_db_connection
+
+    try:
+        with get_db_connection() as conn:
+            with conn.cursor() as cur:
+                cur.execute(
+                    "SELECT wikipedia_url, structure, title FROM songs WHERE id = %s",
+                    (song_id,),
+                )
+                row = cur.fetchone()
+
+        if not row:
+            return False
+
+        wikipedia_url = row['wikipedia_url']
+        current_structure = row['structure']
+        song_title = row['title']
+
+        if not wikipedia_url:
+            logger.debug("Song has no Wikipedia URL, skipping intro update")
+            return False
+
+        # Don't clobber an existing intro unless explicitly refreshing.
+        if (current_structure or '').strip() and not force_refresh:
+            logger.debug("Song '%s' already has intro text, skipping", song_title)
+            return False
+
+        api_url, page_title = parse_wikipedia_url(wikipedia_url)
+        if not api_url:
+            logger.warning("Could not parse Wikipedia URL for '%s': %s",
+                           song_title, wikipedia_url)
+            return False
+
+        intro = fetch_wikipedia_intro(page_title, api_url, sentences=sentences)
+        if not intro:
+            logger.debug("No Wikipedia intro returned for '%s'", song_title)
+            return False
+
+        if dry_run:
+            logger.info("[DRY RUN] Would update intro for '%s' (%d chars)",
+                        song_title, len(intro))
+            return True
+
+        with get_db_connection() as conn:
+            with conn.cursor() as cur:
+                cur.execute(
+                    "UPDATE songs SET structure = %s, updated_at = CURRENT_TIMESTAMP "
+                    "WHERE id = %s",
+                    (intro, song_id),
+                )
+                conn.commit()
+
+        logger.info("✓ Updated Wikipedia intro for '%s' (%d chars)",
+                    song_title, len(intro))
+        return True
+
+    except requests.RequestException as e:
+        logger.error("Wikipedia request error updating intro for song %s: %s", song_id, e)
+        return False
+    except Exception:
+        logger.exception("Error updating Wikipedia intro for song %s", song_id)
+        return False
diff --git a/backend/routes/admin.py b/backend/routes/admin.py
@@ -4434,6 +4434,136 @@ def songs_browse_detail(song_id):
     )
 
 
+def _is_uuid(value):
+    """Report whether ``str(value)`` is accepted by ``uuid.UUID`` (MB work IDs are UUIDs)."""
+    from uuid import UUID
+    try:
+        UUID(str(value))
+    except (ValueError, AttributeError, TypeError):
+        return False
+    return True
+
+
+@admin_bp.route('/musicbrainz/work/<work_id>/lookup', methods=['GET'])
+def musicbrainz_work_lookup(work_id):
+    """Resolve a MusicBrainz work ID to its title and creator credits.
+
+    Used by the inline MB Work ID editor on the song detail page so an admin
+    can eyeball the canonical title and composer/writer/lyricist credits
+    before saving. Responds 400 for a non-UUID ID, 502 when the
+    MusicBrainzSearcher call raises, and 404 when it returns nothing.
+    """
+    work_id = (work_id or '').strip()
+    if not _is_uuid(work_id):
+        return jsonify({'error': 'Not a valid MusicBrainz work ID (expected a UUID).'}), 400
+
+    try:
+        work_data = MusicBrainzSearcher().get_work_recordings(work_id)
+    except Exception as e:
+        logger.error("MB work lookup failed for %s: %s", work_id, e)
+        return jsonify({'error': 'MusicBrainz lookup failed. Try again.'}), 502
+
+    if not work_data:
+        return jsonify({'error': 'No MusicBrainz work found for that ID.'}), 404
+
+    # Map creator name -> relation type. Dicts keep insertion order and
+    # setdefault never overwrites, so the first credit seen for a name wins.
+    credited = {}
+    for relation in work_data.get('relations', []):
+        rel_type = relation.get('type')
+        if rel_type not in ('composer', 'writer', 'lyricist'):
+            continue
+        name = (relation.get('artist') or {}).get('name')
+        if name:
+            credited.setdefault(name, rel_type)
+    creators = [{'name': name, 'type': kind} for name, kind in credited.items()]
+
+    payload = {
+        'id': work_data.get('id') or work_id,
+        'title': work_data.get('title'),
+        'composers': creators,
+    }
+    return jsonify(payload)
+
+
+@admin_bp.route('/songs/<song_id>/mb-id', methods=['POST'])
+def songs_update_mb_id(song_id):
+    """Set or clear a song's primary or secondary MusicBrainz work ID.
+
+    Body (JSON object): slot is 'primary' | 'second' (required); mb_id is a
+    UUID string, or '' / null to clear the slot. Malformed input gets a 400.
+    """
+    body = request.get_json(silent=True)
+    body = body if isinstance(body, dict) else {}  # e.g. a JSON array, or no JSON
+    slot = body.get('slot')
+    slot = slot.strip() if isinstance(slot, str) else ''
+    column = {'primary': 'musicbrainz_id', 'second': 'second_mb_id'}.get(slot)  # whitelist
+    if not column:
+        return jsonify({'error': "slot must be 'primary' or 'second'"}), 400
+    raw = body.get('mb_id')
+    mb_id = (raw.strip() if isinstance(raw, str) else raw) or None
+    if mb_id is not None and not (isinstance(mb_id, str) and _is_uuid(mb_id)):
+        return jsonify({'error': 'Not a valid MusicBrainz work ID (expected a UUID).'}), 400
+
+    with get_db_connection() as conn:
+        with conn.cursor() as cur:
+            cur.execute(
+                f"UPDATE songs SET {column} = %s, updated_at = CURRENT_TIMESTAMP "
+                "WHERE id = %s RETURNING id",
+                (mb_id, song_id),
+            )
+            if cur.fetchone() is None:
+                conn.rollback()
+                return jsonify({'error': 'Song not found'}), 404
+        conn.commit()
+
+    logger.info("admin set %s=%s on song %s", column, mb_id, song_id)
+    return jsonify({'success': True, 'slot': slot, 'mb_id': mb_id})
+
+
+@admin_bp.route('/songs/<song_id>/alt-titles', methods=['POST'])
+def songs_update_alt_titles(song_id):
+    """Replace a song's alternate-title list (songs.alt_titles TEXT[]).
+
+    Body (JSON): { alt_titles: ["...", ...] }. Entries are trimmed; blanks and
+    non-strings are dropped; duplicates are removed (case-insensitive, first
+    spelling wins). An empty list clears the column to NULL. A body that is
+    not a JSON object, or whose alt_titles is not a list, gets a 400.
+    """
+    body = request.get_json(silent=True)
+    # A non-object payload (e.g. a bare JSON array) has no .get(); treat as missing.
+    raw = body.get('alt_titles') if isinstance(body, dict) else None
+    if not isinstance(raw, list):
+        return jsonify({'error': 'alt_titles must be a list of strings'}), 400
+
+    cleaned = []
+    seen = set()
+    for item in raw:
+        title = item.strip() if isinstance(item, str) else ''
+        key = title.lower()
+        if not title or key in seen:
+            continue
+        seen.add(key)
+        cleaned.append(title)
+
+    stored = cleaned or None  # empty list -> NULL
+
+    with get_db_connection() as conn:
+        with conn.cursor() as cur:
+            cur.execute(
+                "UPDATE songs SET alt_titles = %s, updated_at = CURRENT_TIMESTAMP "
+                "WHERE id = %s RETURNING id",
+                (stored, song_id),
+            )
+            if cur.fetchone() is None:
+                conn.rollback()
+                return jsonify({'error': 'Song not found'}), 404
+        conn.commit()
+
+    logger.info("admin set alt_titles=%s on song %s", cleaned, song_id)
+    return jsonify({'success': True, 'alt_titles': cleaned})
+
+
 @admin_bp.route('/releases/<release_id>')
 def releases_browse_detail(release_id):
     """Release detail.