Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions backend/core/http_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Shared HTTP client configuration.

A single home for the outbound User-Agent and a `requests.Session` factory,
so every crawler/integration identifies us the same way and a version bump
is a one-line change rather than a sweep across ~20 files.

Most external services (MusicBrainz, Wikipedia/MediaWiki, Cover Art Archive,
Wikimedia Commons, etc.) expect — and in some cases require — a descriptive
User-Agent. Use `make_session()` to get a session that already carries it.

Note: this only handles identification/headers, not per-service rate
limiting. Clients that must throttle (e.g. MusicBrainz) keep their own
rate-limit logic on top of the session.
"""

import requests

# Outbound identity: the User-Agent string that make_session() installs on
# every session it creates. Bump the version here.
HTTP_USER_AGENT = "ApproachNote/1.0 (+support@approachnote.com)"


def make_session(accept_json: bool = True) -> requests.Session:
    """Build a ``requests.Session`` that identifies us via ``HTTP_USER_AGENT``.

    Args:
        accept_json: When True (the default), the session also sends
            ``Accept: application/json``. Pass False for HTML/binary
            fetches so that header is not set.

    Returns:
        A newly created session; each call builds a fresh one.
    """
    session = requests.Session()
    # Assign onto the session's own header mapping instead of building a
    # temporary dict and merging it in.
    session.headers['User-Agent'] = HTTP_USER_AGENT
    if accept_json:
        session.headers['Accept'] = 'application/json'
    return session
9 changes: 9 additions & 0 deletions backend/core/song_research.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from integrations.musicbrainz.release_importer import MBReleaseImporter
from db_utils import get_db_connection, execute_query
from integrations.musicbrainz.utils import MusicBrainzSearcher, update_song_composer, update_song_wikipedia_url, update_song_composed_year
from integrations.wikipedia.song_intro import update_song_wikipedia_intro
from core import research_queue, research_jobs
from core import performer_reference_verification
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -217,6 +218,14 @@ def progress_callback(phase: str, current: int, total: int):
if not composed_year_updated:
logger.debug("Composed year not updated (already set or not found)")

# Step 1.8: Pull the Wikipedia intro into songs.structure. Runs after
# Step 1.6 so it can consume the wikipedia_url just resolved off the
# MB work. Idempotent unless force_refresh — see update_song_wikipedia_intro.
logger.info("Checking for Wikipedia intro update...")
intro_updated = update_song_wikipedia_intro(str(song_id), force_refresh=force_refresh)
if not intro_updated:
logger.debug("Wikipedia intro not updated (already set, no URL, or not found)")

# Spotify, Apple Music, and YouTube matching all run on the
# durable research queue (research_worker/handlers/*). Their
# per-job stats live on the research_jobs row's `result` field —
Expand Down
171 changes: 171 additions & 0 deletions backend/integrations/wikipedia/song_intro.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
"""Wikipedia song-intro fetcher + updater.

Pulls the lead-section extract for a song's Wikipedia article — the
plain-text intro shown in the app's song detail — and writes it into
songs.structure.

This is the reusable, pipeline-wired version of the one-time backfill in
scripts/onetime_scripts/one_time_song_wiki_intro.py. The backfill populated
existing rows once; this module is called from core.song_research so every
newly imported / refreshed song with a wikipedia_url gets its intro pulled
in too (the import path previously set wikipedia_url but never the intro).

It lives under integrations/wikipedia (not the MusicBrainz updaters in
integrations/musicbrainz/song_updates.py) because it talks to the MediaWiki
extracts API, not MusicBrainz — it consumes the wikipedia_url that the MB
updater has already resolved onto the song.
"""

import logging
from urllib.parse import unquote, urlparse

import requests

from core.http_client import make_session

logger = logging.getLogger(__name__)

# Default number of lead-section sentences requested from MediaWiki; passed
# as the `exsentences` query parameter by fetch_wikipedia_intro().
DEFAULT_SENTENCES = 4
# Timeout handed to every MediaWiki HTTP request (the `timeout=` argument of
# the session's get()).
REQUEST_TIMEOUT = 15


def parse_wikipedia_url(wikipedia_url: str):
    """Return (api_url, page_title) for a Wikipedia article URL, or (None, None).

    Honors the language subdomain so de.wikipedia.org URLs hit the right API.

    Args:
        wikipedia_url: Article URL of the form
            ``<scheme>://<host>/wiki/<Title>``. Protocol-relative URLs
            (``//host/wiki/Title``) are accepted and treated as https.

    Returns:
        A ``(api_url, page_title)`` tuple where ``api_url`` is
        ``<scheme>://<host>/w/api.php`` and ``page_title`` is the
        percent-decoded title, or ``(None, None)`` when the input is not a
        string, cannot be parsed, has no host, has no ``/wiki/`` path
        segment, uses a scheme other than http/https, or has an empty title.
    """
    if not isinstance(wikipedia_url, str):
        return None, None

    try:
        parsed = urlparse(wikipedia_url.strip())
    except ValueError:
        # urlparse raises ValueError for malformed authorities
        # (e.g. an unterminated IPv6 literal).
        return None, None

    if not parsed.netloc or '/wiki/' not in parsed.path:
        return None, None

    # A protocol-relative URL has an empty scheme; without a default the
    # API URL would come out as "://host/w/api.php", which cannot be fetched.
    scheme = parsed.scheme or 'https'
    if scheme not in ('http', 'https'):
        return None, None

    # urlparse has already moved any "#fragment" out of .path, so the title
    # is simply everything after the first "/wiki/".
    title = unquote(parsed.path.split('/wiki/', 1)[1])
    if not title:
        return None, None

    return f"{scheme}://{parsed.netloc}/w/api.php", title


def fetch_wikipedia_intro(page_title: str, api_url: str,
                          sentences: int = DEFAULT_SENTENCES,
                          session: requests.Session = None):
    """Fetch the lead-section plain-text extract for a Wikipedia page.

    Args:
        page_title: Title of the article, sent as the ``titles`` parameter.
        api_url: MediaWiki ``api.php`` endpoint to query.
        sentences: Number of intro sentences to request (``exsentences``).
        session: Optional session to reuse. When omitted, a session is
            created with ``make_session()`` and closed before returning; a
            caller-supplied session is never closed here.

    Returns:
        The stripped extract string, or None when the response status is not
        200, the body is not the expected JSON shape, the page is missing,
        or the extract is empty.

    Raises:
        requests.RequestException: transport-level failures (connection
            errors, timeouts) raised by the session propagate to the caller.
    """
    owns_session = session is None
    sess = make_session() if owns_session else session
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        'titles': page_title,
        'redirects': 1,
        'exintro': 1,
        'explaintext': 1,
        'exsentences': sentences,
    }
    try:
        resp = sess.get(api_url, params=params, timeout=REQUEST_TIMEOUT)
        if resp.status_code != 200:
            logger.warning("Wikipedia returned status %s for %s", resp.status_code, page_title)
            return None
        try:
            payload = resp.json()
        except ValueError:
            # A 200 with a non-JSON body (e.g. an HTML error page) is treated
            # like any other "no intro available" outcome.
            logger.warning("Wikipedia returned a non-JSON body for %s", page_title)
            return None
    finally:
        # Only release what we created; a shared session belongs to the caller.
        if owns_session:
            sess.close()

    query = payload.get('query') if isinstance(payload, dict) else None
    pages = query.get('pages') if isinstance(query, dict) else None
    if not pages or not isinstance(pages, dict):
        return None

    # A single title was requested, so only the first page entry is examined.
    page = next(iter(pages.values()))
    if not isinstance(page, dict):
        return None
    if 'missing' in page:
        logger.warning("Wikipedia page missing: %s", page_title)
        return None

    extract = (page.get('extract') or '').strip()
    return extract or None


def update_song_wikipedia_intro(song_id: str,
                                sentences: int = DEFAULT_SENTENCES,
                                force_refresh: bool = False,
                                dry_run: bool = False) -> bool:
    """Populate songs.structure with the song's Wikipedia intro.

    Reads the wikipedia_url stored on the song, fetches the lead-section
    extract for it, and stores that text in songs.structure. The database
    connection used for the read is released before the HTTP request, and a
    second connection is opened for the write.

    A song that already has non-blank structure text is skipped unless
    force_refresh is set, in which case the intro is fetched again and
    overwrites the stored text.

    Args:
        song_id: UUID of the song
        sentences: Number of intro sentences to request from MediaWiki
        force_refresh: Overwrite existing structure text if True
        dry_run: Log what would happen without writing to the DB

    Returns:
        bool: True if structure was updated (or would be, in dry-run).
        False when the song does not exist, has no Wikipedia URL, already
        has intro text (without force_refresh), the URL cannot be parsed, no
        intro came back, the UPDATE matched no row, or any error occurred
        (errors are logged, never raised).
    """
    # Function-scope import kept from the original — presumably to avoid an
    # import cycle with db_utils; TODO confirm.
    from db_utils import get_db_connection

    try:
        with get_db_connection() as conn:
            with conn.cursor() as cur:
                cur.execute(
                    "SELECT wikipedia_url, structure, title FROM songs WHERE id = %s",
                    (song_id,),
                )
                row = cur.fetchone()

        if not row:
            return False

        wikipedia_url = row['wikipedia_url']
        current_structure = row['structure']
        song_title = row['title']

        if not wikipedia_url:
            logger.debug("Song has no Wikipedia URL, skipping intro update")
            return False

        # Don't clobber an existing intro unless explicitly refreshing.
        if (current_structure or '').strip() and not force_refresh:
            logger.debug("Song '%s' already has intro text, skipping", song_title)
            return False

        api_url, page_title = parse_wikipedia_url(wikipedia_url)
        if not api_url:
            logger.warning("Could not parse Wikipedia URL for '%s': %s",
                           song_title, wikipedia_url)
            return False

        intro = fetch_wikipedia_intro(page_title, api_url, sentences=sentences)
        if not intro:
            logger.debug("No Wikipedia intro returned for '%s'", song_title)
            return False

        if dry_run:
            logger.info("[DRY RUN] Would update intro for '%s' (%d chars)",
                        song_title, len(intro))
            return True

        with get_db_connection() as conn:
            with conn.cursor() as cur:
                cur.execute(
                    "UPDATE songs SET structure = %s, updated_at = CURRENT_TIMESTAMP "
                    "WHERE id = %s",
                    (intro, song_id),
                )
                affected = cur.rowcount
            # The row was read on an earlier connection, so it may be gone by
            # now. Only an explicit 0 counts as "nothing updated"; a driver
            # reporting an unknown count (-1) is treated as success.
            if affected == 0:
                conn.rollback()
                logger.warning("Song %s disappeared before its Wikipedia intro "
                               "could be stored", song_id)
                return False
            conn.commit()

        logger.info("✓ Updated Wikipedia intro for '%s' (%d chars)",
                    song_title, len(intro))
        return True

    except requests.RequestException as e:
        logger.error("Wikipedia request error updating intro for song %s: %s", song_id, e)
        return False
    except Exception as e:
        # Best-effort step: never let it break the caller, but keep the
        # traceback so unexpected failures can be diagnosed.
        logger.error("Error updating Wikipedia intro for song %s: %s", song_id, e,
                     exc_info=True)
        return False
130 changes: 130 additions & 0 deletions backend/routes/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -4434,6 +4434,136 @@ def songs_browse_detail(song_id):
)


def _is_uuid(value):
"""True if `value` parses as a UUID (MB work IDs are UUIDs)."""
import uuid as _uuid
try:
_uuid.UUID(str(value))
return True
except (ValueError, AttributeError, TypeError):
return False


@admin_bp.route('/musicbrainz/work/<work_id>/lookup', methods=['GET'])
def musicbrainz_work_lookup(work_id):
    """Resolve a MusicBrainz work ID to its title and creator credits.

    Serves the inline MB Work ID editor on the song detail page: the admin
    enters an ID, the work is fetched through
    ``MusicBrainzSearcher().get_work_recordings`` and the title plus
    composer/writer/lyricist credits are returned so the change can be
    checked before it is saved.

    Responses:
        200: ``{'id', 'title', 'composers': [{'name', 'type'}, ...]}``
        400: ``work_id`` failed the ``_is_uuid`` check
        404: the lookup returned nothing
        502: the lookup raised
    """
    work_id = (work_id or '').strip()
    if not _is_uuid(work_id):
        return jsonify({'error': 'Not a valid MusicBrainz work ID (expected a UUID).'}), 400

    try:
        work = MusicBrainzSearcher().get_work_recordings(work_id)
    except Exception as exc:
        logger.error("MB work lookup failed for %s: %s", work_id, exc)
        return jsonify({'error': 'MusicBrainz lookup failed. Try again.'}), 502

    if not work:
        return jsonify({'error': 'No MusicBrainz work found for that ID.'}), 404

    # Collect creator credits from the work's relations. An insertion-ordered
    # dict keyed by artist name keeps the first credit seen for each name and
    # preserves the order in which names first appear.
    credit_types = ('composer', 'writer', 'lyricist')
    credits_by_name = {}
    for rel in work.get('relations', []):
        kind = rel.get('type')
        if kind not in credit_types:
            continue
        artist_name = (rel.get('artist') or {}).get('name')
        if not artist_name or artist_name in credits_by_name:
            continue
        credits_by_name[artist_name] = {'name': artist_name, 'type': kind}

    payload = {
        'id': work.get('id') or work_id,
        'title': work.get('title'),
        'composers': list(credits_by_name.values()),
    }
    return jsonify(payload)


@admin_bp.route('/songs/<song_id>/mb-id', methods=['POST'])
def songs_update_mb_id(song_id):
    """Set or clear a song's primary or secondary MusicBrainz work ID.

    Body (JSON):
        slot: 'primary' | 'second' (required)
        mb_id: UUID string, or '' / null to clear the slot.

    Responses:
        200: ``{'success': True, 'slot', 'mb_id'}``
        400: body is not a JSON object, ``slot`` is not one of the two
             allowed values, or ``mb_id`` is not a string that passes
             ``_is_uuid``
        404: no song has this ``song_id``
    """
    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        # A JSON array/scalar (or no JSON at all) has no 'slot'; treating it
        # as empty yields the 400 below instead of an AttributeError.
        body = {}

    slot = body.get('slot')
    slot = slot.strip() if isinstance(slot, str) else ''
    # The column name is interpolated into the SQL below, so it must only
    # ever come from this fixed mapping, never from the request.
    column = {'primary': 'musicbrainz_id', 'second': 'second_mb_id'}.get(slot)
    if not column:
        return jsonify({'error': "slot must be 'primary' or 'second'"}), 400

    raw = body.get('mb_id')
    if not raw:
        # '' / null (and any other falsy value) clears the slot.
        mb_id = None
    elif not isinstance(raw, str):
        return jsonify({'error': 'Not a valid MusicBrainz work ID (expected a UUID).'}), 400
    else:
        mb_id = raw.strip() or None
    if mb_id is not None and not _is_uuid(mb_id):
        return jsonify({'error': 'Not a valid MusicBrainz work ID (expected a UUID).'}), 400

    with get_db_connection() as conn:
        with conn.cursor() as cur:
            cur.execute(
                f"UPDATE songs SET {column} = %s, updated_at = CURRENT_TIMESTAMP "
                "WHERE id = %s RETURNING id",
                (mb_id, song_id),
            )
            # RETURNING yields no row when the id matched nothing.
            if cur.fetchone() is None:
                conn.rollback()
                return jsonify({'error': 'Song not found'}), 404
        conn.commit()

    logger.info("admin set %s=%s on song %s", column, mb_id, song_id)
    return jsonify({'success': True, 'slot': slot, 'mb_id': mb_id})


@admin_bp.route('/songs/<song_id>/alt-titles', methods=['POST'])
def songs_update_alt_titles(song_id):
    """Replace a song's alternate-title list (songs.alt_titles TEXT[]).

    Body (JSON): { alt_titles: ["...", ...] }. Entries are trimmed, blanks
    and non-string entries dropped, duplicates removed (case-insensitive,
    first spelling wins). An empty list clears the column to NULL.

    Responses:
        200: ``{'success': True, 'alt_titles': [...]}`` with the cleaned list
        400: body is not a JSON object or ``alt_titles`` is not a list
        404: no song has this ``song_id``
    """
    body = request.get_json(silent=True)
    # A JSON array/scalar has no .get(); handle it like a missing key so the
    # caller gets the 400 below rather than a server error.
    raw = body.get('alt_titles') if isinstance(body, dict) else None
    if not isinstance(raw, list):
        return jsonify({'error': 'alt_titles must be a list of strings'}), 400

    cleaned = []
    seen = set()
    for item in raw:
        if not isinstance(item, str):
            continue
        title = item.strip()
        if not title:
            continue
        key = title.lower()
        if key in seen:
            continue
        seen.add(key)
        cleaned.append(title)

    stored = cleaned or None  # empty list -> NULL

    with get_db_connection() as conn:
        with conn.cursor() as cur:
            cur.execute(
                "UPDATE songs SET alt_titles = %s, updated_at = CURRENT_TIMESTAMP "
                "WHERE id = %s RETURNING id",
                (stored, song_id),
            )
            # RETURNING yields no row when the id matched nothing.
            if cur.fetchone() is None:
                conn.rollback()
                return jsonify({'error': 'Song not found'}), 404
        conn.commit()

    logger.info("admin set alt_titles=%s on song %s", cleaned, song_id)
    return jsonify({'success': True, 'alt_titles': cleaned})


@admin_bp.route('/releases/<release_id>')
def releases_browse_detail(release_id):
"""Release detail.
Expand Down
Loading
Loading