diff --git a/.gitignore b/.gitignore index 4782d94..928ed09 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,18 @@ spotify_duration_rejections.csv spotify_album_context_audit.csv spotify_orphaned_tracks.csv +# Commons imagery audit artifacts (regenerate via scripts/audit_commons_imagery.py +# + scripts/build_commons_audit_viewer.py). Transient worklists, not source. +commons_imagery_audit_*.csv +commons_imagery_audit_*.html + +# Performer->Wikipedia ground-truth pipeline. The verification *queue* and the +# generated viewer HTML are regenerable scratch; the human-verified ground-truth +# export (performer_wikipedia_groundtruth*.json) is NOT ignored and should be +# committed — it's the authoritative manual dataset. +data/ground_truth/wikipedia_queue_*.json +data/ground_truth/*.html + # Claude Code local state (agent memory, worktrees, per-user settings) .claude/ diff --git a/backend/core/commons_imagery.py b/backend/core/commons_imagery.py index 124188a..e7b19e1 100644 --- a/backend/core/commons_imagery.py +++ b/backend/core/commons_imagery.py @@ -204,71 +204,86 @@ def _to_int(v) -> Optional[int]: # Category resolution # --------------------------------------------------------------------------- -def resolve_commons_category(session: requests.Session, artist_name: str, - wikipedia_url: Optional[str] = None) -> Optional[str]: - """Find the performer's Wikimedia Commons category via Wikidata P373, - falling back to a verified "Category:" guess.""" - qid = None - if wikipedia_url: - m = re.search(r"/wiki/(.+)$", wikipedia_url) - if m: - title = requests.utils.unquote(m.group(1)) - try: - r = session.get(WIKIPEDIA_API, params={ - "action": "query", "format": "json", "titles": title, - "prop": "pageprops", "ppprop": "wikibase_item", - }, timeout=15) - pages = r.json().get("query", {}).get("pages", {}) - page = next(iter(pages.values()), {}) - qid = page.get("pageprops", {}).get("wikibase_item") - except Exception as e: - logger.debug("Wikipedia->QID lookup failed: 
%s", e) +def _wikidata_claims(session: requests.Session, + qids: List[str]) -> Dict[str, dict]: + """Fetch claims for one or more Wikidata QIDs (single batched request).""" + if not qids: + return {} + try: + r = session.get(WIKIDATA_API, params={ + "action": "wbgetentities", "ids": "|".join(qids), + "format": "json", "props": "claims", + }, timeout=15) + return r.json().get("entities", {}) + except Exception as e: + logger.debug("Wikidata claims lookup failed: %s", e) + return {} - if not qid: - try: - r = session.get(WIKIDATA_API, params={ - "action": "wbsearchentities", "search": artist_name, - "language": "en", "format": "json", "type": "item", "limit": 5, - }, timeout=15) - hits = r.json().get("search", []) - qid = hits[0]["id"] if hits else None - except Exception as e: - logger.debug("Wikidata search failed: %s", e) - if qid: - try: - r = session.get(WIKIDATA_API, params={ - "action": "wbgetentities", "ids": qid, "format": "json", - "props": "claims", - }, timeout=15) - claims = r.json().get("entities", {}).get(qid, {}).get("claims", {}) - p373 = claims.get("P373") - if p373: - cat = p373[0]["mainsnak"]["datavalue"]["value"] - logger.info("Resolved Commons category via Wikidata %s: %s", qid, cat) - return f"Category:{cat}" - except Exception as e: - logger.debug("Wikidata P373 lookup failed: %s", e) +def _commons_category_from_claims(claims: Dict[str, Any]) -> Optional[str]: + """Return the entity's Commons category (P373) as a Category:<…> title.""" + p373 = claims.get("P373") + if not p373: + return None + try: + return f"Category:{p373[0]['mainsnak']['datavalue']['value']}" + except (KeyError, TypeError): + return None - guess = f"Category:{artist_name}" - if _category_exists(session, guess): - logger.info("Using guessed Commons category: %s", guess) - return guess - logger.warning("Could not resolve a Commons category for %r", artist_name) - return None +def resolve_commons_category(session: requests.Session, artist_name: str, + wikipedia_url: 
Optional[str] = None) -> Optional[str]: + """Resolve the performer's Commons category via their Wikipedia article. + + We deliberately resolve imagery ONLY for performers that have a Wikipedia + URL. The app already does significant work to attach the *correct* Wikipedia + article to a performer, so it is a trusted identity anchor: its canonical + Wikidata item gives us the right Commons category (P373). + + There is NO name-based fallback. Searching Wikidata by a bare name is + unreliable for common names — "Andrew Williams" matches an archaeologist + whose Commons category is full of catalogued coin photos, not a jazz + musician — and a same-name match (even a verified human) silently feeds + unrelated images into the ranking pipeline. A performer without a Wikipedia + link simply gets no Commons imagery; no imagery beats the wrong imagery. + """ + if not wikipedia_url: + logger.info("No Wikipedia URL for %r; skipping Commons imagery", artist_name) + return None + m = re.search(r"/wiki/(.+)$", wikipedia_url) + if not m: + logger.warning("Unparseable Wikipedia URL %r for %r; skipping imagery", + wikipedia_url, artist_name) + return None -def _category_exists(session: requests.Session, category: str) -> bool: + title = requests.utils.unquote(m.group(1)) try: - r = session.get(COMMONS_API, params={ - "action": "query", "format": "json", "titles": category, - "prop": "info", + r = session.get(WIKIPEDIA_API, params={ + "action": "query", "format": "json", "titles": title, + "prop": "pageprops", "ppprop": "wikibase_item", }, timeout=15) pages = r.json().get("query", {}).get("pages", {}) - return all(int(pid) > 0 for pid in pages) - except Exception: - return False + page = next(iter(pages.values()), {}) + qid = page.get("pageprops", {}).get("wikibase_item") + except Exception as e: + logger.debug("Wikipedia->QID lookup failed: %s", e) + return None + + if not qid: + logger.info("Wikipedia article for %r has no Wikidata item; " + "skipping imagery", artist_name) + 
return None + + claims = _wikidata_claims(session, [qid]).get(qid, {}).get("claims", {}) + cat = _commons_category_from_claims(claims) + if cat: + logger.info("Resolved Commons category via Wikidata %s: %s", qid, cat) + return cat + + logger.info("Wikidata %s for %r has no Commons category (P373); " + "skipping imagery", qid, artist_name) + return None # --------------------------------------------------------------------------- @@ -573,7 +588,13 @@ def analyze_and_rank( # processed one at a time and NOT retained — holding every candidate's # full-res bytes at once is what OOM'd the worker. The few images that get # reranked are re-downloaded in phase 3. - for r in records: + # + # This loop is the slowest silent stretch (a download + decode + analysis + # per candidate, up to max_candidates), so emit periodic progress to make + # it obvious the worker is alive rather than hung. + total = len(records) + logger.info("Phase 1: downloading + gating %d candidate(s)", total) + for i, r in enumerate(records, 1): img_bytes = download(session, r.url) or download(session, r.thumbnail_url) if not img_bytes: r.analysis = {"passed_gate": False, "reasons": ["download failed"], @@ -592,6 +613,8 @@ def analyze_and_rank( vision=None).to_dict() r.quality_score = score del img_bytes # release immediately + if i % 20 == 0 or i == total: + logger.info("Phase 1: analyzed %d/%d candidate(s)", i, total) # Phase 2: gate filter if config.do_gate: diff --git a/backend/core/performer_commons_imagery.py b/backend/core/performer_commons_imagery.py index 1c83f07..26c52e4 100644 --- a/backend/core/performer_commons_imagery.py +++ b/backend/core/performer_commons_imagery.py @@ -38,13 +38,24 @@ def find_candidate_performer_ids(stale_days: int = DEFAULT_STALE_DAYS, limit: Optional[int] = None) -> list[str]: """UUIDs of performers due for an imagery (re)check: never checked, or - last checked more than `stale_days` ago. Newest performers first.""" + last checked more than `stale_days` ago. 
Newest performers first. + + Restricted to performers that have a Wikipedia URL (column or + external_links.wikipedia). The Commons resolver only trusts a performer's + validated Wikipedia article as an identity anchor — name-based matching is + unreliable for common names — so performers without one would be a guaranteed + no-op. Skipping them here avoids spending worker cycles and vision quota on + jobs that can never add imagery.""" limit_clause = "LIMIT %s" if limit is not None else "" sql = f""" SELECT id FROM performers - WHERE last_imagery_check IS NULL - OR last_imagery_check < now() - make_interval(days => %s) + WHERE (last_imagery_check IS NULL + OR last_imagery_check < now() - make_interval(days => %s)) + AND ( + btrim(COALESCE(wikipedia_url, '')) <> '' + OR btrim(COALESCE(external_links->>'wikipedia', '')) <> '' + ) ORDER BY created_at DESC {limit_clause} """ diff --git a/backend/scripts/audit_commons_imagery.py b/backend/scripts/audit_commons_imagery.py new file mode 100644 index 0000000..6813a66 --- /dev/null +++ b/backend/scripts/audit_commons_imagery.py @@ -0,0 +1,232 @@ +#!/usr/bin/env python3 +""" +Audit performer Commons imagery for wrong-category contamination. + +Background: earlier runs of the Commons imagery enricher fell back to a blind +``Category:<name>`` guess when Wikidata had no Commons-category claim. For +common names that picked up an *unrelated* same-named person's category (e.g. +an archaeologist's catalogued coin finds rather than photos of a musician). +The resolver no longer does this — it now resolves only through the performer's +own Wikipedia article, with no name-based fallback of any kind — but images +already linked by the old path are still in the database. + +This script is READ-ONLY. For every performer that currently has at least one +``wikimedia_commons`` image it: + + 1. Re-runs the *current* resolver (core.commons_imagery.resolve_commons_category) + with the same Wikipedia/external-link inputs the worker uses. + 2. 
Classifies each existing Commons image: + NO_CATEGORY - resolver now returns nothing, so the image could + only have come from the removed guess path. + Whole performer is suspect. + NOT_IN_RESOLVED_CATEGORY - resolver returns a category, but this image's + Commons pageid is NOT a member of it (walked at + the worker's recurse depth of 0). Strong signal + the image came from a different/old category. + OK - image's pageid is in the resolved category. + +Flagged rows are written to a CSV worklist for manual review / cleanup. The +script never deletes anything. + +Usage: + python scripts/audit_commons_imagery.py # full sweep + python scripts/audit_commons_imagery.py --limit 200 + python scripts/audit_commons_imagery.py --name "Andrew Williams" + python scripts/audit_commons_imagery.py --id + python scripts/audit_commons_imagery.py --since 2026-06-09T17:00:00 + python scripts/audit_commons_imagery.py --all -o all_commons.csv +""" + +import csv +from datetime import datetime + +from script_base import ScriptBase, run_script +from db_utils import get_db_connection +from core import commons_imagery as ci + +# Mirror the worker's GatherConfig recurse depth (research_worker/handlers/commons.py +# builds GatherConfig() without overriding recurse_subcats, so it stays 0). 
+_RECURSE_SUBCATS = ci.GatherConfig().recurse_subcats +_ACCEPTED_LICENSES = list(ci.GatherConfig().licenses) + +_PERFORMERS_WITH_COMMONS_SQL = """ + SELECT + p.id, + p.name, + p.wikipedia_url, + p.external_links, + i.id AS image_id, + i.url AS image_url, + i.source_identifier, + i.source_page_url, + ai.is_primary, + ai.created_at AS linked_at + FROM artist_images ai + JOIN images i ON i.id = ai.image_id + JOIN performers p ON p.id = ai.performer_id + WHERE i.source = 'wikimedia_commons' + {where} + ORDER BY p.name, ai.display_order +""" + + +def _wikipedia_url(row) -> str | None: + """Same precedence the worker uses: explicit column, then external_links.""" + direct = (row.get("wikipedia_url") or "").strip() + if direct: + return direct + links = row.get("external_links") or {} + if isinstance(links, dict): + return (links.get("wikipedia") or "").strip() or None + return None + + +def _load_rows(name=None, performer_id=None, since=None, limit=None): + clauses, params = [], [] + if performer_id: + clauses.append("p.id = %s") + params.append(performer_id) + if name: + clauses.append("LOWER(p.name) = LOWER(%s)") + params.append(name) + if since: + clauses.append("ai.created_at >= %s") + params.append(since) + where = ("AND " + " AND ".join(clauses)) if clauses else "" + sql = _PERFORMERS_WITH_COMMONS_SQL.format(where=where) + if limit: + sql += "\n LIMIT %s" + params.append(limit) + with get_db_connection() as conn: + with conn.cursor() as cur: + cur.execute(sql, tuple(params)) + return cur.fetchall() + + +def _group_by_performer(rows): + """rows -> {performer_id: {"meta": row, "images": [rows]}} preserving order.""" + grouped = {} + for r in rows: + pid = str(r["id"]) + grouped.setdefault(pid, {"meta": r, "images": []})["images"].append(r) + return grouped + + +def _resolved_category_pageids(session, category): + """Return the set of Commons pageids in `category` (as strings).""" + records = ci.fetch_commons_category_files( + session, category, _ACCEPTED_LICENSES, 
include_nkcr=False, + recurse_subcats=_RECURSE_SUBCATS, + ) + return {str(r.source_identifier) for r in records} + + +def main() -> bool: + script = ScriptBase( + name="audit_commons_imagery", + description="Audit performer Commons imagery for wrong-category contamination", + epilog=__doc__, + ) + group = script.parser.add_mutually_exclusive_group(required=False) + group.add_argument("--name", help="Audit a single performer by name") + group.add_argument("--id", help="Audit a single performer by UUID") + script.parser.add_argument("--since", default=None, + help="Only images linked at/after this ISO timestamp " + "(e.g. 2026-06-09T17:00:00). Useful to focus on " + "a specific enrichment run.") + script.parser.add_argument("--limit", type=int, default=None, + help="Cap the number of image rows scanned") + script.parser.add_argument("--all", action="store_true", + help="Include OK rows in the CSV (default: flagged only)") + script.parser.add_argument("-o", "--output", default=None, + help="Output CSV path (default: " + "commons_imagery_audit_.csv)") + args = script.parse_args() + + script.print_header({ + "SINGLE": args.name or args.id or False, + "SINCE": args.since or False, + "LIMIT": args.limit or False, + "INCLUDE OK": args.all, + }) + + rows = _load_rows(name=args.name, performer_id=args.id, + since=args.since, limit=args.limit) + if not rows: + script.logger.info("No wikimedia_commons images matched the filters.") + return True + + grouped = _group_by_performer(rows) + script.logger.info("Scanning %d image(s) across %d performer(s)", + len(rows), len(grouped)) + + session = ci.make_session() + out_path = args.output or ( + f"commons_imagery_audit_{datetime.now():%Y%m%d_%H%M%S}.csv") + + counts = {"NO_CATEGORY": 0, "NOT_IN_RESOLVED_CATEGORY": 0, "OK": 0} + fieldnames = [ + "performer_id", "performer_name", "verdict", "resolved_category", + "image_id", "image_url", "source_identifier", "source_page_url", + "is_primary", "linked_at", + ] + written = 0 + + with 
open(out_path, "w", newline="") as fh: + writer = csv.DictWriter(fh, fieldnames=fieldnames) + writer.writeheader() + + for pid, bundle in grouped.items(): + meta = bundle["meta"] + name = meta["name"] + category = ci.resolve_commons_category( + session, name, _wikipedia_url(meta)) + + member_pageids = set() + if category: + try: + member_pageids = _resolved_category_pageids(session, category) + except Exception as e: # network/category hiccup -> don't crash the sweep + script.logger.warning( + "Could not list %s for %s (%s); treating members as unknown", + category, name, e) + + for img in bundle["images"]: + if category is None: + verdict = "NO_CATEGORY" + elif str(img["source_identifier"]) in member_pageids: + verdict = "OK" + else: + # Either the image isn't in the resolved category, or the + # category couldn't be enumerated above (member_pageids + # empty) — both warrant a manual look rather than a pass. + verdict = "NOT_IN_RESOLVED_CATEGORY" + counts[verdict] += 1 + + if verdict == "OK" and not args.all: + continue + writer.writerow({ + "performer_id": pid, + "performer_name": name, + "verdict": verdict, + "resolved_category": category or "", + "image_id": str(img["image_id"]), + "image_url": img["image_url"], + "source_identifier": img["source_identifier"], + "source_page_url": img["source_page_url"] or "", + "is_primary": img["is_primary"], + "linked_at": img["linked_at"].isoformat() if img["linked_at"] else "", + }) + written += 1 + + flagged = counts["NO_CATEGORY"] + counts["NOT_IN_RESOLVED_CATEGORY"] + script.logger.info("Done. 
flagged=%d (NO_CATEGORY=%d, " + "NOT_IN_RESOLVED_CATEGORY=%d), ok=%d", + flagged, counts["NO_CATEGORY"], + counts["NOT_IN_RESOLVED_CATEGORY"], counts["OK"]) + script.logger.info("Wrote %d row(s) to %s", written, out_path) + return True + + +if __name__ == "__main__": + run_script(main) diff --git a/backend/scripts/build_commons_audit_viewer.py b/backend/scripts/build_commons_audit_viewer.py new file mode 100644 index 0000000..2599ccb --- /dev/null +++ b/backend/scripts/build_commons_audit_viewer.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +""" +Build a self-contained HTML viewer for a commons_imagery_audit CSV. + +The audit CSV (output of audit_commons_imagery.py) lists performer Commons +images flagged for review. NO_CATEGORY means the resolver no longer finds a +Wikipedia-anchored category for that performer — which is a *mix* of genuinely +wrong imagery (a same-named person's photos) and genuinely correct imagery from +performers who simply lack a Wikipedia URL. You can't bulk-delete it; you have +to look. This viewer makes looking fast. + +Output is a single HTML file with the data embedded — no server, no network +except Commons thumbnails. Open it with `open .html`. It lets you: + + - browse images grouped by performer, as lazy-loaded thumbnails + - click an image (or use per-performer bulk actions) to mark it for deletion + - mark a performer "reviewed" and hide reviewed ones to track progress + - export the marked rows as a delete-list CSV, or copy a ready DELETE + statement to run in psql + +Marks and review state persist in the browser's localStorage (keyed by the CSV +filename), so you can close and resume. Nothing is written back to the DB. 
+ +Usage: + python scripts/build_commons_audit_viewer.py commons_imagery_audit_.csv + python scripts/build_commons_audit_viewer.py audit.csv -o viewer.html +""" + +import argparse +import csv +import json +from pathlib import Path + + +def _thumb_url(row: dict, width: int = 180) -> str: + """A small Commons thumbnail via Special:FilePath (avoids full-res fetch).""" + spu = row.get("source_page_url") or "" + marker = "/wiki/File:" + if marker in spu: + fname = spu.split(marker, 1)[1] # already percent-encoded in the CSV + return f"https://commons.wikimedia.org/wiki/Special:FilePath/{fname}?width={width}" + return row.get("image_url") or "" + + +def _load_records(csv_path: Path) -> list[dict]: + records = [] + with open(csv_path, newline="") as fh: + for row in csv.DictReader(fh): + records.append({ + "performer_id": row["performer_id"], + "performer_name": row["performer_name"], + "verdict": row["verdict"], + "resolved_category": row.get("resolved_category") or "", + "image_id": row["image_id"], + "image_url": row["image_url"], + "source_page_url": row.get("source_page_url") or "", + "is_primary": str(row.get("is_primary")).lower() == "true", + "thumb": _thumb_url(row), + }) + return records + + +_HTML_TEMPLATE = r""" + + + + +__TITLE__ + + + +
+

Commons imagery audit

+ 0 performers · 0 images · + 0 marked · reviewed 0/0 + + + + + + + +
+
+ + + +""" + + +def build_html(records: list[dict], title: str, storage_key: str) -> str: + data_json = json.dumps(records, ensure_ascii=False) + return (_HTML_TEMPLATE + .replace("__TITLE__", title) + .replace("__STORAGE_KEY__", storage_key) + .replace("/*DATA*/", data_json)) + + +def main() -> None: + p = argparse.ArgumentParser( + description="Build a self-contained HTML viewer for an audit CSV.", + formatter_class=argparse.RawDescriptionHelpFormatter, epilog=__doc__) + p.add_argument("csv", help="commons_imagery_audit_.csv path") + p.add_argument("-o", "--output", default=None, + help="Output HTML path (default: .html)") + args = p.parse_args() + + csv_path = Path(args.csv) + if not csv_path.exists(): + raise SystemExit(f"No such file: {csv_path}") + + records = _load_records(csv_path) + out_path = Path(args.output) if args.output else csv_path.with_suffix(".html") + html = build_html(records, title=csv_path.name, storage_key=csv_path.stem) + out_path.write_text(html, encoding="utf-8") + + performers = len({r["performer_id"] for r in records}) + print(f"Wrote {out_path} — {len(records)} image(s) across {performers} performer(s)") + print(f"Open it with: open {out_path}") + + +if __name__ == "__main__": + main() diff --git a/backend/scripts/build_wikipedia_groundtruth_queue.py b/backend/scripts/build_wikipedia_groundtruth_queue.py new file mode 100644 index 0000000..a9b807c --- /dev/null +++ b/backend/scripts/build_wikipedia_groundtruth_queue.py @@ -0,0 +1,331 @@ +#!/usr/bin/env python3 +""" +Build a verification *queue* for manually grounding performer -> Wikipedia links. + +Target: performers that HAVE Commons imagery but NO Wikipedia link on record. +These are the highest-value enrichment opportunities — we already have visual +evidence of who the person is (the Commons photos), we just haven't recorded the +authoritative Wikipedia article. 
A human can confirm the link quickly and that +confirmation becomes ground truth (a different, more trustworthy class of data +than anything a crawler guesses). + +For each such performer this script derives candidate Wikipedia links: + + 1. CATEGORY-DERIVED (the "implicit" link): walk the Commons categories the + performer's own image files sit in, map each category to its Wikidata item + (Commons pageprops.wikibase_item), and take that item's English Wikipedia + sitelink. This ties the candidate directly to the evidence we already hold. + 2. NAME-SEARCH FALLBACK: when no category yields a real biography article, + search Wikidata by the performer's name and surface every hit that has an + English Wikipedia article (humans are ranked first; non-humans are kept). + +Junk is filtered: "Wikimedia category" items (P31=Q4167836) and sitelinks that +are themselves Category: pages are dropped — they are topic categories, not +people (e.g. Category:Public speaking). + +Output is a queue JSON under data/ground_truth/, consumed by +build_wikipedia_groundtruth_viewer.py to produce the human-verification UI. +This script only READS the database and public Wikimedia APIs; it writes nothing +back. The human's decisions become the ground-truth file (exported from the +viewer) — this is just the worklist. 
+ +Usage: + python scripts/build_wikipedia_groundtruth_queue.py --limit 50 + python scripts/build_wikipedia_groundtruth_queue.py # full subset + python scripts/build_wikipedia_groundtruth_queue.py -o /tmp/queue.json +""" + +import argparse +import json +import logging +import sys +import time +from datetime import datetime, timezone +from pathlib import Path + +from dotenv import load_dotenv + +REPO_ROOT = Path(__file__).resolve().parents[2] +BACKEND_DIR = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(BACKEND_DIR)) # make core.* / db_utils importable (script_base does this too) +load_dotenv(BACKEND_DIR / ".env") + +from core import commons_imagery as ci # noqa: E402 (session + endpoint constants) +from db_utils import get_db_connection # noqa: E402 + +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") +logger = logging.getLogger("groundtruth_queue") + +WIKIMEDIA_CATEGORY_QID = "Q4167836" # "Wikimedia category" — a topic category, not a person +MAX_FILES_FOR_CATEGORIES = 3 # category-walk this many of a performer's files +MAX_EVIDENCE_IMAGES = 8 # thumbnails shown to the reviewer +NAME_SEARCH_LIMIT = 7 + +_SUBSET_SQL = """ + SELECT p.id, p.name, i.url, i.source_page_url, ai.is_primary, ai.display_order + FROM performers p + JOIN artist_images ai ON ai.performer_id = p.id + JOIN images i ON i.id = ai.image_id + WHERE i.source = 'wikimedia_commons' + AND btrim(COALESCE(p.wikipedia_url, '')) = '' + AND btrim(COALESCE(p.external_links->>'wikipedia', '')) = '' + ORDER BY p.name, ai.is_primary DESC, ai.display_order +""" + + +# --------------------------------------------------------------------------- # +# Wikimedia helpers (with in-run caches; many performers share categories/QIDs) +# --------------------------------------------------------------------------- # + +class WM: + def __init__(self, session, delay: float): + self.s = session + self.delay = delay + self._cat_qid: dict = {} + self._qid_info: dict = {} + 
self._name_search: dict = {} + + def _get(self, url, params): + if self.delay: + time.sleep(self.delay) + try: + return self.s.get(url, params=params, timeout=20).json() + except Exception as e: + logger.debug("API error %s: %s", url, e) + return {} + + def file_categories(self, file_title: str) -> list[str]: + j = self._get(ci.COMMONS_API, { + "action": "query", "format": "json", "titles": file_title, + "prop": "categories", "cllimit": "max", "clshow": "!hidden", + }) + pg = next(iter(j.get("query", {}).get("pages", {}).values()), {}) + return [c["title"] for c in pg.get("categories", [])] + + def category_qid(self, cat_title: str) -> str | None: + if cat_title in self._cat_qid: + return self._cat_qid[cat_title] + j = self._get(ci.COMMONS_API, { + "action": "query", "format": "json", "titles": cat_title, + "prop": "pageprops", "ppprop": "wikibase_item", + }) + pg = next(iter(j.get("query", {}).get("pages", {}).values()), {}) + qid = pg.get("pageprops", {}).get("wikibase_item") + self._cat_qid[cat_title] = qid + return qid + + def qid_info(self, qid: str) -> dict: + if qid in self._qid_info: + return self._qid_info[qid] + j = self._get(ci.WIKIDATA_API, { + "action": "wbgetentities", "ids": qid, "format": "json", + "props": "sitelinks|descriptions|claims", + }) + e = j.get("entities", {}).get(qid, {}) + claims = e.get("claims", {}) + def _ids(prop): + out = [] + for s in claims.get(prop, []): + v = s.get("mainsnak", {}).get("datavalue", {}).get("value") + if isinstance(v, dict) and "id" in v: + out.append(v["id"]) + return out + p18 = None + for s in claims.get("P18", []): + v = s.get("mainsnak", {}).get("datavalue", {}).get("value") + if isinstance(v, str): + p18 = v + break + enwiki = e.get("sitelinks", {}).get("enwiki", {}).get("title") + info = { + "qid": qid, + "enwiki_title": enwiki, + "description": e.get("descriptions", {}).get("en", {}).get("value"), + "p31": _ids("P31"), + "p106": _ids("P106"), + "image": p18, + } + self._qid_info[qid] = info + return info 
+ + def name_search_qids(self, name: str) -> list[str]: + if name in self._name_search: + return self._name_search[name] + j = self._get(ci.WIKIDATA_API, { + "action": "wbsearchentities", "search": name, "language": "en", + "format": "json", "type": "item", "limit": NAME_SEARCH_LIMIT, + }) + qids = [h["id"] for h in j.get("search", []) if h.get("id")] + self._name_search[name] = qids + return qids + + +# --------------------------------------------------------------------------- # +# Candidate construction +# --------------------------------------------------------------------------- # + +def _wiki_url(enwiki_title: str | None) -> str | None: + """A real biography article URL, or None for missing / Category: sitelinks.""" + if not enwiki_title or enwiki_title.startswith("Category:"): + return None + return "https://en.wikipedia.org/wiki/" + enwiki_title.replace(" ", "_") + + +def _candidate_from_info(info: dict, *, method: str, commons_category: str | None) -> dict | None: + if WIKIMEDIA_CATEGORY_QID in info["p31"]: + return None # a topic category, not a person/group + url = _wiki_url(info["enwiki_title"]) + if not url: + return None # no usable biography article -> not a Wikipedia-link candidate + thumb = None + if info["image"]: + thumb = (f"https://commons.wikimedia.org/wiki/Special:FilePath/" + f"{info['image'].replace(' ', '_')}?width=180") + return { + "method": method, + "commons_category": commons_category, + "wikidata_qid": info["qid"], + "wikipedia_url": url, + "title": info["enwiki_title"], + "description": info["description"], + "is_human": "Q5" in info["p31"], + "thumb": thumb, + } + + +def _norm(s: str) -> str: + return "".join(ch for ch in s.lower() if ch.isalnum()) + + +def derive_candidates(wm: WM, name: str, files: list[str]) -> list[dict]: + candidates: list[dict] = [] + seen_qids: set[str] = set() + + # 1. Category-derived (the implicit link). 
+ categories: list[str] = [] + for ftitle in files[:MAX_FILES_FOR_CATEGORIES]: + for c in wm.file_categories(ftitle): + if c not in categories: + categories.append(c) + for cat in categories: + qid = wm.category_qid(cat) + if not qid or qid in seen_qids: + continue + cand = _candidate_from_info(wm.qid_info(qid), method="category", + commons_category=cat) + if cand: + seen_qids.add(qid) + candidates.append(cand) + + # 2. Name-search fallback — only when no category produced a candidate. + if not candidates: + for qid in wm.name_search_qids(name): + if qid in seen_qids: + continue + cand = _candidate_from_info(wm.qid_info(qid), method="name_search", + commons_category=None) + if cand: + seen_qids.add(qid) + candidates.append(cand) + + # Rank: category before name-search; human before non-human; exact-name match first. + target = _norm(name) + def key(c): + return ( + 0 if c["method"] == "category" else 1, + 0 if c["is_human"] else 1, + 0 if _norm(c["title"]) == target else 1, + ) + candidates.sort(key=key) + return candidates + + +# --------------------------------------------------------------------------- # +# Main +# --------------------------------------------------------------------------- # + +def _load_subset() -> list[dict]: + """Returns [{performer_id, name, images:[{url,page,title}]}] for the subset.""" + with get_db_connection() as conn, conn.cursor() as cur: + cur.execute(_SUBSET_SQL) + rows = cur.fetchall() + grouped: dict[str, dict] = {} + order: list[str] = [] + for r in rows: + pid = str(r["id"]) + g = grouped.get(pid) + if g is None: + g = {"performer_id": pid, "name": r["name"], "images": []} + grouped[pid] = g + order.append(pid) + spu = r["source_page_url"] or "" + title = spu.split("/wiki/", 1)[1] if "/wiki/" in spu else None + g["images"].append({"url": r["url"], "page": spu, "title": title}) + return [grouped[pid] for pid in order] + + +def main() -> None: + p = argparse.ArgumentParser( + description="Build the performer->Wikipedia verification 
queue JSON.", + formatter_class=argparse.RawDescriptionHelpFormatter, epilog=__doc__) + p.add_argument("--limit", type=int, default=None, + help="Cap the number of performers (alphabetical) for a first pass") + p.add_argument("--delay", type=float, default=0.1, + help="Seconds between Wikimedia API calls (politeness; default 0.1)") + p.add_argument("-o", "--output", default=None, + help="Output queue JSON path (default: " + "data/ground_truth/wikipedia_queue_.json)") + args = p.parse_args() + + subset = _load_subset() + logger.info("Subset: %d performer(s) with Commons imagery and no Wikipedia link", + len(subset)) + if args.limit: + subset = subset[: args.limit] + logger.info("Capped to %d performer(s)", len(subset)) + + wm = WM(ci.make_session(), delay=args.delay) + records = [] + with_candidate = 0 + for idx, perf in enumerate(subset, 1): + file_titles = [img["title"] and f"File:{img['title'].split('File:')[-1]}" + for img in perf["images"] if img.get("title")] + file_titles = [f for f in file_titles if f] + candidates = derive_candidates(wm, perf["name"], file_titles) + if candidates: + with_candidate += 1 + evidence = [] + for img in perf["images"][:MAX_EVIDENCE_IMAGES]: + t = img.get("title") + thumb = (f"https://commons.wikimedia.org/wiki/Special:FilePath/" + f"{t.split('File:')[-1]}?width=180") if t and "File:" in t else img["url"] + evidence.append({"thumb": thumb, "page": img["page"], "title": t}) + records.append({ + "performer_id": perf["performer_id"], + "name": perf["name"], + "evidence_images": evidence, + "candidates": candidates, + }) + if idx % 25 == 0 or idx == len(subset): + logger.info("Processed %d/%d performer(s); %d with a candidate so far", + idx, len(subset), with_candidate) + + out_dir = REPO_ROOT / "data" / "ground_truth" + out_dir.mkdir(parents=True, exist_ok=True) + ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + out_path = Path(args.output) if args.output else out_dir / f"wikipedia_queue_{ts}.json" + payload = { + 
"schema": "performer_wikipedia_queue/v1", + "generated_at": datetime.now(timezone.utc).isoformat(), + "performer_count": len(records), + "with_candidate": with_candidate, + "records": records, + } + out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + logger.info("Wrote %s — %d performer(s), %d with >=1 candidate", + out_path, len(records), with_candidate) + print(out_path) + + +if __name__ == "__main__": + main() diff --git a/backend/scripts/build_wikipedia_groundtruth_viewer.py b/backend/scripts/build_wikipedia_groundtruth_viewer.py new file mode 100644 index 0000000..1cf2aa3 --- /dev/null +++ b/backend/scripts/build_wikipedia_groundtruth_viewer.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python3 +""" +Build a self-contained HTML verification UI from a wikipedia_queue_*.json. + +Input is the queue produced by build_wikipedia_groundtruth_queue.py: performers +with Commons imagery but no Wikipedia link, each with evidence thumbnails and +candidate Wikipedia articles (category-derived, with name-search fallbacks). + +The page lets you, per performer: + - look at the Commons photos we already hold (the evidence), + - pick the candidate Wikipedia article that matches them (or paste a custom + URL, or mark "no match"), + - and export the confirmed set as a GROUND-TRUTH JSON — human-verified links, + stamped with manual provenance, suitable for re-ingest or for diffing + against automated crawlers. + +Decisions persist in the browser (localStorage, keyed by the queue filename), +so you can close and resume. Nothing is written back to the database; the +exported JSON is the deliverable. + +Usage: + python scripts/build_wikipedia_groundtruth_viewer.py data/ground_truth/wikipedia_queue_.json + python scripts/build_wikipedia_groundtruth_viewer.py queue.json -o verify.html +""" + +import argparse +import json +from pathlib import Path + + +_HTML = r""" + + + + +__TITLE__ + + + +
+

Wikipedia ground-truth

+ decided 0/0 · verified 0 · no-match 0 + + + + +
+
+ + + +""" + + +def main() -> None: + p = argparse.ArgumentParser( + description="Build the Wikipedia ground-truth verification HTML from a queue JSON.", + formatter_class=argparse.RawDescriptionHelpFormatter, epilog=__doc__) + p.add_argument("queue", help="wikipedia_queue_.json path") + p.add_argument("-o", "--output", default=None, help="Output HTML (default: .html)") + args = p.parse_args() + + qpath = Path(args.queue) + if not qpath.exists(): + raise SystemExit(f"No such file: {qpath}") + data = json.loads(qpath.read_text(encoding="utf-8")) + + html = (_HTML + .replace("__TITLE__", qpath.name) + .replace("__STORAGE_KEY__", qpath.stem) + .replace("__SOURCE__", qpath.name) + .replace("/*DATA*/", json.dumps(data, ensure_ascii=False))) + + out = Path(args.output) if args.output else qpath.with_suffix(".html") + out.write_text(html, encoding="utf-8") + recs = data.get("records", []) + withc = sum(1 for r in recs if r.get("candidates")) + print(f"Wrote {out} — {len(recs)} performer(s), {withc} with >=1 candidate") + print(f"Open it with: open {out}") + + +if __name__ == "__main__": + main() diff --git a/backend/tests/test_performer_commons_imagery.py b/backend/tests/test_performer_commons_imagery.py index 26f0cf7..c93a09e 100644 --- a/backend/tests/test_performer_commons_imagery.py +++ b/backend/tests/test_performer_commons_imagery.py @@ -34,8 +34,9 @@ PERFORMER_NEVER = _NS.format(0x00001) # last_imagery_check NULL -> due PERFORMER_STALE = _NS.format(0x00002) # checked 100 days ago -> due (>90d) PERFORMER_FRESH = _NS.format(0x00003) # checked 10 days ago -> not due +PERFORMER_NOWIKI = _NS.format(0x00004) # never checked but no Wikipedia URL -> skipped -_ALL_FIXTURE_IDS = (PERFORMER_NEVER, PERFORMER_STALE, PERFORMER_FRESH) +_ALL_FIXTURE_IDS = (PERFORMER_NEVER, PERFORMER_STALE, PERFORMER_FRESH, PERFORMER_NOWIKI) def _cleanup(conn): @@ -58,19 +59,29 @@ def perf_fixture(db): _cleanup(db) with db.cursor() as cur: cur.execute( - "INSERT INTO performers (id, name, 
last_imagery_check) " - "VALUES (%s, %s, NULL)", - (PERFORMER_NEVER, "Never Checked"), + "INSERT INTO performers (id, name, wikipedia_url, last_imagery_check) " + "VALUES (%s, %s, %s, NULL)", + (PERFORMER_NEVER, "Never Checked", + "https://en.wikipedia.org/wiki/Never_Checked"), ) cur.execute( - "INSERT INTO performers (id, name, last_imagery_check) " - "VALUES (%s, %s, now() - make_interval(days => 100))", - (PERFORMER_STALE, "Stale Checked"), + "INSERT INTO performers (id, name, wikipedia_url, last_imagery_check) " + "VALUES (%s, %s, %s, now() - make_interval(days => 100))", + (PERFORMER_STALE, "Stale Checked", + "https://en.wikipedia.org/wiki/Stale_Checked"), + ) + cur.execute( + "INSERT INTO performers (id, name, wikipedia_url, last_imagery_check) " + "VALUES (%s, %s, %s, now() - make_interval(days => 10))", + (PERFORMER_FRESH, "Fresh Checked", + "https://en.wikipedia.org/wiki/Fresh_Checked"), ) + # Due (never checked) but has no Wikipedia URL: the sweep must skip it, + # since the Commons resolver only trusts a validated Wikipedia article. cur.execute( "INSERT INTO performers (id, name, last_imagery_check) " - "VALUES (%s, %s, now() - make_interval(days => 10))", - (PERFORMER_FRESH, "Fresh Checked"), + "VALUES (%s, %s, NULL)", + (PERFORMER_NOWIKI, "No Wiki"), ) db.commit() yield @@ -116,6 +127,11 @@ def test_find_candidates_includes_never_and_stale_not_fresh(self, perf_fixture): assert PERFORMER_STALE in candidates assert PERFORMER_FRESH not in candidates + def test_find_candidates_excludes_performers_without_wikipedia(self, perf_fixture): + # Due by staleness, but no Wikipedia URL -> not a candidate. + candidates = sweep_mod.find_candidate_performer_ids() + assert PERFORMER_NOWIKI not in candidates + def test_stale_days_window_excludes_within_window(self, perf_fixture): # With a 200-day window, the 100-day-stale row is no longer due; # the never-checked row always is. 
diff --git a/data/ground_truth/README.md b/data/ground_truth/README.md new file mode 100644 index 0000000..a48b4a4 --- /dev/null +++ b/data/ground_truth/README.md @@ -0,0 +1,95 @@ +# Ground-truth datasets + +Human-verified reference data, kept deliberately separate from anything a +crawler produces automatically. The distinguishing marker is provenance: every +record here carries `"method": "manual"`, meaning a person looked at the +evidence and confirmed it. These files are the authoritative source for +re-ingest and for diffing against automated crawlers. + +## What's tracked vs ignored + +Committed (authoritative): +- `README.md` — this file. +- `performer_wikipedia_groundtruth*.json` — verified performer → Wikipedia links. + +Ignored (regenerable scratch — see `.gitignore`): +- `wikipedia_queue_*.json` — the verification *worklist* (rebuilt from the DB). +- `*.html` — generated verification viewers. + +## Pipeline: performer → Wikipedia links + +Goal: for performers that have Commons imagery but **no** Wikipedia link on +record, confirm the correct Wikipedia article so the system gets smarter about +them going forward. + +``` +backend/scripts/build_wikipedia_groundtruth_queue.py # DB + Wikimedia -> queue JSON +backend/scripts/build_wikipedia_groundtruth_viewer.py # queue JSON -> verification HTML +# (review in browser, click Export) # decisions -> ground-truth JSON +``` + +Candidates are **category-derived** first — the Commons category the +performer's own photos sit in → its Wikidata item → the English Wikipedia +sitelink (the "implicit" link) — falling back to a Wikidata name search when no +category yields a real biography article. 
+ +### Schema: `performer_wikipedia_queue/v1` (worklist, ignored) + +```jsonc +{ + "schema": "performer_wikipedia_queue/v1", + "generated_at": "<ISO-8601 UTC timestamp>", + "performer_count": 1224, + "with_candidate": 812, + "records": [ + { + "performer_id": "<performer id>", + "name": "?uestlove", + "evidence_images": [ { "thumb": "<thumbnail URL>", "page": "<Commons file page URL>", "title": "File:…" } ], + "candidates": [ + { + "method": "category" | "name_search", + "commons_category": "Category:Questlove" | null, + "wikidata_qid": "Q263024", + "wikipedia_url": "https://en.wikipedia.org/wiki/Questlove", + "title": "Questlove", + "description": "American hip hop musician, record producer and DJ", + "is_human": true, + "thumb": "<thumbnail URL>" | null + } + ] + } + ] +} +``` + +### Schema: `performer_wikipedia_groundtruth/v1` (authoritative, committed) + +Exported from the viewer; only performers the reviewer actually decided on are +included. `no_match` is a real, useful decision (a crawler proposing a link +there is wrong). + +```jsonc +{ + "schema": "performer_wikipedia_groundtruth/v1", + "exported_at": "<export timestamp>", + "source_queue": "wikipedia_queue_<YYYYMMDD_HHMMSS>.json", + "record_count": 137, + "records": { + "<performer id>": { + "name": "?uestlove", + "status": "verified" | "no_match", + "wikipedia_url": "https://en.wikipedia.org/wiki/Questlove" | null, + "wikidata_qid": "Q263024" | null, + "method": "manual", + "candidate_method": "category" | "name_search" | "custom" | null, + "evidence": { "commons_category": "Category:Questlove" | null }, + "verified_at": "<verification timestamp>" + } + } +} +``` + +Re-ingest (not yet built): read this file, `UPDATE performers SET wikipedia_url` +for `status == "verified"` rows, stamping a manual-provenance marker +(e.g. `updated_by = 'groundtruth_manual'`) so the distinction survives in the DB. 
diff --git a/sql/migrations/add_commons_imagery_enrichment.sql b/sql/migrations/add_commons_imagery_enrichment.sql deleted file mode 100644 index a6a02d1..0000000 --- a/sql/migrations/add_commons_imagery_enrichment.sql +++ /dev/null @@ -1,73 +0,0 @@ --- ============================================================================ --- Migration: Commons imagery enrichment --- Description: --- Adds the producer/handler support for the ('commons', --- 'enrich_performer_imagery') research job: --- --- 1. performers.last_imagery_check — when the performer was last swept for --- Commons imagery. The producer (core/performer_commons_imagery.py) --- treats NULL or "older than the staleness window" as due; the handler --- (research_worker/handlers/commons.py) stamps it now() on every --- completion (even a no-op). --- --- 2. A 'commons' daily quota row in source_quotas, used by the handler to --- cap the number of paid Claude vision-rerank calls per day. One unit = --- one image reranked. When the budget is spent the worker reschedules --- the job for the next reset (QuotaExhausted). --- --- Idempotent: ADD COLUMN IF NOT EXISTS + INSERT ... ON CONFLICT DO NOTHING, so --- safe to re-run. --- --- Run: psql $DATABASE_URL -f sql/migrations/add_commons_imagery_enrichment.sql --- ============================================================================ - -BEGIN; - --- ---------------------------------------------------------------------------- --- 1. performers.last_imagery_check --- ---------------------------------------------------------------------------- - -ALTER TABLE performers - ADD COLUMN IF NOT EXISTS last_imagery_check TIMESTAMPTZ; - -COMMENT ON COLUMN performers.last_imagery_check IS - 'When this performer was last swept for Commons imagery by the ' - '(commons, enrich_performer_imagery) research job. NULL = never checked. 
' - 'The producer enqueues performers that are NULL or older than the ' - 'staleness window (default 90 days); the handler stamps now() on every ' - 'completion.'; - --- Supports the producer''s "due" scan (NULL-first / oldest-first). -CREATE INDEX IF NOT EXISTS idx_performers_last_imagery_check - ON performers (last_imagery_check NULLS FIRST); - --- ---------------------------------------------------------------------------- --- 2. 'commons' daily quota (caps paid Claude rerank calls/day) --- ---------------------------------------------------------------------------- --- units_limit = max images reranked per day across all performers. At the --- handler default rerank_cap of 12 images/performer, 2000 covers ~166 --- performers/day. Tune with: --- UPDATE source_quotas SET units_limit = WHERE source = 'commons'; --- resets_at uses the default 'day' window (next UTC midnight), matching --- research_worker/quota.py's _DEFAULT_RESET_SQL. - -INSERT INTO source_quotas (source, window_name, units_used, units_limit, resets_at) -VALUES ( - 'commons', - 'day', - 0, - 2000, - (date_trunc('day', now()) + interval '1 day') -) -ON CONFLICT (source, window_name) DO NOTHING; - -COMMIT; - --- ============================================================================ --- ROLLBACK (manual) --- ============================================================================ --- BEGIN; --- DROP INDEX IF EXISTS idx_performers_last_imagery_check; --- ALTER TABLE performers DROP COLUMN IF EXISTS last_imagery_check; --- DELETE FROM source_quotas WHERE source = 'commons'; --- COMMIT;