From 407398439fab56774274abdcf48f4524def3848e Mon Sep 17 00:00:00 2001 From: Jon Froehlich Date: Wed, 17 Jun 2026 16:43:33 -0700 Subject: [PATCH 1/5] feat(people): add merge_duplicate_people command + url_name helper (#1275) Phase B core tooling for consolidating duplicate Person records: - name_utils.build_unique_url_name(): derive a unique url_name preferring a readable middle-initial differentiator (jasminexzhang) for namesakes, with the legacy numeric suffix (jasminezhang2) as fallback. - merge_duplicate_people management command: decisions-CSV-driven, dry-run by default (--apply to write). Relocates every relation pointing at Person via a generic Person._meta.get_fields() walk (FKs incl. the advisor/co_advisor/ grad_mentor self-refs; sorted-M2M author/recipient sets with order preserved via .set(); plain News.people M2M), backfills only-blank scalar fields, leaves the canonical record's image untouched, is atomic per row and idempotent. - Unit + DatabaseTestCase regression tests: author-order preservation, dedup, FK/self-ref/Grant/Award/News reassignment, blanks-only backfill, dry-run no-op, idempotent re-run. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../commands/merge_duplicate_people.py | 273 ++++++++++++++++++ website/tests/test_merge_duplicate_people.py | 174 +++++++++++ website/tests/test_name_utils.py | 71 +++++ website/utils/name_utils.py | 55 ++++ 4 files changed, 573 insertions(+) create mode 100644 website/management/commands/merge_duplicate_people.py create mode 100644 website/tests/test_merge_duplicate_people.py create mode 100644 website/tests/test_name_utils.py diff --git a/website/management/commands/merge_duplicate_people.py b/website/management/commands/merge_duplicate_people.py new file mode 100644 index 00000000..3d83f993 --- /dev/null +++ b/website/management/commands/merge_duplicate_people.py @@ -0,0 +1,273 @@ +""" +Merge duplicate ``Person`` records, driven by a reviewed decisions file. + +Part of the issue #1275 dedup work. Over ~15 years the database accumulated +duplicate ``Person`` rows (one human, several rows from un-deduped imports). +This command consolidates each cluster into one canonical record, relocating +**every** related object (publications, talks, posters, grants, awards, +positions, project roles, news, and the advisor/co-advisor/grad-mentor +self-references) before deleting the now-empty duplicate. + +Why a management command (not a data migration): migrations are gitignored and +regenerated per-environment here, so the established pattern is a one-shot +command verified via the logs (see ``generate_slugs_for_old_news_items`` etc.). + +Safety model: + * **Dry-run by default.** Without ``--apply`` it prints the plan and changes + nothing. Pass ``--apply`` to actually mutate. + * **Idempotent.** A ``merge`` whose source row is already gone is a no-op, so + re-running an applied decisions file is safe. + * **Atomic per row.** Each merge runs in ``transaction.atomic()``. + * **Generic relation walk.** Relations are discovered via + ``Person._meta.get_fields()`` rather than hardcoded, so a new FK/M2M to + Person can't silently orphan data. + +Decisions file: CSV with columns ``source_id, action, target_id, note``. + * ``merge`` — relocate everything from ``source_id`` onto ``target_id``, then + delete ``source_id``. + * ``delete`` — delete ``source_id`` (refused unless it has zero references). + * ``keep`` — no-op; documents a reviewed namesake to leave alone. +Ids only (no emails) so the reviewed file is safe to commit to the public repo; +member names are already public. + +Usage: + python manage.py merge_duplicate_people --decisions dedup_decisions.csv + python manage.py merge_duplicate_people --decisions dedup_decisions.csv --apply +""" + +import csv +import logging + +from django.core.management.base import BaseCommand, CommandError +from django.db import transaction + +from website.models import Person + +_logger = logging.getLogger(__name__) + +# Scalar (non-relational) fields whose *blank* value on the target is backfilled +# from the source during a merge. We never overwrite a populated target field, +# and we deliberately leave the image/cropping fields and the derived url_name / +# bio_datetime_modified alone (the target is the chosen canonical record; its +# headshot stays put — see module docstring / issue #1275). +SCALAR_BACKFILL_FIELDS = [ + 'email', 'personal_website', 'github', 'twitter', 'linkedin', + 'mastodon', 'threads', 'bluesky', 'bio', 'next_position', + 'next_position_url', +] + + +def _person_reverse_relations(): + """ + Return ``(fk_rels, m2m_rels)`` — the auto-created reverse relations that + point at ``Person``, discovered generically from the model meta. + + ``fk_rels`` are reverse foreign keys / one-to-ones (``ManyToOneRel`` / + ``OneToOneRel``), including the three Position advisor self-references. + ``m2m_rels`` are reverse many-to-manys (``ManyToManyRel``), e.g. the sorted + ``authors`` / ``recipients`` sets and the plain ``News.people`` set. + """ + fk_rels, m2m_rels = [], [] + for field in Person._meta.get_fields(): + if not (field.is_relation and field.auto_created and not field.concrete): + continue + if field.many_to_many: + m2m_rels.append(field) + elif field.one_to_many or field.one_to_one: + fk_rels.append(field) + return fk_rels, m2m_rels + + +def count_references(person): + """Total number of objects across all relations pointing at ``person``.""" + fk_rels, m2m_rels = _person_reverse_relations() + total = 0 + for field in fk_rels: + total += field.related_model.objects.filter(**{field.field.name: person}).count() + for field in m2m_rels: + total += getattr(person, field.get_accessor_name()).count() + return total + + +class Command(BaseCommand): + help = 'Merge duplicate Person records from a reviewed decisions CSV (dry-run by default).' + + def add_arguments(self, parser): + parser.add_argument( + '--decisions', required=True, + help='Path to the decisions CSV (columns: source_id, action, target_id, note).', + ) + parser.add_argument( + '--apply', action='store_true', + help='Actually perform the merges/deletes. Without this flag the command is a dry-run.', + ) + + def handle(self, *args, **options): + decisions = self._read_decisions(options['decisions']) + apply = options['apply'] + + mode = 'APPLY' if apply else 'DRY-RUN' + self.stdout.write(f"=== merge_duplicate_people [{mode}] — {len(decisions)} decision(s) ===") + + for row in decisions: + action = row['action'] + if action == 'merge': + self._handle_merge(row, apply) + elif action == 'delete': + self._handle_delete(row, apply) + elif action == 'keep': + self.stdout.write(f" keep source={row['source_id']} (namesake / leave alone){self._note(row)}") + else: + self.stdout.write(self.style.WARNING( + f" SKIP unknown action {action!r} for source={row['source_id']}")) + + if not apply: + self.stdout.write(self.style.WARNING( + "\nDry-run only — no changes made. Re-run with --apply to perform these actions.")) + + # ----- decisions parsing ------------------------------------------------- + + def _read_decisions(self, path): + """Parse and validate the decisions CSV into a list of normalized rows.""" + try: + with open(path, newline='', encoding='utf-8') as fh: + rows = list(csv.DictReader(fh)) + except OSError as exc: + raise CommandError(f"Could not read decisions file {path!r}: {exc}") + + required = {'source_id', 'action'} + if not rows or not required.issubset({k.strip() for k in rows[0].keys()}): + raise CommandError( + "Decisions CSV must have a header row with at least " + "'source_id' and 'action' columns (plus 'target_id', 'note').") + + cleaned = [] + for i, raw in enumerate(rows, start=2): # row 1 is the header + row = {(k or '').strip(): (v or '').strip() for k, v in raw.items()} + if not row.get('source_id'): + continue # allow blank spacer rows + row['action'] = row.get('action', '').lower() + if row['action'] == 'merge' and not row.get('target_id'): + raise CommandError(f"Row {i}: action 'merge' requires a target_id.") + cleaned.append(row) + return cleaned + + @staticmethod + def _note(row): + return f" # {row['note']}" if row.get('note') else '' + + # ----- actions ----------------------------------------------------------- + + def _handle_merge(self, row, apply): + source = self._get_person(row['source_id']) + if source is None: + self.stdout.write(f" no-op source={row['source_id']} already gone (merged?){self._note(row)}") + return + target = self._get_person(row['target_id']) + if target is None: + self.stdout.write(self.style.ERROR( + f" ERROR target={row['target_id']} not found for source={row['source_id']} — skipping")) + return + if source.pk == target.pk: + self.stdout.write(self.style.ERROR( + f" ERROR source and target are the same person ({source.pk}) — skipping")) + return + + if not apply: + self.stdout.write( + f" merge {source.pk} ({source.get_full_name()}, refs={count_references(source)}) " + f"-> {target.pk} ({target.get_full_name()}, refs={count_references(target)})" + f"{self._note(row)}") + return + + with transaction.atomic(): + summary, backfilled = self._merge(source, target) + moved = ', '.join(f"{k}={v}" for k, v in summary.items()) or 'no related objects' + backfill = f"; backfilled {', '.join(backfilled)}" if backfilled else '' + msg = f"merged {row['source_id']} into {target.pk}: {moved}{backfill}" + self.stdout.write(self.style.SUCCESS(f" {msg}")) + _logger.info("merge_duplicate_people: %s", msg) + + def _handle_delete(self, row, apply): + source = self._get_person(row['source_id']) + if source is None: + self.stdout.write(f" no-op source={row['source_id']} already gone{self._note(row)}") + return + refs = count_references(source) + if refs != 0: + self.stdout.write(self.style.ERROR( + f" ERROR refusing to delete {source.pk} ({source.get_full_name()}) — " + f"has {refs} reference(s); merge it instead")) + return + if not apply: + self.stdout.write(f" delete {source.pk} ({source.get_full_name()}, refs=0){self._note(row)}") + return + source.delete() + self.stdout.write(self.style.SUCCESS(f" deleted {row['source_id']} (was a 0-ref shell)")) + _logger.info("merge_duplicate_people: deleted shell %s", row['source_id']) + + # ----- core merge -------------------------------------------------------- + + def _merge(self, source, target): + """ + Relocate every relation from ``source`` onto ``target``, backfill blank + scalar fields, then delete ``source``. Returns ``(summary, backfilled)``. + Must run inside an atomic block. + """ + fk_rels, m2m_rels = _person_reverse_relations() + summary = {} + + # Reverse FKs (incl. advisor / co_advisor / grad_mentor self-refs): just + # repoint the foreign key. SET_NULL/CASCADE on_delete is irrelevant once + # nothing points at source. + for field in fk_rels: + model, fk = field.related_model, field.field.name + n = model.objects.filter(**{fk: source}).update(**{fk: target}) + if n: + summary[f"{model.__name__}.{fk}"] = n + + # Reverse M2Ms: rebuild each related object's ordered author/recipient/ + # people list with source swapped for target (dropping source if target + # is already present — dedup). For SortedManyToManyField, .set() assigns + # sort_value by list position, so author order is preserved. + for field in m2m_rels: + forward = field.field.name # 'authors' / 'recipients' / 'people' + moved = 0 + for obj in list(getattr(source, field.get_accessor_name()).all()): + manager = getattr(obj, forward) + new_members, seen = [], set() + for member in manager.all(): # ordered for sorted M2M + replacement = target if member.pk == source.pk else member + if replacement.pk not in seen: + seen.add(replacement.pk) + new_members.append(replacement) + manager.set(new_members) + moved += 1 + if moved: + summary[f"{field.related_model.__name__}.{forward}"] = moved + + # Scalar backfill: fill target's blanks from source; never overwrite. + backfilled = [] + for fld in SCALAR_BACKFILL_FIELDS: + if not getattr(target, fld) and getattr(source, fld): + setattr(target, fld, getattr(source, fld)) + backfilled.append(fld) + if backfilled: + # Bypass Person.save() (avoids url_name recompute + Star Wars image + # side effects); we only want the scalar columns written. + Person.objects.filter(pk=target.pk).update( + **{fld: getattr(target, fld) for fld in backfilled}) + + # Delete the now-orphaned source. Its pre_delete signal removes source's + # own image file (target's image is untouched, so nothing shared breaks). + source.delete() + return summary, backfilled + + @staticmethod + def _get_person(pk): + if not pk: + return None + try: + return Person.objects.get(pk=int(pk)) + except (Person.DoesNotExist, ValueError): + return None diff --git a/website/tests/test_merge_duplicate_people.py b/website/tests/test_merge_duplicate_people.py new file mode 100644 index 00000000..2e5477d2 --- /dev/null +++ b/website/tests/test_merge_duplicate_people.py @@ -0,0 +1,174 @@ +""" +Regression tests for the merge_duplicate_people management command (#1275). + +Verifies that merging a duplicate Person B into a canonical Person A relocates +EVERY relation type (FK, the advisor self-references, sorted-M2M author sets, +plain M2M), preserves publication author order, backfills only-blank scalar +fields, and is both dry-run-safe and idempotent. +""" + +import csv +import tempfile +from datetime import date + +from django.core.management import call_command + +from website.models import ( + Award, + Grant, + News, + Person, + Position, + ProjectRole, + Sponsor, +) +from website.models.position import Title +from website.tests.base import DatabaseTestCase + + +def _decisions_file(rows): + """Write a decisions CSV to a temp path and return it.""" + fh = tempfile.NamedTemporaryFile( + mode='w', suffix='.csv', delete=False, newline='', encoding='utf-8') + writer = csv.DictWriter(fh, fieldnames=['source_id', 'action', 'target_id', 'note']) + writer.writeheader() + for row in rows: + writer.writerow(row) + fh.close() + return fh.name + + +class MergeDuplicatePeopleTests(DatabaseTestCase): + def setUp(self): + # A = canonical/target, B = duplicate/source. + self.target = self.make_person("Jennifer", "Mankoff") + self.source = self.make_person("Jennifer", "Mankoff") + + def _merge(self, apply=True, action='merge', target_id=None): + path = _decisions_file([{ + 'source_id': self.source.pk, + 'action': action, + 'target_id': target_id if target_id is not None else self.target.pk, + 'note': 'test', + }]) + call_command('merge_duplicate_people', decisions=path, apply=apply) + + # ----- FK + advisor self-ref reassignment -------------------------------- + + def test_merge_reassigns_fk_and_advisor_self_refs(self): + other = self.make_person("Some", "Advisee") + # B is the person on one position, and the advisor on another. + own = Position.objects.create( + person=self.source, start_date=date(2020, 1, 1), title=Title.PHD_STUDENT) + advised = Position.objects.create( + person=other, advisor=self.source, co_advisor=self.source, + grad_mentor=self.source, start_date=date(2020, 1, 1), title=Title.PHD_STUDENT) + project = self.make_project("Proj") + role = ProjectRole.objects.create( + person=self.source, project=project, start_date=date(2020, 1, 1)) + news = self.make_news_item(author=self.source) + + self._merge() + + self.assertFalse(Person.objects.filter(pk=self.source.pk).exists()) + own.refresh_from_db() + advised.refresh_from_db() + role.refresh_from_db() + news.refresh_from_db() + self.assertEqual(own.person, self.target) + self.assertEqual(advised.advisor, self.target) + self.assertEqual(advised.co_advisor, self.target) + self.assertEqual(advised.grad_mentor, self.target) + self.assertEqual(role.person, self.target) + self.assertEqual(news.author, self.target) + + # ----- sorted-M2M order preservation + dedup ----------------------------- + + def test_merge_preserves_publication_author_order(self): + first = self.make_person("Aa", "Author") + last = self.make_person("Zz", "Author") + pub = self.make_publication() + pub.authors.set([first, self.source, last]) # B sits in the middle slot + + self._merge() + + pub.refresh_from_db() + # A takes B's exact position; order otherwise unchanged. + self.assertEqual(list(pub.authors.all()), [first, self.target, last]) + + def test_merge_dedupes_when_target_already_coauthor(self): + pub = self.make_publication() + pub.authors.set([self.source, self.target]) # both already on the paper + + self._merge() + + pub.refresh_from_db() + self.assertEqual(list(pub.authors.all()), [self.target]) + + def test_merge_moves_award_recipients_and_grant_authors(self): + award = Award.objects.create(title="Best Paper", date=date(2020, 1, 1)) + award.recipients.set([self.source]) + grant = Grant.objects.create( + sponsor=Sponsor.objects.create(name="NSF"), + title="A Grant", date=date(2020, 1, 1)) + grant.authors.set([self.source]) + + self._merge() + + award.refresh_from_db() + grant.refresh_from_db() + self.assertEqual(list(award.recipients.all()), [self.target]) + self.assertEqual(list(grant.authors.all()), [self.target]) + + def test_merge_moves_plain_news_people_m2m(self): + news = self.make_news_item() + news.people.set([self.source]) + + self._merge() + + news.refresh_from_db() + self.assertEqual(list(news.people.all()), [self.target]) + + # ----- scalar backfill --------------------------------------------------- + + def test_scalar_backfill_fills_blanks_only(self): + Person.objects.filter(pk=self.target.pk).update( + email="keep@uw.edu", github="") + Person.objects.filter(pk=self.source.pk).update( + email="lose@uw.edu", github="sourcehub") + + self._merge() + + self.target.refresh_from_db() + self.assertEqual(self.target.email, "keep@uw.edu") # populated: untouched + self.assertEqual(self.target.github, "sourcehub") # blank: backfilled + + # ----- safety: dry-run + idempotency ------------------------------------- + + def test_dry_run_makes_no_changes(self): + pub = self.make_publication() + pub.authors.set([self.source]) + + self._merge(apply=False) + + self.assertTrue(Person.objects.filter(pk=self.source.pk).exists()) + pub.refresh_from_db() + self.assertEqual(list(pub.authors.all()), [self.source]) + + def test_second_apply_is_a_noop(self): + self.make_news_item(author=self.source) + self._merge() + self.assertFalse(Person.objects.filter(pk=self.source.pk).exists()) + # Re-running the same decisions file must not raise and must change nothing. + self._merge() + self.assertFalse(Person.objects.filter(pk=self.source.pk).exists()) + self.assertTrue(Person.objects.filter(pk=self.target.pk).exists()) + + def test_delete_action_refuses_when_refs_exist(self): + self.make_news_item(author=self.source) + path = _decisions_file([{ + 'source_id': self.source.pk, 'action': 'delete', + 'target_id': '', 'note': 'shell?'}]) + call_command('merge_duplicate_people', decisions=path, apply=True) + # Has a reference, so delete is refused — source survives. + self.assertTrue(Person.objects.filter(pk=self.source.pk).exists()) diff --git a/website/tests/test_name_utils.py b/website/tests/test_name_utils.py new file mode 100644 index 00000000..1bbb2cf6 --- /dev/null +++ b/website/tests/test_name_utils.py @@ -0,0 +1,71 @@ +""" +Unit tests for website.utils.name_utils (pure logic, no DB). + +Covers the url_name normalization key and the readable-URL derivation +(``build_unique_url_name``) used by ``Person.save()`` and the +``recompute_url_names`` command for the #1275 dedup / #1206 namesake work. +""" + +from django.test import SimpleTestCase + +from website.utils.name_utils import ( + build_unique_url_name, + normalize_person_name, +) + + +class NormalizePersonNameTests(SimpleTestCase): + def test_basic_lowercases_and_concatenates(self): + self.assertEqual(normalize_person_name('Jon', 'Froehlich'), 'jonfroehlich') + + def test_accents_folded_and_punctuation_stripped(self): + self.assertEqual(normalize_person_name('Renée', "O'Brien"), 'reneeobrien') + + def test_handles_none(self): + self.assertEqual(normalize_person_name(None, 'Zhang'), 'zhang') + + +class BuildUniqueUrlNameTests(SimpleTestCase): + def test_returns_bare_key_when_free(self): + never_taken = lambda u: False + self.assertEqual( + build_unique_url_name('Jasmine', 'Xin', 'Zhang', never_taken), + 'jasminezhang', + ) + + def test_prefers_middle_initial_on_collision(self): + taken = {'jasminezhang'}.__contains__ + self.assertEqual( + build_unique_url_name('Jasmine', 'Xin', 'Zhang', taken), + 'jasminexzhang', + ) + + def test_numeric_fallback_when_no_middle_name(self): + taken = {'jasminezhang'}.__contains__ + self.assertEqual( + build_unique_url_name('Jasmine', '', 'Zhang', taken), + 'jasminezhang2', + ) + + def test_numeric_fallback_when_middle_initial_also_taken(self): + taken = {'jasminezhang', 'jasminexzhang'}.__contains__ + self.assertEqual( + build_unique_url_name('Jasmine', 'Xin', 'Zhang', taken), + 'jasminezhang2', + ) + + def test_numeric_fallback_keeps_incrementing(self): + taken = {'jasminezhang', 'jasminezhang2', 'jasminezhang3'}.__contains__ + # No middle name, so it must skip past every taken numeric suffix. + self.assertEqual( + build_unique_url_name('Jasmine', None, 'Zhang', taken), + 'jasminezhang4', + ) + + def test_middle_initial_is_accent_folded(self): + # Middle name starting with an accented char folds to its ASCII initial. + taken = {'jonfroehlich'}.__contains__ + self.assertEqual( + build_unique_url_name('Jon', 'Über', 'Froehlich', taken), + 'jonufroehlich', + ) diff --git a/website/utils/name_utils.py b/website/utils/name_utils.py index 7ac5dca0..ca953410 100644 --- a/website/utils/name_utils.py +++ b/website/utils/name_utils.py @@ -51,6 +51,61 @@ def normalize_person_name(first_name, last_name): return re.sub('[^a-zA-Z]', '', cleaned) +def build_unique_url_name(first_name, middle_name, last_name, is_taken): + """ + Derive a unique ``url_name`` for a person, preferring readable URLs. + + Resolution order: + 1. The bare key ``normalize_person_name(first, last)`` (e.g. ``jasminezhang``) + if it isn't already taken. + 2. A **middle-initial differentiator** ``first + middle_initial + last`` + (e.g. ``jasminexzhang``) when a middle name is present and that form is + free — giving namesakes a stable, human-readable URL (issue #1206/#1275). + 3. A **numeric suffix** fallback (``jasminezhang2``, ``jasminezhang3`` …) — + the historical behavior, used when there's no usable middle initial or + the middle-initial form is itself taken. + + ``is_taken`` is a callable ``str -> bool`` that reports whether a candidate + ``url_name`` is already in use. The caller defines its semantics: when + re-deriving for an existing row it should exclude that row's own pk (mirroring + the ``.exclude(pk=self.pk)`` check in ``Person.save()``); when assigning in a + batch it should consult the names already handed out this pass. + + The result is always non-empty and lowercase alpha (plus an optional trailing + number), matching what ``Person.save()`` would store. + + Example: + >>> taken = {'jasminezhang'}.__contains__ + >>> build_unique_url_name('Jasmine', 'Xin', 'Zhang', taken) + 'jasminexzhang' + >>> build_unique_url_name('Jasmine', '', 'Zhang', taken) + 'jasminezhang2' + """ + base = normalize_person_name(first_name, last_name) + if not is_taken(base): + return base + + # Middle-initial differentiator: fold the first alpha char of the middle name + # the same way url_name derivation folds accents, then drop anything non-alpha. + middle = (middle_name or '').strip() + if middle: + initial = middle[0].lower() + initial = SPECIAL_CHARS.get(initial, initial) + initial = re.sub('[^a-z]', '', initial) + if initial: + first_key = normalize_person_name(first_name, '') + last_key = normalize_person_name('', last_name) + candidate = f"{first_key}{initial}{last_key}" + if candidate != base and not is_taken(candidate): + return candidate + + # Numeric-suffix fallback (matches the legacy Person.save() collision loop). + counter = 2 + while is_taken(f"{base}{counter}"): + counter += 1 + return f"{base}{counter}" + + def is_default_person_image(image_field): """ Return True if a Person image field looks like an auto-assigned default From aff5667cb84605161f54bfe610acaf77b6c40e3e Mon Sep 17 00:00:00 2001 From: Jon Froehlich Date: Wed, 17 Jun 2026 16:47:44 -0700 Subject: [PATCH 2/5] fix(people): de-collide url_names + harden member view (#1206/#1275) Namesake / collision safety net (Phase A), built on the shared build_unique_url_name helper: - Person.save(): derive url_name via build_unique_url_name (readable middle-initial differentiator for namesakes, numeric-suffix fallback); drops the inlined special_chars map + numeric loop (now dead code). - recompute_url_names command: re-derive a unique url_name for every Person in pk order (earliest keeps the bare name), writing via .update() to skip save() side effects. Idempotent, --dry-run; wired into docker-entrypoint.sh (4.8) so historical collisions self-heal on deploy. This is the durable #1206 fix. - member view: on the now-unreachable MultipleObjectsReturned, log loudly and raise a clean Http404 instead of the silent .first() band-aid (which could surface the wrong namesake and hide a real 500). - Tests: recompute de-collision (numeric + middle-initial), idempotency, dry-run no-op; save() namesake derivation; member page resolves 200 for each de-collided namesake and returns 404 (not 500) on an unresolved collision. Co-Authored-By: Claude Opus 4.8 (1M context) --- docker-entrypoint.sh | 5 + .../commands/recompute_url_names.py | 73 +++++++++++++ website/models/person.py | 45 +++----- website/tests/test_recompute_url_names.py | 102 ++++++++++++++++++ website/views/member.py | 18 ++-- 5 files changed, 202 insertions(+), 41 deletions(-) create mode 100644 website/management/commands/recompute_url_names.py create mode 100644 website/tests/test_recompute_url_names.py diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index e27fa2ff..3b324335 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -115,6 +115,11 @@ echo "4.7 Running 'python manage.py backfill_project_visibility' to resolve is_v echo "******************************************" python manage.py backfill_project_visibility +echo "****************** STEP 4.8/5: docker-entrypoint.sh ************************" +echo "4.8 Running 'python manage.py recompute_url_names' to de-collide historical url_names (#1206)" +echo "******************************************" +python manage.py recompute_url_names + # echo "****************** STEP 4.3/5: docker-entrypoint.sh ************************" # echo "4.3 Running 'python manage.py rename_person_images' to rename person images" # echo "******************************************" diff --git a/website/management/commands/recompute_url_names.py b/website/management/commands/recompute_url_names.py new file mode 100644 index 00000000..539d2c4e --- /dev/null +++ b/website/management/commands/recompute_url_names.py @@ -0,0 +1,73 @@ +""" +Recompute every Person.url_name so historical collisions de-collide (#1206). + +Person.url_name is derived in Person.save(), but the collision-resolution loop +only protects rows saved *after* it landed. Rows imported/created earlier and +never re-saved can still share a bare url_name, which causes +MultipleObjectsReturned -> HTTP 500 on /member//. This command +re-derives a unique url_name for every person using the same shared logic +(website.utils.name_utils.build_unique_url_name), giving namesakes stable, +readable URLs (a middle-initial differentiator where possible, else a numeric +suffix). + +People are processed in ascending pk order, so within a same-name cluster the +earliest record keeps the bare url_name (e.g. ``jasminezhang``) and later ones +get differentiated (``jasminexzhang`` / ``jasminezhang2``). Assignment is +deterministic, so the command is idempotent — re-running changes nothing. + +Writes use ``.update()`` to set only the url_name column, deliberately bypassing +Person.save() (no Star Wars image fallback, no bio_datetime_modified churn). +Runs on every container start (docker-entrypoint.sh) and is safe to re-run. + +Usage: + python manage.py recompute_url_names # apply + python manage.py recompute_url_names --dry-run # preview changes only +""" + +import logging + +from django.core.management.base import BaseCommand + +from website.models import Person +from website.utils.name_utils import build_unique_url_name + +_logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = "Re-derive a unique url_name for every Person (de-collides historical duplicates, #1206)." + + def add_arguments(self, parser): + parser.add_argument( + '--dry-run', action='store_true', + help='Report what would change without writing anything.', + ) + + def handle(self, *args, **options): + dry_run = options['dry_run'] + assigned = set() + changes = [] # (pk, old, new) for rows whose url_name actually changes + + # Ascending pk = creation order, so the earliest row in a name cluster + # keeps the bare url_name and later namesakes get differentiated. + for person in Person.objects.order_by('pk').only( + 'pk', 'first_name', 'middle_name', 'last_name', 'url_name'): + new_url_name = build_unique_url_name( + person.first_name, person.middle_name, person.last_name, + is_taken=assigned.__contains__, + ) + assigned.add(new_url_name) + if new_url_name != person.url_name: + changes.append((person.pk, person.url_name, new_url_name)) + + for pk, old, new in changes: + _logger.debug("recompute_url_names: %s -> %s (pk=%s)", old, new, pk) + self.stdout.write(f" {old or '(blank)'} -> {new} (pk={pk})") + if not dry_run: + Person.objects.filter(pk=pk).update(url_name=new) + + verb = 'would change' if dry_run else 'changed' + summary = f"recompute_url_names: {verb} {len(changes)} url_name(s)." + self.stdout.write(self.style.SUCCESS(summary)) + if not dry_run: + _logger.info(summary) diff --git a/website/models/person.py b/website/models/person.py index ee3fdc25..1a17a5cd 100644 --- a/website/models/person.py +++ b/website/models/person.py @@ -7,6 +7,7 @@ from django.core.files import File import website.utils.fileutils as ml_fileutils from website.utils.upload_validators import validate_image_upload +from website.utils.name_utils import build_unique_url_name from django.db.models.functions import Coalesce from django.conf import settings @@ -22,7 +23,6 @@ import os # for joining paths from uuid import uuid4 # for generating unique filenames -import re from datetime import date, timedelta from image_cropping import ImageRatioField @@ -36,15 +36,6 @@ # This retrieves a Python logging instance (or creates it) _logger = logging.getLogger(__name__) -# Special character mappings -special_chars = { - 'ã': 'a', 'à': 'a', 'â': 'a', - 'é': 'e', 'è': 'e', 'ê': 'e', - 'ñ': 'n', 'ń': 'n', - 'ö': 'o', 'ô': 'o', - 'û': 'u', 'ü': 'u', 'ù': 'u' -} - PERSON_THUMBNAIL_SIZE = (245, 245) def get_unique_filename_for_person(person, original_filename, append_str=None, force_unique=True): @@ -638,29 +629,17 @@ def __str__(self): def save(self, *args, **kwargs): - # First, automatically set the url_name field - # Substitute any common special characters. I haven't found a better automatic way to do - # this, so we are manually mapping 'common' special characters. - url_name_cleaned = (self.first_name + self.last_name).lower() - for c in url_name_cleaned: - if bool(re.search('[^a-zA-Z]', c)) and c in special_chars: - url_name_cleaned = url_name_cleaned.replace(c, special_chars.get(c)) - - # Clean remaining characters (EX: dashes, periods). - url_name_cleaned = re.sub('[^a-zA-Z]', '', url_name_cleaned) - - # Check for collisions and append numeric suffix if needed - # We exclude the current person (by pk) when checking for duplicates to allow updates - base_url_name = url_name_cleaned - counter = 2 - - # Keep incrementing counter until we find a unique url_name - while Person.objects.filter(url_name=url_name_cleaned).exclude(pk=self.pk).exists(): - url_name_cleaned = f"{base_url_name}{counter}" - counter += 1 - _logger.debug(f"URL name collision detected for {self.get_full_name()}. Trying {url_name_cleaned}") - - self.url_name = url_name_cleaned + # Automatically derive a unique url_name. The accent-folding + collision + # logic lives in build_unique_url_name (shared with recompute_url_names): + # it prefers the bare key (jonfroehlich), then a readable middle-initial + # differentiator for namesakes (jasminexzhang), then a numeric suffix + # (jasminezhang2). We exclude this row by pk so re-saving keeps its name. + self.url_name = build_unique_url_name( + self.first_name, self.middle_name, self.last_name, + is_taken=lambda candidate: ( + Person.objects.filter(url_name=candidate).exclude(pk=self.pk).exists() + ), + ) # Next, automatically set the bio_date_modified field if self.pk is not None: # checks if this is an existing object diff --git a/website/tests/test_recompute_url_names.py b/website/tests/test_recompute_url_names.py new file mode 100644 index 00000000..98df6042 --- /dev/null +++ b/website/tests/test_recompute_url_names.py @@ -0,0 +1,102 @@ +""" +Regression tests for url_name de-collision (#1206 / #1275): + +- the recompute_url_names management command, +- the refactored Person.save() derivation (middle-initial / numeric), and +- the hardened member view (clean 404 instead of a 500 or silent .first()). +""" + +from django.core.management import call_command +from django.urls import reverse + +from website.models import Person +from website.tests.base import DatabaseTestCase + + +def _force_shared_url_name(*people, url_name): + """Simulate historical pre-collision-loop rows sharing a bare url_name + (bypasses Person.save(), which would otherwise auto-de-collide).""" + Person.objects.filter(pk__in=[p.pk for p in people]).update(url_name=url_name) + + +class RecomputeUrlNamesCommandTests(DatabaseTestCase): + def test_decollides_to_distinct_url_names(self): + a = self.make_person("Jiahao", "Li") + b = self.make_person("Jiahao", "Li") # a.pk < b.pk + _force_shared_url_name(a, b, url_name="jiahaoli") + + call_command('recompute_url_names') + + a.refresh_from_db() + b.refresh_from_db() + self.assertEqual(a.url_name, "jiahaoli") # earliest keeps the bare name + self.assertEqual(b.url_name, "jiahaoli2") # no middle name -> numeric suffix + + def test_prefers_middle_initial_differentiator(self): + a = self.make_person("Jasmine", "Zhang") + b = self.make_person("Jasmine", "Zhang", middle_name="Xin") + _force_shared_url_name(a, b, url_name="jasminezhang") + + call_command('recompute_url_names') + + a.refresh_from_db() + b.refresh_from_db() + self.assertEqual(a.url_name, "jasminezhang") + self.assertEqual(b.url_name, "jasminexzhang") + + def test_is_idempotent(self): + a = self.make_person("Jiahao", "Li") + b = self.make_person("Jiahao", "Li") + _force_shared_url_name(a, b, url_name="jiahaoli") + + call_command('recompute_url_names') + first = list(Person.objects.order_by('pk').values_list('url_name', flat=True)) + call_command('recompute_url_names') + second = list(Person.objects.order_by('pk').values_list('url_name', flat=True)) + self.assertEqual(first, second) + + def test_dry_run_changes_nothing(self): + a = self.make_person("Jiahao", "Li") + b = self.make_person("Jiahao", "Li") + _force_shared_url_name(a, b, url_name="jiahaoli") + + call_command('recompute_url_names', dry_run=True) + + a.refresh_from_db() + b.refresh_from_db() + self.assertEqual(a.url_name, "jiahaoli") + self.assertEqual(b.url_name, "jiahaoli") # untouched + + +class PersonSaveUrlNameTests(DatabaseTestCase): + def test_new_namesake_with_middle_name_gets_middle_initial(self): + self.make_person("Jasmine", "Zhang") # claims jasminezhang + b = self.make_person("Jasmine", "Zhang", middle_name="Xin") + self.assertEqual(b.url_name, "jasminexzhang") + + def test_new_namesake_without_middle_name_gets_numeric_suffix(self): + self.make_person("Jiahao", "Li") + b = self.make_person("Jiahao", "Li") + self.assertEqual(b.url_name, "jiahaoli2") + + +class MemberViewCollisionTests(DatabaseTestCase): + def test_namesakes_each_resolve_200_after_recompute(self): + a = self.make_person("Jiahao", "Li") + b = self.make_person("Jiahao", "Li") + _force_shared_url_name(a, b, url_name="jiahaoli") + call_command('recompute_url_names') + + a.refresh_from_db() + b.refresh_from_db() + for person in (a, b): + url = reverse('website:member_by_name', kwargs={'member_name': person.url_name}) + self.assertEqual(self.client.get(url).status_code, 200) + + def test_unresolved_collision_returns_404_not_500(self): + a = self.make_person("Jiahao", "Li") + b = self.make_person("Jiahao", "Li") + _force_shared_url_name(a, b, url_name="jiahaoli") + + url = reverse('website:member_by_name', kwargs={'member_name': 'jiahaoli'}) + self.assertEqual(self.client.get(url).status_code, 404) diff --git a/website/views/member.py b/website/views/member.py index c438001e..3657b673 100644 --- a/website/views/member.py +++ b/website/views/member.py @@ -64,14 +64,16 @@ def member(request, member_name=None, member_id=None): # Try a case-insensitive exact match person = get_object_or_404(Person, url_name__iexact=member_name) except MultipleObjectsReturned: - # This should not happen if url_name uniqueness is working correctly - # Log error and return the most recently bio-modified person as fallback - # (Person tracks bio_datetime_modified, not modified_date — the latter - # is not a field on the model and would raise FieldError.) - _logger.error(f"Multiple people found with url_name={member_name}! This indicates url_name uniqueness is broken. Returning most recent.") - person = Person.objects.filter(url_name__iexact=member_name).order_by('-bio_datetime_modified').first() - if person is None: - raise Http404("No person matches the given query.") + # url_name is kept unique by Person.save() and the recompute_url_names + # command (#1206/#1275), so this branch should be unreachable. If a + # collision ever recurs, fail loudly with a clean 404 rather than a 500 + # or a silent .first() pick (which would make one namesake permanently + # unreachable and could surface the wrong person). The fix is always to + # re-run `manage.py recompute_url_names`, not to guess a winner here. + _logger.error( + f"Multiple people share url_name={member_name!r} — url_name uniqueness is " + f"broken; run `manage.py recompute_url_names`. Returning 404.") + raise Http404("No unique person matches the given query.") except Http404: _logger.debug(f"{member_name} not found for url_name, looking for closest match in database") closest_urlname = get_closest_urlname_in_database(member_name) From 19beaff3c97a5d21b93aa92ae918bb22159c9418 Mon Sep 17 00:00:00 2001 From: Jon Froehlich Date: Wed, 17 Jun 2026 16:54:42 -0700 Subject: [PATCH 3/5] fix(people): fold accents via Unicode NFKD in name normalization (#1275) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit normalize_person_name used a hand-maintained accent map that only covered grave/circumflex/tilde vowels. Acute accents and the cedilla were neither folded nor matched, so the trailing [^a-zA-Z] strip *deleted* them — mangling url_names ('Claudio' with an acute a -> 'cludiosilva', 'Francois' with cedilla -> 'franoisguimbretiere') and silently hiding accented-name duplicates from the DuplicatePeopleCheck dashboard. Replace the map with standard Unicode NFKD folding (_ascii_fold), which handles the whole Latin range generically and needs no hand-syncing. This corrects 5 mangled url_names in prod data and reveals one previously-hidden true duplicate (Claudio Silva, ids 669+720) for the merge decisions file. build_unique_url_name's middle-initial folding uses the same helper. Co-Authored-By: Claude Opus 4.8 (1M context) --- website/tests/test_name_utils.py | 16 +++++++++++ website/utils/name_utils.py | 49 ++++++++++++++++++-------------- 2 files changed, 44 insertions(+), 21 deletions(-) diff --git a/website/tests/test_name_utils.py b/website/tests/test_name_utils.py index 1bbb2cf6..771e59bb 100644 --- a/website/tests/test_name_utils.py +++ b/website/tests/test_name_utils.py @@ -21,6 +21,22 @@ def test_basic_lowercases_and_concatenates(self): def test_accents_folded_and_punctuation_stripped(self): self.assertEqual(normalize_person_name('Renée', "O'Brien"), 'reneeobrien') + def test_acute_accents_and_cedilla_are_folded_not_dropped(self): + # Regression for the incomplete hand-rolled accent map (#1275): acute + # accents and the cedilla were dropped, mangling keys (cludiosilva) and + # hiding accented-name duplicates. NFKD folds them to ASCII. + self.assertEqual(normalize_person_name('Cláudio', 'Silva'), 'claudiosilva') + self.assertEqual(normalize_person_name('Edgar', 'Martínez'), 'edgarmartinez') + self.assertEqual(normalize_person_name('François', 'Guimbretière'), + 'francoisguimbretiere') + + def test_accented_namesakes_share_a_key(self): + # The whole point: "Cláudio Silva" and "Claudio Silva" must cluster. + self.assertEqual( + normalize_person_name('Cláudio', 'Silva'), + normalize_person_name('Claudio', 'Silva'), + ) + def test_handles_none(self): self.assertEqual(normalize_person_name(None, 'Zhang'), 'zhang') diff --git a/website/utils/name_utils.py b/website/utils/name_utils.py index ca953410..1aafb4e1 100644 --- a/website/utils/name_utils.py +++ b/website/utils/name_utils.py @@ -3,23 +3,33 @@ The primary use is producing a stable, accent-folded key for a person's name so that records that *would* collide on ``Person.url_name`` can be -detected and clustered. The logic mirrors the url_name derivation in -``Person.save()`` (website/models/person.py) so callers — e.g. the -duplicate-people data-health check and a future ``recompute_url_names`` -command — agree on what "the same name" means. +detected and clustered. ``Person.save()`` derives ``url_name`` through these +helpers (via ``build_unique_url_name``), so the model, the duplicate-people +data-health check, and the ``recompute_url_names`` command all agree on what +"the same name" means. """ import re +import unicodedata + + +def _ascii_fold(text): + """ + Fold accented Latin characters to their ASCII base via Unicode NFKD + decomposition (e.g. ``á`` -> ``a``, ``ç`` -> ``c``, ``ñ`` -> ``n``), then + drop the combining marks. + + This replaces an earlier hand-maintained accent map that only covered + grave/circumflex/tilde vowels — it silently *dropped* acute accents and + the cedilla, mangling url_names (``Cláudio`` -> ``cludio``) and hiding + accented-name duplicates from the dedup check. NFKD handles the whole Latin + range generically, so the map no longer has to be kept in sync by hand. + """ + return ''.join( + c for c in unicodedata.normalize('NFKD', text) + if not unicodedata.combining(c) + ) -# Common accented characters mapped to ASCII. Kept in sync with the map in -# website/models/person.py used by Person.save() for url_name derivation. -SPECIAL_CHARS = { - 'ã': 'a', 'à': 'a', 'â': 'a', - 'é': 'e', 'è': 'e', 'ê': 'e', - 'ñ': 'n', 'ń': 'n', - 'ö': 'o', 'ô': 'o', - 'û': 'u', 'ü': 'u', 'ù': 'u', -} # Substrings identifying the auto-assigned "Star Wars" placeholder images # (see get_path_to_random_starwars_image / get_upload_to_for_person_easter_egg). @@ -43,12 +53,11 @@ def normalize_person_name(first_name, last_name): 'jonfroehlich' >>> normalize_person_name('Renée', "O'Brien") 'reneeobrien' + >>> normalize_person_name('Cláudio', 'Silva') + 'claudiosilva' """ - cleaned = f"{first_name or ''}{last_name or ''}".lower() - for c in cleaned: - if re.search('[^a-zA-Z]', c) and c in SPECIAL_CHARS: - cleaned = cleaned.replace(c, SPECIAL_CHARS[c]) - return re.sub('[^a-zA-Z]', '', cleaned) + cleaned = _ascii_fold(f"{first_name or ''}{last_name or ''}".lower()) + return re.sub('[^a-z]', '', cleaned) def build_unique_url_name(first_name, middle_name, last_name, is_taken): @@ -89,9 +98,7 @@ def build_unique_url_name(first_name, middle_name, last_name, is_taken): # the same way url_name derivation folds accents, then drop anything non-alpha. middle = (middle_name or '').strip() if middle: - initial = middle[0].lower() - initial = SPECIAL_CHARS.get(initial, initial) - initial = re.sub('[^a-z]', '', initial) + initial = re.sub('[^a-z]', '', _ascii_fold(middle[0].lower())) if initial: first_key = normalize_person_name(first_name, '') last_key = normalize_person_name('', last_name) From 6d81b57f48c5b950ce60dc376808ff1f51c4aaed Mon Sep 17 00:00:00 2001 From: Jon Froehlich Date: Wed, 17 Jun 2026 17:06:14 -0700 Subject: [PATCH 4/5] feat(people): guard merge_duplicate_people against name-mismatched ids (#1275) The decisions file is keyed by production ids, but pushing to master deploys to the test server (a different DB where those ids are other people). Refuse to merge two rows whose normalized names differ, unless --allow-name-mismatch is passed (for a deliberate documented cross-name same-person case). Stops a prod-id file from corrupting test data if it ever runs there. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../commands/merge_duplicate_people.py | 26 +++++++++++++++++++ website/tests/test_merge_duplicate_people.py | 24 +++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/website/management/commands/merge_duplicate_people.py b/website/management/commands/merge_duplicate_people.py index 3d83f993..a5cc23cd 100644 --- a/website/management/commands/merge_duplicate_people.py +++ b/website/management/commands/merge_duplicate_people.py @@ -42,6 +42,7 @@ from django.db import transaction from website.models import Person +from website.utils.name_utils import normalize_person_name _logger = logging.getLogger(__name__) @@ -101,10 +102,18 @@ def add_arguments(self, parser): '--apply', action='store_true', help='Actually perform the merges/deletes. Without this flag the command is a dry-run.', ) + parser.add_argument( + '--allow-name-mismatch', action='store_true', + help=('Permit merging two rows whose normalized names differ (e.g. a ' + 'documented cross-name same-person case). By default such a row ' + 'is refused — this guards against running a prod-id decisions ' + 'file against the wrong database (e.g. the test server).'), + ) def handle(self, *args, **options): decisions = self._read_decisions(options['decisions']) apply = options['apply'] + self.allow_name_mismatch = options['allow_name_mismatch'] mode = 'APPLY' if apply else 'DRY-RUN' self.stdout.write(f"=== merge_duplicate_people [{mode}] — {len(decisions)} decision(s) ===") @@ -173,6 +182,23 @@ def _handle_merge(self, row, apply): f" ERROR source and target are the same person ({source.pk}) — skipping")) return + # Safety guard: refuse to merge two rows whose normalized names differ. + # The decisions file is keyed by prod ids; this stops a prod-id file from + # silently merging unrelated people if run against the wrong database + # (e.g. the test server, where id 328 is someone else entirely). + if not self.allow_name_mismatch: + src_key = normalize_person_name(source.first_name, source.last_name) + tgt_key = normalize_person_name(target.first_name, target.last_name) + if src_key != tgt_key: + self.stdout.write(self.style.ERROR( + f" ERROR name mismatch: {source.pk} ({source.get_full_name()!r}) vs " + f"{target.pk} ({target.get_full_name()!r}) — skipping. Pass " + f"--allow-name-mismatch for a deliberate cross-name merge.")) + _logger.error( + "merge_duplicate_people: refused name-mismatch merge %s -> %s", + source.pk, target.pk) + return + if not apply: self.stdout.write( f" merge {source.pk} ({source.get_full_name()}, refs={count_references(source)}) " diff --git a/website/tests/test_merge_duplicate_people.py b/website/tests/test_merge_duplicate_people.py index 2e5477d2..26dc30c9 100644 --- a/website/tests/test_merge_duplicate_people.py +++ b/website/tests/test_merge_duplicate_people.py @@ -164,6 +164,30 @@ def test_second_apply_is_a_noop(self): self.assertFalse(Person.objects.filter(pk=self.source.pk).exists()) self.assertTrue(Person.objects.filter(pk=self.target.pk).exists()) + def test_name_mismatch_is_refused_by_default(self): + # A prod-id decisions file run against the wrong DB would pair unrelated + # people; the name guard must refuse rather than corrupt data. + other = self.make_person("Totally", "Different") + news = self.make_news_item(author=self.source) + path = _decisions_file([{ + 'source_id': self.source.pk, 'action': 'merge', + 'target_id': other.pk, 'note': 'mismatch'}]) + call_command('merge_duplicate_people', decisions=path, apply=True) + # Refused: source survives, its relation untouched. + self.assertTrue(Person.objects.filter(pk=self.source.pk).exists()) + news.refresh_from_db() + self.assertEqual(news.author, self.source) + + def test_name_mismatch_allowed_with_flag(self): + other = self.make_person("Totally", "Different") + self.make_news_item(author=self.source) + path = _decisions_file([{ + 'source_id': self.source.pk, 'action': 'merge', + 'target_id': other.pk, 'note': 'cross-name'}]) + call_command('merge_duplicate_people', decisions=path, apply=True, + allow_name_mismatch=True) + self.assertFalse(Person.objects.filter(pk=self.source.pk).exists()) + def test_delete_action_refuses_when_refs_exist(self): self.make_news_item(author=self.source) path = _decisions_file([{ From b752d606068a103f0766dbc30aed2ef7efaa0512 Mon Sep 17 00:00:00 2001 From: Jon Froehlich Date: Wed, 17 Jun 2026 17:14:38 -0700 Subject: [PATCH 5/5] chore(people): wire one-shot prod merge of duplicate Person records (#1275) Add the reviewed dedup_decisions.csv (13 merges + 2 jasminezhang keeps; ids only) and a PROD-gated one-shot in docker-entrypoint.sh that runs merge_duplicate_people --apply before recompute_url_names. Gated to DJANGO_ENV=PROD (the decisions file is keyed by production ids) with the command's name-mismatch guard as a backstop. Validated end-to-end against a copy of the prod dump: 589->576 people, url_name collisions 10->0, duplicate clusters reduced to the two intended namesakes. Remove this one-shot in a follow-up once the first prod deploy confirms the merge in /logs (the command is idempotent, so a lingering run is a no-op). Co-Authored-By: Claude Opus 4.8 (1M context) --- dedup_decisions.csv | 16 ++++++++++++++++ docker-entrypoint.sh | 13 ++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 dedup_decisions.csv diff --git a/dedup_decisions.csv b/dedup_decisions.csv new file mode 100644 index 00000000..e4e3fd4f --- /dev/null +++ b/dedup_decisions.csv @@ -0,0 +1,16 @@ +source_id,action,target_id,note +328,merge,564,jennifermankoff: keep 564 (UW email + more refs) +624,merge,498,jasonyip: keep 498 (7 refs vs 1) +649,merge,652,hangdo: keep 652 (richer; 649 has 1 ref) +719,merge,718,qingshen: 719 is a 0-ref shell +709,merge,710,claudinadegyves: 709 is a 0-ref shell +626,merge,625,jiahaoli: 626 is a 0-ref shell +627,merge,625,jiahaoli: 627 is a 0-ref shell +643,merge,644,mikesinclair: 643 is a 0-ref shell +645,merge,644,mikesinclair: 645 is a 0-ref shell +723,merge,670,erictokuda: keep 670 (older; both refs=1) +724,merge,707,heatherfeldner: keep 707 (older; both refs=1) +581,merge,623,wendyroldan: keep 623 (real headshot; scalar fields backfill from 581) +720,merge,669,claudiosilva: revealed by the NFKD accent fix; keep 669 (correct spelling) +840,keep,,jasminezhang: genuine namesake - leave separate (#1206) +869,keep,,jasminezhang: genuine namesake - leave separate (#1206) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 3b324335..64a208ea 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -116,7 +116,18 @@ echo "******************************************" python manage.py backfill_project_visibility echo "****************** STEP 4.8/5: docker-entrypoint.sh ************************" -echo "4.8 Running 'python manage.py recompute_url_names' to de-collide historical url_names (#1206)" +echo "4.8 Running 'python manage.py merge_duplicate_people' (PROD only) to consolidate duplicate Person records (#1275)" +echo "******************************************" +# ONE-SHOT (remove after the first prod run confirms in /logs). Gated to PROD +# because dedup_decisions.csv is keyed by production ids; the merge command also +# refuses name-mismatched pairs as a backstop. Idempotent, so a lingering run is +# a harmless no-op once the duplicates are already merged. +if [ "$DJANGO_ENV" = "PROD" ]; then + python manage.py merge_duplicate_people --decisions dedup_decisions.csv --apply +fi + +echo "****************** STEP 4.9/5: docker-entrypoint.sh ************************" +echo "4.9 Running 'python manage.py recompute_url_names' to de-collide historical url_names (#1206)" echo "******************************************" python manage.py recompute_url_names