diff --git a/dedup_decisions.csv b/dedup_decisions.csv new file mode 100644 index 00000000..e4e3fd4f --- /dev/null +++ b/dedup_decisions.csv @@ -0,0 +1,16 @@ +source_id,action,target_id,note +328,merge,564,jennifermankoff: keep 564 (UW email + more refs) +624,merge,498,jasonyip: keep 498 (7 refs vs 1) +649,merge,652,hangdo: keep 652 (richer; 649 has 1 ref) +719,merge,718,qingshen: 719 is a 0-ref shell +709,merge,710,claudinadegyves: 709 is a 0-ref shell +626,merge,625,jiahaoli: 626 is a 0-ref shell +627,merge,625,jiahaoli: 627 is a 0-ref shell +643,merge,644,mikesinclair: 643 is a 0-ref shell +645,merge,644,mikesinclair: 645 is a 0-ref shell +723,merge,670,erictokuda: keep 670 (older; both refs=1) +724,merge,707,heatherfeldner: keep 707 (older; both refs=1) +581,merge,623,wendyroldan: keep 623 (real headshot; scalar fields backfill from 581) +720,merge,669,claudiosilva: revealed by the NFKD accent fix; keep 669 (correct spelling) +840,keep,,jasminezhang: genuine namesake - leave separate (#1206) +869,keep,,jasminezhang: genuine namesake - leave separate (#1206) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index e27fa2ff..64a208ea 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -115,6 +115,22 @@ echo "4.7 Running 'python manage.py backfill_project_visibility' to resolve is_v echo "******************************************" python manage.py backfill_project_visibility +echo "****************** STEP 4.8/5: docker-entrypoint.sh ************************" +echo "4.8 Running 'python manage.py merge_duplicate_people' (PROD only) to consolidate duplicate Person records (#1275)" +echo "******************************************" +# ONE-SHOT (remove after the first prod run confirms in /logs). Gated to PROD +# because dedup_decisions.csv is keyed by production ids; the merge command also +# refuses name-mismatched pairs as a backstop. Idempotent, so a lingering run is +# a harmless no-op once the duplicates are already merged. +if [ "$DJANGO_ENV" = "PROD" ]; then + python manage.py merge_duplicate_people --decisions dedup_decisions.csv --apply +fi + +echo "****************** STEP 4.9/5: docker-entrypoint.sh ************************" +echo "4.9 Running 'python manage.py recompute_url_names' to de-collide historical url_names (#1206)" +echo "******************************************" +python manage.py recompute_url_names + # echo "****************** STEP 4.3/5: docker-entrypoint.sh ************************" # echo "4.3 Running 'python manage.py rename_person_images' to rename person images" # echo "******************************************" diff --git a/website/management/commands/merge_duplicate_people.py b/website/management/commands/merge_duplicate_people.py new file mode 100644 index 00000000..a5cc23cd --- /dev/null +++ b/website/management/commands/merge_duplicate_people.py @@ -0,0 +1,299 @@ +""" +Merge duplicate ``Person`` records, driven by a reviewed decisions file. + +Part of the issue #1275 dedup work. Over ~15 years the database accumulated +duplicate ``Person`` rows (one human, several rows from un-deduped imports). +This command consolidates each cluster into one canonical record, relocating +**every** related object (publications, talks, posters, grants, awards, +positions, project roles, news, and the advisor/co-advisor/grad-mentor +self-references) before deleting the now-empty duplicate. + +Why a management command (not a data migration): migrations are gitignored and +regenerated per-environment here, so the established pattern is a one-shot +command verified via the logs (see ``generate_slugs_for_old_news_items`` etc.). + +Safety model: + * **Dry-run by default.** Without ``--apply`` it prints the plan and changes + nothing. Pass ``--apply`` to actually mutate. + * **Idempotent.** A ``merge`` whose source row is already gone is a no-op, so + re-running an applied decisions file is safe. + * **Atomic per row.** Each merge runs in ``transaction.atomic()``. + * **Generic relation walk.** Relations are discovered via + ``Person._meta.get_fields()`` rather than hardcoded, so a new FK/M2M to + Person can't silently orphan data. + +Decisions file: CSV with columns ``source_id, action, target_id, note``. + * ``merge`` — relocate everything from ``source_id`` onto ``target_id``, then + delete ``source_id``. + * ``delete`` — delete ``source_id`` (refused unless it has zero references). + * ``keep`` — no-op; documents a reviewed namesake to leave alone. +Ids only (no emails) so the reviewed file is safe to commit to the public repo; +member names are already public. + +Usage: + python manage.py merge_duplicate_people --decisions dedup_decisions.csv + python manage.py merge_duplicate_people --decisions dedup_decisions.csv --apply +""" + +import csv +import logging + +from django.core.management.base import BaseCommand, CommandError +from django.db import transaction + +from website.models import Person +from website.utils.name_utils import normalize_person_name + +_logger = logging.getLogger(__name__) + +# Scalar (non-relational) fields whose *blank* value on the target is backfilled +# from the source during a merge. We never overwrite a populated target field, +# and we deliberately leave the image/cropping fields and the derived url_name / +# bio_datetime_modified alone (the target is the chosen canonical record; its +# headshot stays put — see module docstring / issue #1275). +SCALAR_BACKFILL_FIELDS = [ + 'email', 'personal_website', 'github', 'twitter', 'linkedin', + 'mastodon', 'threads', 'bluesky', 'bio', 'next_position', + 'next_position_url', +] + + +def _person_reverse_relations(): + """ + Return ``(fk_rels, m2m_rels)`` — the auto-created reverse relations that + point at ``Person``, discovered generically from the model meta. + + ``fk_rels`` are reverse foreign keys / one-to-ones (``ManyToOneRel`` / + ``OneToOneRel``), including the three Position advisor self-references. + ``m2m_rels`` are reverse many-to-manys (``ManyToManyRel``), e.g. the sorted + ``authors`` / ``recipients`` sets and the plain ``News.people`` set. + """ + fk_rels, m2m_rels = [], [] + for field in Person._meta.get_fields(): + if not (field.is_relation and field.auto_created and not field.concrete): + continue + if field.many_to_many: + m2m_rels.append(field) + elif field.one_to_many or field.one_to_one: + fk_rels.append(field) + return fk_rels, m2m_rels + + +def count_references(person): + """Total number of objects across all relations pointing at ``person``.""" + fk_rels, m2m_rels = _person_reverse_relations() + total = 0 + for field in fk_rels: + total += field.related_model.objects.filter(**{field.field.name: person}).count() + for field in m2m_rels: + total += getattr(person, field.get_accessor_name()).count() + return total + + +class Command(BaseCommand): + help = 'Merge duplicate Person records from a reviewed decisions CSV (dry-run by default).' + + def add_arguments(self, parser): + parser.add_argument( + '--decisions', required=True, + help='Path to the decisions CSV (columns: source_id, action, target_id, note).', + ) + parser.add_argument( + '--apply', action='store_true', + help='Actually perform the merges/deletes. Without this flag the command is a dry-run.', + ) + parser.add_argument( + '--allow-name-mismatch', action='store_true', + help=('Permit merging two rows whose normalized names differ (e.g. a ' + 'documented cross-name same-person case). By default such a row ' + 'is refused — this guards against running a prod-id decisions ' + 'file against the wrong database (e.g. the test server).'), + ) + + def handle(self, *args, **options): + decisions = self._read_decisions(options['decisions']) + apply = options['apply'] + self.allow_name_mismatch = options['allow_name_mismatch'] + + mode = 'APPLY' if apply else 'DRY-RUN' + self.stdout.write(f"=== merge_duplicate_people [{mode}] — {len(decisions)} decision(s) ===") + + for row in decisions: + action = row['action'] + if action == 'merge': + self._handle_merge(row, apply) + elif action == 'delete': + self._handle_delete(row, apply) + elif action == 'keep': + self.stdout.write(f" keep source={row['source_id']} (namesake / leave alone){self._note(row)}") + else: + self.stdout.write(self.style.WARNING( + f" SKIP unknown action {action!r} for source={row['source_id']}")) + + if not apply: + self.stdout.write(self.style.WARNING( + "\nDry-run only — no changes made. Re-run with --apply to perform these actions.")) + + # ----- decisions parsing ------------------------------------------------- + + def _read_decisions(self, path): + """Parse and validate the decisions CSV into a list of normalized rows.""" + try: + with open(path, newline='', encoding='utf-8') as fh: + rows = list(csv.DictReader(fh)) + except OSError as exc: + raise CommandError(f"Could not read decisions file {path!r}: {exc}") + + required = {'source_id', 'action'} + if not rows or not required.issubset({k.strip() for k in rows[0].keys()}): + raise CommandError( + "Decisions CSV must have a header row with at least " + "'source_id' and 'action' columns (plus 'target_id', 'note').") + + cleaned = [] + for i, raw in enumerate(rows, start=2): # row 1 is the header + row = {(k or '').strip(): (v or '').strip() for k, v in raw.items()} + if not row.get('source_id'): + continue # allow blank spacer rows + row['action'] = row.get('action', '').lower() + if row['action'] == 'merge' and not row.get('target_id'): + raise CommandError(f"Row {i}: action 'merge' requires a target_id.") + cleaned.append(row) + return cleaned + + @staticmethod + def _note(row): + return f" # {row['note']}" if row.get('note') else '' + + # ----- actions ----------------------------------------------------------- + + def _handle_merge(self, row, apply): + source = self._get_person(row['source_id']) + if source is None: + self.stdout.write(f" no-op source={row['source_id']} already gone (merged?){self._note(row)}") + return + target = self._get_person(row['target_id']) + if target is None: + self.stdout.write(self.style.ERROR( + f" ERROR target={row['target_id']} not found for source={row['source_id']} — skipping")) + return + if source.pk == target.pk: + self.stdout.write(self.style.ERROR( + f" ERROR source and target are the same person ({source.pk}) — skipping")) + return + + # Safety guard: refuse to merge two rows whose normalized names differ. + # The decisions file is keyed by prod ids; this stops a prod-id file from + # silently merging unrelated people if run against the wrong database + # (e.g. the test server, where id 328 is someone else entirely). + if not self.allow_name_mismatch: + src_key = normalize_person_name(source.first_name, source.last_name) + tgt_key = normalize_person_name(target.first_name, target.last_name) + if src_key != tgt_key: + self.stdout.write(self.style.ERROR( + f" ERROR name mismatch: {source.pk} ({source.get_full_name()!r}) vs " + f"{target.pk} ({target.get_full_name()!r}) — skipping. Pass " + f"--allow-name-mismatch for a deliberate cross-name merge.")) + _logger.error( + "merge_duplicate_people: refused name-mismatch merge %s -> %s", + source.pk, target.pk) + return + + if not apply: + self.stdout.write( + f" merge {source.pk} ({source.get_full_name()}, refs={count_references(source)}) " + f"-> {target.pk} ({target.get_full_name()}, refs={count_references(target)})" + f"{self._note(row)}") + return + + with transaction.atomic(): + summary, backfilled = self._merge(source, target) + moved = ', '.join(f"{k}={v}" for k, v in summary.items()) or 'no related objects' + backfill = f"; backfilled {', '.join(backfilled)}" if backfilled else '' + msg = f"merged {row['source_id']} into {target.pk}: {moved}{backfill}" + self.stdout.write(self.style.SUCCESS(f" {msg}")) + _logger.info("merge_duplicate_people: %s", msg) + + def _handle_delete(self, row, apply): + source = self._get_person(row['source_id']) + if source is None: + self.stdout.write(f" no-op source={row['source_id']} already gone{self._note(row)}") + return + refs = count_references(source) + if refs != 0: + self.stdout.write(self.style.ERROR( + f" ERROR refusing to delete {source.pk} ({source.get_full_name()}) — " + f"has {refs} reference(s); merge it instead")) + return + if not apply: + self.stdout.write(f" delete {source.pk} ({source.get_full_name()}, refs=0){self._note(row)}") + return + source.delete() + self.stdout.write(self.style.SUCCESS(f" deleted {row['source_id']} (was a 0-ref shell)")) + _logger.info("merge_duplicate_people: deleted shell %s", row['source_id']) + + # ----- core merge -------------------------------------------------------- + + def _merge(self, source, target): + """ + Relocate every relation from ``source`` onto ``target``, backfill blank + scalar fields, then delete ``source``. Returns ``(summary, backfilled)``. + Must run inside an atomic block. + """ + fk_rels, m2m_rels = _person_reverse_relations() + summary = {} + + # Reverse FKs (incl. advisor / co_advisor / grad_mentor self-refs): just + # repoint the foreign key. SET_NULL/CASCADE on_delete is irrelevant once + # nothing points at source. + for field in fk_rels: + model, fk = field.related_model, field.field.name + n = model.objects.filter(**{fk: source}).update(**{fk: target}) + if n: + summary[f"{model.__name__}.{fk}"] = n + + # Reverse M2Ms: rebuild each related object's ordered author/recipient/ + # people list with source swapped for target (dropping source if target + # is already present — dedup). For SortedManyToManyField, .set() assigns + # sort_value by list position, so author order is preserved. + for field in m2m_rels: + forward = field.field.name # 'authors' / 'recipients' / 'people' + moved = 0 + for obj in list(getattr(source, field.get_accessor_name()).all()): + manager = getattr(obj, forward) + new_members, seen = [], set() + for member in manager.all(): # ordered for sorted M2M + replacement = target if member.pk == source.pk else member + if replacement.pk not in seen: + seen.add(replacement.pk) + new_members.append(replacement) + manager.set(new_members) + moved += 1 + if moved: + summary[f"{field.related_model.__name__}.{forward}"] = moved + + # Scalar backfill: fill target's blanks from source; never overwrite. + backfilled = [] + for fld in SCALAR_BACKFILL_FIELDS: + if not getattr(target, fld) and getattr(source, fld): + setattr(target, fld, getattr(source, fld)) + backfilled.append(fld) + if backfilled: + # Bypass Person.save() (avoids url_name recompute + Star Wars image + # side effects); we only want the scalar columns written. + Person.objects.filter(pk=target.pk).update( + **{fld: getattr(target, fld) for fld in backfilled}) + + # Delete the now-orphaned source. Its pre_delete signal removes source's + # own image file (target's image is untouched, so nothing shared breaks). + source.delete() + return summary, backfilled + + @staticmethod + def _get_person(pk): + if not pk: + return None + try: + return Person.objects.get(pk=int(pk)) + except (Person.DoesNotExist, ValueError): + return None diff --git a/website/management/commands/recompute_url_names.py b/website/management/commands/recompute_url_names.py new file mode 100644 index 00000000..539d2c4e --- /dev/null +++ b/website/management/commands/recompute_url_names.py @@ -0,0 +1,73 @@ +""" +Recompute every Person.url_name so historical collisions de-collide (#1206). + +Person.url_name is derived in Person.save(), but the collision-resolution loop +only protects rows saved *after* it landed. Rows imported/created earlier and +never re-saved can still share a bare url_name, which causes +MultipleObjectsReturned -> HTTP 500 on /member//. This command +re-derives a unique url_name for every person using the same shared logic +(website.utils.name_utils.build_unique_url_name), giving namesakes stable, +readable URLs (a middle-initial differentiator where possible, else a numeric +suffix). + +People are processed in ascending pk order, so within a same-name cluster the +earliest record keeps the bare url_name (e.g. ``jasminezhang``) and later ones +get differentiated (``jasminexzhang`` / ``jasminezhang2``). Assignment is +deterministic, so the command is idempotent — re-running changes nothing. + +Writes use ``.update()`` to set only the url_name column, deliberately bypassing +Person.save() (no Star Wars image fallback, no bio_datetime_modified churn). +Runs on every container start (docker-entrypoint.sh) and is safe to re-run. + +Usage: + python manage.py recompute_url_names # apply + python manage.py recompute_url_names --dry-run # preview changes only +""" + +import logging + +from django.core.management.base import BaseCommand + +from website.models import Person +from website.utils.name_utils import build_unique_url_name + +_logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = "Re-derive a unique url_name for every Person (de-collides historical duplicates, #1206)." + + def add_arguments(self, parser): + parser.add_argument( + '--dry-run', action='store_true', + help='Report what would change without writing anything.', + ) + + def handle(self, *args, **options): + dry_run = options['dry_run'] + assigned = set() + changes = [] # (pk, old, new) for rows whose url_name actually changes + + # Ascending pk = creation order, so the earliest row in a name cluster + # keeps the bare url_name and later namesakes get differentiated. + for person in Person.objects.order_by('pk').only( + 'pk', 'first_name', 'middle_name', 'last_name', 'url_name'): + new_url_name = build_unique_url_name( + person.first_name, person.middle_name, person.last_name, + is_taken=assigned.__contains__, + ) + assigned.add(new_url_name) + if new_url_name != person.url_name: + changes.append((person.pk, person.url_name, new_url_name)) + + for pk, old, new in changes: + _logger.debug("recompute_url_names: %s -> %s (pk=%s)", old, new, pk) + self.stdout.write(f" {old or '(blank)'} -> {new} (pk={pk})") + if not dry_run: + Person.objects.filter(pk=pk).update(url_name=new) + + verb = 'would change' if dry_run else 'changed' + summary = f"recompute_url_names: {verb} {len(changes)} url_name(s)." + self.stdout.write(self.style.SUCCESS(summary)) + if not dry_run: + _logger.info(summary) diff --git a/website/models/person.py b/website/models/person.py index ee3fdc25..1a17a5cd 100644 --- a/website/models/person.py +++ b/website/models/person.py @@ -7,6 +7,7 @@ from django.core.files import File import website.utils.fileutils as ml_fileutils from website.utils.upload_validators import validate_image_upload +from website.utils.name_utils import build_unique_url_name from django.db.models.functions import Coalesce from django.conf import settings @@ -22,7 +23,6 @@ import os # for joining paths from uuid import uuid4 # for generating unique filenames -import re from datetime import date, timedelta from image_cropping import ImageRatioField @@ -36,15 +36,6 @@ # This retrieves a Python logging instance (or creates it) _logger = logging.getLogger(__name__) -# Special character mappings -special_chars = { - 'ã': 'a', 'à': 'a', 'â': 'a', - 'é': 'e', 'è': 'e', 'ê': 'e', - 'ñ': 'n', 'ń': 'n', - 'ö': 'o', 'ô': 'o', - 'û': 'u', 'ü': 'u', 'ù': 'u' -} - PERSON_THUMBNAIL_SIZE = (245, 245) def get_unique_filename_for_person(person, original_filename, append_str=None, force_unique=True): @@ -638,29 +629,17 @@ def __str__(self): def save(self, *args, **kwargs): - # First, automatically set the url_name field - # Substitute any common special characters. I haven't found a better automatic way to do - # this, so we are manually mapping 'common' special characters. - url_name_cleaned = (self.first_name + self.last_name).lower() - for c in url_name_cleaned: - if bool(re.search('[^a-zA-Z]', c)) and c in special_chars: - url_name_cleaned = url_name_cleaned.replace(c, special_chars.get(c)) - - # Clean remaining characters (EX: dashes, periods). - url_name_cleaned = re.sub('[^a-zA-Z]', '', url_name_cleaned) - - # Check for collisions and append numeric suffix if needed - # We exclude the current person (by pk) when checking for duplicates to allow updates - base_url_name = url_name_cleaned - counter = 2 - - # Keep incrementing counter until we find a unique url_name - while Person.objects.filter(url_name=url_name_cleaned).exclude(pk=self.pk).exists(): - url_name_cleaned = f"{base_url_name}{counter}" - counter += 1 - _logger.debug(f"URL name collision detected for {self.get_full_name()}. Trying {url_name_cleaned}") - - self.url_name = url_name_cleaned + # Automatically derive a unique url_name. The accent-folding + collision + # logic lives in build_unique_url_name (shared with recompute_url_names): + # it prefers the bare key (jonfroehlich), then a readable middle-initial + # differentiator for namesakes (jasminexzhang), then a numeric suffix + # (jasminezhang2). We exclude this row by pk so re-saving keeps its name. + self.url_name = build_unique_url_name( + self.first_name, self.middle_name, self.last_name, + is_taken=lambda candidate: ( + Person.objects.filter(url_name=candidate).exclude(pk=self.pk).exists() + ), + ) # Next, automatically set the bio_date_modified field if self.pk is not None: # checks if this is an existing object diff --git a/website/tests/test_merge_duplicate_people.py b/website/tests/test_merge_duplicate_people.py new file mode 100644 index 00000000..26dc30c9 --- /dev/null +++ b/website/tests/test_merge_duplicate_people.py @@ -0,0 +1,198 @@ +""" +Regression tests for the merge_duplicate_people management command (#1275). + +Verifies that merging a duplicate Person B into a canonical Person A relocates +EVERY relation type (FK, the advisor self-references, sorted-M2M author sets, +plain M2M), preserves publication author order, backfills only-blank scalar +fields, and is both dry-run-safe and idempotent. +""" + +import csv +import tempfile +from datetime import date + +from django.core.management import call_command + +from website.models import ( + Award, + Grant, + News, + Person, + Position, + ProjectRole, + Sponsor, +) +from website.models.position import Title +from website.tests.base import DatabaseTestCase + + +def _decisions_file(rows): + """Write a decisions CSV to a temp path and return it.""" + fh = tempfile.NamedTemporaryFile( + mode='w', suffix='.csv', delete=False, newline='', encoding='utf-8') + writer = csv.DictWriter(fh, fieldnames=['source_id', 'action', 'target_id', 'note']) + writer.writeheader() + for row in rows: + writer.writerow(row) + fh.close() + return fh.name + + +class MergeDuplicatePeopleTests(DatabaseTestCase): + def setUp(self): + # A = canonical/target, B = duplicate/source. + self.target = self.make_person("Jennifer", "Mankoff") + self.source = self.make_person("Jennifer", "Mankoff") + + def _merge(self, apply=True, action='merge', target_id=None): + path = _decisions_file([{ + 'source_id': self.source.pk, + 'action': action, + 'target_id': target_id if target_id is not None else self.target.pk, + 'note': 'test', + }]) + call_command('merge_duplicate_people', decisions=path, apply=apply) + + # ----- FK + advisor self-ref reassignment -------------------------------- + + def test_merge_reassigns_fk_and_advisor_self_refs(self): + other = self.make_person("Some", "Advisee") + # B is the person on one position, and the advisor on another. + own = Position.objects.create( + person=self.source, start_date=date(2020, 1, 1), title=Title.PHD_STUDENT) + advised = Position.objects.create( + person=other, advisor=self.source, co_advisor=self.source, + grad_mentor=self.source, start_date=date(2020, 1, 1), title=Title.PHD_STUDENT) + project = self.make_project("Proj") + role = ProjectRole.objects.create( + person=self.source, project=project, start_date=date(2020, 1, 1)) + news = self.make_news_item(author=self.source) + + self._merge() + + self.assertFalse(Person.objects.filter(pk=self.source.pk).exists()) + own.refresh_from_db() + advised.refresh_from_db() + role.refresh_from_db() + news.refresh_from_db() + self.assertEqual(own.person, self.target) + self.assertEqual(advised.advisor, self.target) + self.assertEqual(advised.co_advisor, self.target) + self.assertEqual(advised.grad_mentor, self.target) + self.assertEqual(role.person, self.target) + self.assertEqual(news.author, self.target) + + # ----- sorted-M2M order preservation + dedup ----------------------------- + + def test_merge_preserves_publication_author_order(self): + first = self.make_person("Aa", "Author") + last = self.make_person("Zz", "Author") + pub = self.make_publication() + pub.authors.set([first, self.source, last]) # B sits in the middle slot + + self._merge() + + pub.refresh_from_db() + # A takes B's exact position; order otherwise unchanged. + self.assertEqual(list(pub.authors.all()), [first, self.target, last]) + + def test_merge_dedupes_when_target_already_coauthor(self): + pub = self.make_publication() + pub.authors.set([self.source, self.target]) # both already on the paper + + self._merge() + + pub.refresh_from_db() + self.assertEqual(list(pub.authors.all()), [self.target]) + + def test_merge_moves_award_recipients_and_grant_authors(self): + award = Award.objects.create(title="Best Paper", date=date(2020, 1, 1)) + award.recipients.set([self.source]) + grant = Grant.objects.create( + sponsor=Sponsor.objects.create(name="NSF"), + title="A Grant", date=date(2020, 1, 1)) + grant.authors.set([self.source]) + + self._merge() + + award.refresh_from_db() + grant.refresh_from_db() + self.assertEqual(list(award.recipients.all()), [self.target]) + self.assertEqual(list(grant.authors.all()), [self.target]) + + def test_merge_moves_plain_news_people_m2m(self): + news = self.make_news_item() + news.people.set([self.source]) + + self._merge() + + news.refresh_from_db() + self.assertEqual(list(news.people.all()), [self.target]) + + # ----- scalar backfill --------------------------------------------------- + + def test_scalar_backfill_fills_blanks_only(self): + Person.objects.filter(pk=self.target.pk).update( + email="keep@uw.edu", github="") + Person.objects.filter(pk=self.source.pk).update( + email="lose@uw.edu", github="sourcehub") + + self._merge() + + self.target.refresh_from_db() + self.assertEqual(self.target.email, "keep@uw.edu") # populated: untouched + self.assertEqual(self.target.github, "sourcehub") # blank: backfilled + + # ----- safety: dry-run + idempotency ------------------------------------- + + def test_dry_run_makes_no_changes(self): + pub = self.make_publication() + pub.authors.set([self.source]) + + self._merge(apply=False) + + self.assertTrue(Person.objects.filter(pk=self.source.pk).exists()) + pub.refresh_from_db() + self.assertEqual(list(pub.authors.all()), [self.source]) + + def test_second_apply_is_a_noop(self): + self.make_news_item(author=self.source) + self._merge() + self.assertFalse(Person.objects.filter(pk=self.source.pk).exists()) + # Re-running the same decisions file must not raise and must change nothing. + self._merge() + self.assertFalse(Person.objects.filter(pk=self.source.pk).exists()) + self.assertTrue(Person.objects.filter(pk=self.target.pk).exists()) + + def test_name_mismatch_is_refused_by_default(self): + # A prod-id decisions file run against the wrong DB would pair unrelated + # people; the name guard must refuse rather than corrupt data. + other = self.make_person("Totally", "Different") + news = self.make_news_item(author=self.source) + path = _decisions_file([{ + 'source_id': self.source.pk, 'action': 'merge', + 'target_id': other.pk, 'note': 'mismatch'}]) + call_command('merge_duplicate_people', decisions=path, apply=True) + # Refused: source survives, its relation untouched. + self.assertTrue(Person.objects.filter(pk=self.source.pk).exists()) + news.refresh_from_db() + self.assertEqual(news.author, self.source) + + def test_name_mismatch_allowed_with_flag(self): + other = self.make_person("Totally", "Different") + self.make_news_item(author=self.source) + path = _decisions_file([{ + 'source_id': self.source.pk, 'action': 'merge', + 'target_id': other.pk, 'note': 'cross-name'}]) + call_command('merge_duplicate_people', decisions=path, apply=True, + allow_name_mismatch=True) + self.assertFalse(Person.objects.filter(pk=self.source.pk).exists()) + + def test_delete_action_refuses_when_refs_exist(self): + self.make_news_item(author=self.source) + path = _decisions_file([{ + 'source_id': self.source.pk, 'action': 'delete', + 'target_id': '', 'note': 'shell?'}]) + call_command('merge_duplicate_people', decisions=path, apply=True) + # Has a reference, so delete is refused — source survives. + self.assertTrue(Person.objects.filter(pk=self.source.pk).exists()) diff --git a/website/tests/test_name_utils.py b/website/tests/test_name_utils.py new file mode 100644 index 00000000..771e59bb --- /dev/null +++ b/website/tests/test_name_utils.py @@ -0,0 +1,87 @@ +""" +Unit tests for website.utils.name_utils (pure logic, no DB). + +Covers the url_name normalization key and the readable-URL derivation +(``build_unique_url_name``) used by ``Person.save()`` and the +``recompute_url_names`` command for the #1275 dedup / #1206 namesake work. +""" + +from django.test import SimpleTestCase + +from website.utils.name_utils import ( + build_unique_url_name, + normalize_person_name, +) + + +class NormalizePersonNameTests(SimpleTestCase): + def test_basic_lowercases_and_concatenates(self): + self.assertEqual(normalize_person_name('Jon', 'Froehlich'), 'jonfroehlich') + + def test_accents_folded_and_punctuation_stripped(self): + self.assertEqual(normalize_person_name('Renée', "O'Brien"), 'reneeobrien') + + def test_acute_accents_and_cedilla_are_folded_not_dropped(self): + # Regression for the incomplete hand-rolled accent map (#1275): acute + # accents and the cedilla were dropped, mangling keys (cludiosilva) and + # hiding accented-name duplicates. NFKD folds them to ASCII. + self.assertEqual(normalize_person_name('Cláudio', 'Silva'), 'claudiosilva') + self.assertEqual(normalize_person_name('Edgar', 'Martínez'), 'edgarmartinez') + self.assertEqual(normalize_person_name('François', 'Guimbretière'), + 'francoisguimbretiere') + + def test_accented_namesakes_share_a_key(self): + # The whole point: "Cláudio Silva" and "Claudio Silva" must cluster. + self.assertEqual( + normalize_person_name('Cláudio', 'Silva'), + normalize_person_name('Claudio', 'Silva'), + ) + + def test_handles_none(self): + self.assertEqual(normalize_person_name(None, 'Zhang'), 'zhang') + + +class BuildUniqueUrlNameTests(SimpleTestCase): + def test_returns_bare_key_when_free(self): + never_taken = lambda u: False + self.assertEqual( + build_unique_url_name('Jasmine', 'Xin', 'Zhang', never_taken), + 'jasminezhang', + ) + + def test_prefers_middle_initial_on_collision(self): + taken = {'jasminezhang'}.__contains__ + self.assertEqual( + build_unique_url_name('Jasmine', 'Xin', 'Zhang', taken), + 'jasminexzhang', + ) + + def test_numeric_fallback_when_no_middle_name(self): + taken = {'jasminezhang'}.__contains__ + self.assertEqual( + build_unique_url_name('Jasmine', '', 'Zhang', taken), + 'jasminezhang2', + ) + + def test_numeric_fallback_when_middle_initial_also_taken(self): + taken = {'jasminezhang', 'jasminexzhang'}.__contains__ + self.assertEqual( + build_unique_url_name('Jasmine', 'Xin', 'Zhang', taken), + 'jasminezhang2', + ) + + def test_numeric_fallback_keeps_incrementing(self): + taken = {'jasminezhang', 'jasminezhang2', 'jasminezhang3'}.__contains__ + # No middle name, so it must skip past every taken numeric suffix. + self.assertEqual( + build_unique_url_name('Jasmine', None, 'Zhang', taken), + 'jasminezhang4', + ) + + def test_middle_initial_is_accent_folded(self): + # Middle name starting with an accented char folds to its ASCII initial. + taken = {'jonfroehlich'}.__contains__ + self.assertEqual( + build_unique_url_name('Jon', 'Über', 'Froehlich', taken), + 'jonufroehlich', + ) diff --git a/website/tests/test_recompute_url_names.py b/website/tests/test_recompute_url_names.py new file mode 100644 index 00000000..98df6042 --- /dev/null +++ b/website/tests/test_recompute_url_names.py @@ -0,0 +1,102 @@ +""" +Regression tests for url_name de-collision (#1206 / #1275): + +- the recompute_url_names management command, +- the refactored Person.save() derivation (middle-initial / numeric), and +- the hardened member view (clean 404 instead of a 500 or silent .first()). +""" + +from django.core.management import call_command +from django.urls import reverse + +from website.models import Person +from website.tests.base import DatabaseTestCase + + +def _force_shared_url_name(*people, url_name): + """Simulate historical pre-collision-loop rows sharing a bare url_name + (bypasses Person.save(), which would otherwise auto-de-collide).""" + Person.objects.filter(pk__in=[p.pk for p in people]).update(url_name=url_name) + + +class RecomputeUrlNamesCommandTests(DatabaseTestCase): + def test_decollides_to_distinct_url_names(self): + a = self.make_person("Jiahao", "Li") + b = self.make_person("Jiahao", "Li") # a.pk < b.pk + _force_shared_url_name(a, b, url_name="jiahaoli") + + call_command('recompute_url_names') + + a.refresh_from_db() + b.refresh_from_db() + self.assertEqual(a.url_name, "jiahaoli") # earliest keeps the bare name + self.assertEqual(b.url_name, "jiahaoli2") # no middle name -> numeric suffix + + def test_prefers_middle_initial_differentiator(self): + a = self.make_person("Jasmine", "Zhang") + b = self.make_person("Jasmine", "Zhang", middle_name="Xin") + _force_shared_url_name(a, b, url_name="jasminezhang") + + call_command('recompute_url_names') + + a.refresh_from_db() + b.refresh_from_db() + self.assertEqual(a.url_name, "jasminezhang") + self.assertEqual(b.url_name, "jasminexzhang") + + def test_is_idempotent(self): + a = self.make_person("Jiahao", "Li") + b = self.make_person("Jiahao", "Li") + _force_shared_url_name(a, b, url_name="jiahaoli") + + call_command('recompute_url_names') + first = list(Person.objects.order_by('pk').values_list('url_name', flat=True)) + call_command('recompute_url_names') + second = list(Person.objects.order_by('pk').values_list('url_name', flat=True)) + self.assertEqual(first, second) + + def test_dry_run_changes_nothing(self): + a = self.make_person("Jiahao", "Li") + b = self.make_person("Jiahao", "Li") + _force_shared_url_name(a, b, url_name="jiahaoli") + + call_command('recompute_url_names', dry_run=True) + + a.refresh_from_db() + b.refresh_from_db() + self.assertEqual(a.url_name, "jiahaoli") + self.assertEqual(b.url_name, "jiahaoli") # untouched + + +class PersonSaveUrlNameTests(DatabaseTestCase): + def test_new_namesake_with_middle_name_gets_middle_initial(self): + self.make_person("Jasmine", "Zhang") # claims jasminezhang + b = self.make_person("Jasmine", "Zhang", middle_name="Xin") + self.assertEqual(b.url_name, "jasminexzhang") + + def test_new_namesake_without_middle_name_gets_numeric_suffix(self): + self.make_person("Jiahao", "Li") + b = self.make_person("Jiahao", "Li") + self.assertEqual(b.url_name, "jiahaoli2") + + +class MemberViewCollisionTests(DatabaseTestCase): + def test_namesakes_each_resolve_200_after_recompute(self): + a = self.make_person("Jiahao", "Li") + b = self.make_person("Jiahao", "Li") + _force_shared_url_name(a, b, url_name="jiahaoli") + call_command('recompute_url_names') + + a.refresh_from_db() + b.refresh_from_db() + for person in (a, b): + url = reverse('website:member_by_name', kwargs={'member_name': person.url_name}) + self.assertEqual(self.client.get(url).status_code, 200) + + def test_unresolved_collision_returns_404_not_500(self): + a = self.make_person("Jiahao", "Li") + b = self.make_person("Jiahao", "Li") + _force_shared_url_name(a, b, url_name="jiahaoli") + + url = reverse('website:member_by_name', kwargs={'member_name': 'jiahaoli'}) + self.assertEqual(self.client.get(url).status_code, 404) diff --git a/website/utils/name_utils.py b/website/utils/name_utils.py index 7ac5dca0..1aafb4e1 100644 --- a/website/utils/name_utils.py +++ b/website/utils/name_utils.py @@ -3,23 +3,33 @@ The primary use is producing a stable, accent-folded key for a person's name so that records that *would* collide on ``Person.url_name`` can be -detected and clustered. The logic mirrors the url_name derivation in -``Person.save()`` (website/models/person.py) so callers — e.g. the -duplicate-people data-health check and a future ``recompute_url_names`` -command — agree on what "the same name" means. +detected and clustered. ``Person.save()`` derives ``url_name`` through these +helpers (via ``build_unique_url_name``), so the model, the duplicate-people +data-health check, and the ``recompute_url_names`` command all agree on what +"the same name" means. """ import re +import unicodedata + + +def _ascii_fold(text): + """ + Fold accented Latin characters to their ASCII base via Unicode NFKD + decomposition (e.g. ``á`` -> ``a``, ``ç`` -> ``c``, ``ñ`` -> ``n``), then + drop the combining marks. + + This replaces an earlier hand-maintained accent map that only covered + grave/circumflex/tilde vowels — it silently *dropped* acute accents and + the cedilla, mangling url_names (``Cláudio`` -> ``cludio``) and hiding + accented-name duplicates from the dedup check. NFKD handles the whole Latin + range generically, so the map no longer has to be kept in sync by hand. + """ + return ''.join( + c for c in unicodedata.normalize('NFKD', text) + if not unicodedata.combining(c) + ) -# Common accented characters mapped to ASCII. Kept in sync with the map in -# website/models/person.py used by Person.save() for url_name derivation. -SPECIAL_CHARS = { - 'ã': 'a', 'à': 'a', 'â': 'a', - 'é': 'e', 'è': 'e', 'ê': 'e', - 'ñ': 'n', 'ń': 'n', - 'ö': 'o', 'ô': 'o', - 'û': 'u', 'ü': 'u', 'ù': 'u', -} # Substrings identifying the auto-assigned "Star Wars" placeholder images # (see get_path_to_random_starwars_image / get_upload_to_for_person_easter_egg). @@ -43,12 +53,64 @@ def normalize_person_name(first_name, last_name): 'jonfroehlich' >>> normalize_person_name('Renée', "O'Brien") 'reneeobrien' + >>> normalize_person_name('Cláudio', 'Silva') + 'claudiosilva' + """ + cleaned = _ascii_fold(f"{first_name or ''}{last_name or ''}".lower()) + return re.sub('[^a-z]', '', cleaned) + + +def build_unique_url_name(first_name, middle_name, last_name, is_taken): """ - cleaned = f"{first_name or ''}{last_name or ''}".lower() - for c in cleaned: - if re.search('[^a-zA-Z]', c) and c in SPECIAL_CHARS: - cleaned = cleaned.replace(c, SPECIAL_CHARS[c]) - return re.sub('[^a-zA-Z]', '', cleaned) + Derive a unique ``url_name`` for a person, preferring readable URLs. + + Resolution order: + 1. The bare key ``normalize_person_name(first, last)`` (e.g. ``jasminezhang``) + if it isn't already taken. + 2. A **middle-initial differentiator** ``first + middle_initial + last`` + (e.g. ``jasminexzhang``) when a middle name is present and that form is + free — giving namesakes a stable, human-readable URL (issue #1206/#1275). + 3. A **numeric suffix** fallback (``jasminezhang2``, ``jasminezhang3`` …) — + the historical behavior, used when there's no usable middle initial or + the middle-initial form is itself taken. + + ``is_taken`` is a callable ``str -> bool`` that reports whether a candidate + ``url_name`` is already in use. The caller defines its semantics: when + re-deriving for an existing row it should exclude that row's own pk (mirroring + the ``.exclude(pk=self.pk)`` check in ``Person.save()``); when assigning in a + batch it should consult the names already handed out this pass. + + The result is always non-empty and lowercase alpha (plus an optional trailing + number), matching what ``Person.save()`` would store. + + Example: + >>> taken = {'jasminezhang'}.__contains__ + >>> build_unique_url_name('Jasmine', 'Xin', 'Zhang', taken) + 'jasminexzhang' + >>> build_unique_url_name('Jasmine', '', 'Zhang', taken) + 'jasminezhang2' + """ + base = normalize_person_name(first_name, last_name) + if not is_taken(base): + return base + + # Middle-initial differentiator: fold the first alpha char of the middle name + # the same way url_name derivation folds accents, then drop anything non-alpha. + middle = (middle_name or '').strip() + if middle: + initial = re.sub('[^a-z]', '', _ascii_fold(middle[0].lower())) + if initial: + first_key = normalize_person_name(first_name, '') + last_key = normalize_person_name('', last_name) + candidate = f"{first_key}{initial}{last_key}" + if candidate != base and not is_taken(candidate): + return candidate + + # Numeric-suffix fallback (matches the legacy Person.save() collision loop). + counter = 2 + while is_taken(f"{base}{counter}"): + counter += 1 + return f"{base}{counter}" def is_default_person_image(image_field): diff --git a/website/views/member.py b/website/views/member.py index c438001e..3657b673 100644 --- a/website/views/member.py +++ b/website/views/member.py @@ -64,14 +64,16 @@ def member(request, member_name=None, member_id=None): # Try a case-insensitive exact match person = get_object_or_404(Person, url_name__iexact=member_name) except MultipleObjectsReturned: - # This should not happen if url_name uniqueness is working correctly - # Log error and return the most recently bio-modified person as fallback - # (Person tracks bio_datetime_modified, not modified_date — the latter - # is not a field on the model and would raise FieldError.) - _logger.error(f"Multiple people found with url_name={member_name}! This indicates url_name uniqueness is broken. Returning most recent.") - person = Person.objects.filter(url_name__iexact=member_name).order_by('-bio_datetime_modified').first() - if person is None: - raise Http404("No person matches the given query.") + # url_name is kept unique by Person.save() and the recompute_url_names + # command (#1206/#1275), so this branch should be unreachable. If a + # collision ever recurs, fail loudly with a clean 404 rather than a 500 + # or a silent .first() pick (which would make one namesake permanently + # unreachable and could surface the wrong person). The fix is always to + # re-run `manage.py recompute_url_names`, not to guess a winner here. + _logger.error( + f"Multiple people share url_name={member_name!r} — url_name uniqueness is " + f"broken; run `manage.py recompute_url_names`. Returning 404.") + raise Http404("No unique person matches the given query.") except Http404: _logger.debug(f"{member_name} not found for url_name, looking for closest match in database") closest_urlname = get_closest_urlname_in_database(member_name)