Skip to content

Commit 57d6c4b

Browse files
committed
fix: single-pass markdown rendering for correct TOC and heading IDs (#1323)
1 parent 85ac3d7 commit 57d6c4b

2 files changed

Lines changed: 266 additions & 30 deletions

File tree

server/utils/readme.ts

Lines changed: 85 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -251,8 +251,12 @@ function resolveUrl(url: string, packageName: string, repoInfo?: RepositoryInfo)
251251
if (!url) return url
252252
if (url.startsWith('#')) {
253253
// Prefix anchor links to match heading IDs (avoids collision with page IDs)
254+
// Idempotent: don't double-prefix if already prefixed
255+
if (url.startsWith('#user-content-')) return url
254256
return `#user-content-${url.slice(1)}`
255257
}
258+
// Absolute paths (e.g. /package/foo from a previous npmjs redirect) are already resolved
259+
if (url.startsWith('/')) return url
256260
if (hasProtocol(url, { acceptRelative: true })) {
257261
try {
258262
const parsed = new URL(url, 'https://example.com')
@@ -381,37 +385,52 @@ export async function renderReadmeHtml(
381385
// So README starts at h3, and we ensure no levels are skipped
382386
// Visual styling preserved via data-level attribute (original depth)
383387
let lastSemanticLevel = 2 // Start after h2 (the "Readme" section heading)
384-
renderer.heading = function ({ tokens, depth }: Tokens.Heading) {
385-
// Calculate the target semantic level based on document structure
386-
// Start at h3 (since page h1 + section h2 already exist)
387-
// But ensure we never skip levels - can only go down by 1 or stay same/go up
388+
389+
// Shared heading processing for both markdown and HTML headings
390+
function processHeading(depth: number, plainText: string) {
388391
const semanticLevel = calculateSemanticDepth(depth, lastSemanticLevel)
389392
lastSemanticLevel = semanticLevel
390-
const text = this.parser.parseInline(tokens)
391393

392-
// Generate GitHub-style slug for anchor links
393-
let slug = slugify(text)
394-
if (!slug) slug = 'heading' // Fallback for empty headings
394+
let slug = slugify(plainText)
395+
if (!slug) slug = 'heading'
395396

396-
// Handle duplicate slugs (GitHub-style: foo, foo-1, foo-2)
397397
const count = usedSlugs.get(slug) ?? 0
398398
usedSlugs.set(slug, count + 1)
399399
const uniqueSlug = count === 0 ? slug : `${slug}-${count}`
400-
401-
// Prefix with 'user-content-' to avoid collisions with page IDs
402-
// (e.g., #install, #dependencies, #versions are used by the package page)
403400
const id = `user-content-${uniqueSlug}`
404401

405-
// Collect TOC item with plain text (HTML stripped, entities decoded)
406-
const plainText = decodeHtmlEntities(stripHtmlTags(text).trim())
407402
if (plainText) {
408403
toc.push({ text: plainText, id, depth })
409404
}
410405

411-
/** The link href uses the unique slug WITHOUT the 'user-content-' prefix, because that will later be added for all links. */
412406
return `<h${semanticLevel} id="${id}" data-level="${depth}"><a href="#${uniqueSlug}">${plainText}</a></h${semanticLevel}>\n`
413407
}
414408

409+
renderer.heading = function ({ tokens, depth }: Tokens.Heading) {
410+
const text = this.parser.parseInline(tokens)
411+
const plainText = decodeHtmlEntities(stripHtmlTags(text).trim())
412+
return processHeading(depth, plainText)
413+
}
414+
415+
// Intercept HTML headings so they get id, TOC entry, and correct semantic level.
416+
// Also intercept raw HTML <a> tags so playground links are collected in the same pass.
417+
const htmlHeadingRe = /<h([1-6])(\s[^>]*)?>([\s\S]*?)<\/h\1>/gi
418+
const htmlAnchorRe = /<a\s[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi
419+
renderer.html = function ({ text }: Tokens.HTML) {
420+
let result = text.replace(htmlHeadingRe, (_, level, _attrs, inner) => {
421+
const depth = parseInt(level)
422+
const plainText = decodeHtmlEntities(stripHtmlTags(inner).trim())
423+
return processHeading(depth, plainText).trimEnd()
424+
})
425+
// Process raw HTML <a> tags for playground link collection and URL resolution
426+
result = result.replace(htmlAnchorRe, (_full, href, inner) => {
427+
const label = decodeHtmlEntities(stripHtmlTags(inner).trim())
428+
const { resolvedHref, extraAttrs } = processLink(href, label)
429+
return `<a href="${resolvedHref}"${extraAttrs}>${inner}</a>`
430+
})
431+
return result
432+
}
433+
415434
// Syntax highlighting for code blocks (uses shared highlighter)
416435
renderer.code = ({ text, lang }: Tokens.Code) => {
417436
const html = highlightCodeSync(shiki, text, lang || 'text')
@@ -433,7 +452,35 @@ ${html}
433452
return `<img src="${resolvedHref}"${altAttr}${titleAttr}>`
434453
}
435454

455+
// Helper: resolve a link href, collect playground links, and build <a> attributes.
456+
// Used by both the markdown renderer.link and the HTML <a> interceptor so that
457+
// all link processing happens in a single pass during marked rendering.
458+
function processLink(href: string, label: string): { resolvedHref: string; extraAttrs: string } {
459+
const resolvedHref = resolveUrl(href, packageName, repoInfo)
460+
461+
// Collect playground links
462+
const provider = matchPlaygroundProvider(resolvedHref)
463+
if (provider && !seenUrls.has(resolvedHref)) {
464+
seenUrls.add(resolvedHref)
465+
collectedLinks.push({
466+
url: resolvedHref,
467+
provider: provider.id,
468+
providerName: provider.name,
469+
label: decodeHtmlEntities(label || provider.name),
470+
})
471+
}
472+
473+
// Security attributes for external links
474+
let extraAttrs = ''
475+
if (resolvedHref && hasProtocol(resolvedHref, { acceptRelative: true })) {
476+
extraAttrs = ' rel="nofollow noreferrer noopener" target="_blank"'
477+
}
478+
479+
return { resolvedHref, extraAttrs }
480+
}
481+
436482
// Resolve link URLs, add security attributes, and collect playground links
483+
// — all in a single pass during marked rendering (no deferred processing)
437484
renderer.link = function ({ href, title, tokens }: Tokens.Link) {
438485
const text = this.parser.parseInline(tokens)
439486
const titleAttr = title ? ` title="${title}"` : ''
@@ -444,10 +491,9 @@ ${html}
444491
plainText = tokens[0].text
445492
}
446493

447-
const intermediateTitleAttr =
448-
plainText || title ? ` data-title-intermediate="${plainText || title}"` : ''
494+
const { resolvedHref, extraAttrs } = processLink(href, plainText || title || '')
449495

450-
return `<a href="${href}"${titleAttr}${intermediateTitleAttr}>${text}</a>`
496+
return `<a href="${resolvedHref}"${titleAttr}${extraAttrs}>${text}</a>`
451497
}
452498

453499
// GitHub-style callouts: > [!NOTE], > [!TIP], etc.
@@ -475,26 +521,32 @@ ${html}
475521
allowedSchemes: ['http', 'https', 'mailto'],
476522
// Transform img src URLs (GitHub blob → raw, relative → GitHub raw)
477523
transformTags: {
524+
// Headings are already processed to correct semantic levels by processHeading()
525+
// during the marked rendering pass. The sanitizer just needs to preserve them.
526+
// For any stray headings that didn't go through processHeading (shouldn't happen),
527+
// we still apply a safe fallback shift.
478528
h1: (_, attribs) => {
529+
if (attribs['data-level']) return { tagName: 'h1', attribs }
479530
return { tagName: 'h3', attribs: { ...attribs, 'data-level': '1' } }
480531
},
481532
h2: (_, attribs) => {
533+
if (attribs['data-level']) return { tagName: 'h2', attribs }
482534
return { tagName: 'h4', attribs: { ...attribs, 'data-level': '2' } }
483535
},
484536
h3: (_, attribs) => {
485-
if (attribs['data-level']) return { tagName: 'h3', attribs: attribs }
537+
if (attribs['data-level']) return { tagName: 'h3', attribs }
486538
return { tagName: 'h5', attribs: { ...attribs, 'data-level': '3' } }
487539
},
488540
h4: (_, attribs) => {
489-
if (attribs['data-level']) return { tagName: 'h4', attribs: attribs }
541+
if (attribs['data-level']) return { tagName: 'h4', attribs }
490542
return { tagName: 'h6', attribs: { ...attribs, 'data-level': '4' } }
491543
},
492544
h5: (_, attribs) => {
493-
if (attribs['data-level']) return { tagName: 'h5', attribs: attribs }
545+
if (attribs['data-level']) return { tagName: 'h5', attribs }
494546
return { tagName: 'h6', attribs: { ...attribs, 'data-level': '5' } }
495547
},
496548
h6: (_, attribs) => {
497-
if (attribs['data-level']) return { tagName: 'h6', attribs: attribs }
549+
if (attribs['data-level']) return { tagName: 'h6', attribs }
498550
return { tagName: 'h6', attribs: { ...attribs, 'data-level': '6' } }
499551
},
500552
img: (tagName, attribs) => {
@@ -522,31 +574,34 @@ ${html}
522574
}
523575
return { tagName, attribs }
524576
},
577+
// Markdown links are fully processed in renderer.link (single-pass).
578+
// However, inline HTML <a> tags inside paragraphs are NOT seen by
579+
// renderer.html (marked parses them as paragraph tokens, not html tokens).
580+
// So we still need to collect playground links here for those cases.
581+
// The seenUrls set ensures no duplicates across both paths.
525582
a: (tagName, attribs) => {
526583
if (!attribs.href) {
527584
return { tagName, attribs }
528585
}
529586

530587
const resolvedHref = resolveUrl(attribs.href, packageName, repoInfo)
531588

589+
// Collect playground links from inline HTML <a> tags that weren't
590+
// caught by renderer.link or renderer.html
532591
const provider = matchPlaygroundProvider(resolvedHref)
533592
if (provider && !seenUrls.has(resolvedHref)) {
534593
seenUrls.add(resolvedHref)
535-
536594
collectedLinks.push({
537595
url: resolvedHref,
538596
provider: provider.id,
539597
providerName: provider.name,
540-
/**
541-
* We need to set some data attribute before hand because `transformTags` doesn't
542-
* provide the text of the element. This will automatically be removed, because there
543-
* is an allow list for link attributes.
544-
* */
545-
label: decodeHtmlEntities(attribs['data-title-intermediate'] || provider.name),
598+
// sanitize-html transformTags doesn't provide element text content,
599+
// so we fall back to the provider name for the label
600+
label: provider.name,
546601
})
547602
}
548603

549-
// Add security attributes for external links
604+
// Add security attributes for external links (idempotent)
550605
if (resolvedHref && hasProtocol(resolvedHref, { acceptRelative: true })) {
551606
attribs.rel = 'nofollow noreferrer noopener'
552607
attribs.target = '_blank'

test/unit/server/utils/readme.spec.ts

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -475,4 +475,185 @@ describe('HTML output', () => {
475475
<p>Some <strong>bold</strong> text and a <a href="https://example.com" rel="nofollow noreferrer noopener" target="_blank">link</a>.</p>
476476
`)
477477
})
478+
479+
it('adds id to raw HTML headings', async () => {
480+
const result = await renderReadmeHtml('<h1>Title</h1>', 'test-pkg')
481+
expect(result.html).toContain('id="user-content-title"')
482+
expect(result.toc).toHaveLength(1)
483+
expect(result.toc[0]).toMatchObject({ text: 'Title', depth: 1 })
484+
})
485+
486+
it('adds id to HTML heading in multi-element token', async () => {
487+
const md = '<h1 align="center">My Package</h1>\n<p align="center">A description</p>'
488+
const result = await renderReadmeHtml(md, 'test-pkg')
489+
expect(result.html).toContain('id="user-content-my-package"')
490+
expect(result.toc[0]).toMatchObject({ text: 'My Package', depth: 1 })
491+
})
492+
493+
it('handles duplicate raw HTML heading slugs', async () => {
494+
const md = '<h2>API</h2>\n\n<h2>API</h2>'
495+
const result = await renderReadmeHtml(md, 'test-pkg')
496+
expect(result.html).toContain('id="user-content-api"')
497+
expect(result.html).toContain('id="user-content-api-1"')
498+
})
499+
})
500+
501+
/**
502+
* Tests for issue #1323: single-pass markdown rendering behavior.
503+
*
504+
* The core concern is that mixing markdown headings and raw HTML headings
505+
* must produce TOC entries, heading IDs, and duplicate-slug suffixes in
506+
* exact document order — the same as GitHub does.
507+
*
508+
* If the implementation processes markdown headings in one pass and HTML
509+
* headings in a separate (later) pass, the ordering will be wrong.
510+
*/
511+
describe('Issue #1323 — single-pass rendering correctness', () => {
512+
describe('mixed markdown + HTML headings: TOC order and IDs', () => {
513+
it('produces TOC entries in document order when markdown and HTML headings are interleaved', async () => {
514+
// This is the core scenario from the issue: HTML headings appear
515+
// between markdown headings. A two-pass approach would collect all
516+
// markdown headings first, then HTML headings — scrambling the order.
517+
const md = [
518+
'# First (markdown)',
519+
'',
520+
'<h2>Second (html)</h2>',
521+
'',
522+
'## Third (markdown)',
523+
'',
524+
'<h2>Fourth (html)</h2>',
525+
'',
526+
'## Fifth (markdown)',
527+
].join('\n')
528+
529+
const result = await renderReadmeHtml(md, 'test-pkg')
530+
531+
// TOC must reflect exact document order
532+
expect(result.toc).toHaveLength(5)
533+
expect(result.toc[0]!.text).toBe('First (markdown)')
534+
expect(result.toc[1]!.text).toBe('Second (html)')
535+
expect(result.toc[2]!.text).toBe('Third (markdown)')
536+
expect(result.toc[3]!.text).toBe('Fourth (html)')
537+
expect(result.toc[4]!.text).toBe('Fifth (markdown)')
538+
})
539+
540+
it('assigns duplicate-slug suffixes in document order across mixed heading types', async () => {
541+
// Two markdown "API" headings with an HTML "API" heading in between.
542+
// Correct: api, api-1, api-2 in that order.
543+
// If HTML headings are processed in a separate pass, the HTML one
544+
// could get suffix -2 while the last markdown one gets -1.
545+
const md = ['## API', '', '<h2>API</h2>', '', '## API'].join('\n')
546+
547+
const result = await renderReadmeHtml(md, 'test-pkg')
548+
549+
expect(result.toc).toHaveLength(3)
550+
expect(result.toc[0]!.id).toBe('user-content-api')
551+
expect(result.toc[1]!.id).toBe('user-content-api-1')
552+
expect(result.toc[2]!.id).toBe('user-content-api-2')
553+
554+
// The HTML output must also have these IDs in order
555+
const ids = Array.from(result.html.matchAll(/id="(user-content-api(?:-\d+)?)"/g), m => m[1])
556+
expect(ids).toEqual(['user-content-api', 'user-content-api-1', 'user-content-api-2'])
557+
})
558+
559+
it('heading semantic levels are sequential even when mixing heading types', async () => {
560+
// h1 (md) → h3, h3 (html) → should be h4 (max = lastSemantic + 1),
561+
// not jump to h5 or h6 because it was processed in a later pass.
562+
const md = ['# Title', '', '<h3>Subsection</h3>', '', '#### Deep'].join('\n')
563+
564+
const result = await renderReadmeHtml(md, 'test-pkg')
565+
566+
// Extract semantic tags in order from the HTML
567+
const tags = Array.from(result.html.matchAll(/<h(\d)/g), m => Number(m[1]))
568+
// h1→h3, h3→h4 (sequential after h3), h4→h5 (sequential after h4)
569+
expect(tags).toEqual([3, 4, 5])
570+
})
571+
})
572+
573+
describe('HTML heading between markdown headings — ID and href consistency', () => {
574+
it('heading id and its anchor href point to the same slug', async () => {
575+
const md = ['# Introduction', '', '<h2>Getting Started</h2>', '', '## Installation'].join(
576+
'\n',
577+
)
578+
579+
const result = await renderReadmeHtml(md, 'test-pkg')
580+
581+
// For every heading, the slug used in id="user-content-{slug}" must
582+
// match the slug in the child anchor href="#user-content-{slug}"
583+
// (resolveUrl prefixes # anchors with user-content-).
584+
const headingPairs = [
585+
...result.html.matchAll(/id="user-content-([^"]+)"[^>]*><a href="#user-content-([^"]+)">/g),
586+
]
587+
expect(headingPairs.length).toBeGreaterThan(0)
588+
for (const match of headingPairs) {
589+
// slug portion must be identical
590+
expect(match[1]).toBe(match[2])
591+
}
592+
})
593+
})
594+
595+
describe('playground links collected from HTML <a> tags in single pass', () => {
596+
it('collects playground links from raw HTML anchor tags', async () => {
597+
// Some READMEs (like eslint's sponsor section) use raw HTML <a> tags
598+
// instead of markdown link syntax. These must also be picked up.
599+
const md = [
600+
'# My Package',
601+
'',
602+
'<a href="https://stackblitz.com/edit/my-demo">Open in StackBlitz</a>',
603+
'',
604+
'Some text with a [CodeSandbox link](https://codesandbox.io/s/example)',
605+
].join('\n')
606+
607+
const result = await renderReadmeHtml(md, 'test-pkg')
608+
609+
// Both playground links should be collected regardless of syntax
610+
const providers = result.playgroundLinks.map(l => l.provider)
611+
expect(providers).toContain('stackblitz')
612+
expect(providers).toContain('codesandbox')
613+
})
614+
})
615+
616+
describe('complex real-world interleaving (atproxy-like)', () => {
617+
it('handles a README with HTML h1 followed by markdown h2 and mixed content', async () => {
618+
// Simulates a pattern like atproxy's README where h1 is HTML
619+
// and subsequent headings are markdown
620+
const md = [
621+
'<h1 align="center">atproxy</h1>',
622+
'<p align="center">A cool proxy library</p>',
623+
'',
624+
'## Features',
625+
'',
626+
'- Fast',
627+
'- Simple',
628+
'',
629+
'## Installation',
630+
'',
631+
'```bash',
632+
'npm install atproxy',
633+
'```',
634+
'',
635+
'<h2>Advanced Usage</h2>',
636+
'',
637+
'## API',
638+
'',
639+
'### Methods',
640+
].join('\n')
641+
642+
const result = await renderReadmeHtml(md, 'test-pkg')
643+
644+
// TOC order must be: atproxy, Features, Installation, Advanced Usage, API, Methods
645+
expect(result.toc.map(t => t.text)).toEqual([
646+
'atproxy',
647+
'Features',
648+
'Installation',
649+
'Advanced Usage',
650+
'API',
651+
'Methods',
652+
])
653+
654+
// All IDs should be unique
655+
const ids = result.toc.map(t => t.id)
656+
expect(new Set(ids).size).toBe(ids.length)
657+
})
658+
})
478659
})

0 commit comments

Comments
 (0)