fix: single-pass markdown rendering for correct TOC and heading IDs (#1323)

RYGRIT · RYGRIT · commit 57d6c4bd94bc · 2026-02-28T17:31:59.000+08:00
diff --git a/server/utils/readme.ts b/server/utils/readme.ts
@@ -251,8 +251,12 @@ function resolveUrl(url: string, packageName: string, repoInfo?: RepositoryInfo)
   if (!url) return url
   if (url.startsWith('#')) {
     // Prefix anchor links to match heading IDs (avoids collision with page IDs)
+    // Idempotent: don't double-prefix if already prefixed
+    if (url.startsWith('#user-content-')) return url
     return `#user-content-${url.slice(1)}`
   }
+  // Absolute paths (e.g. /package/foo from a previous npmjs redirect) are already resolved
+  if (url.startsWith('/')) return url
   if (hasProtocol(url, { acceptRelative: true })) {
     try {
       const parsed = new URL(url, 'https://example.com')
@@ -381,37 +385,52 @@ export async function renderReadmeHtml(
   // So README starts at h3, and we ensure no levels are skipped
   // Visual styling preserved via data-level attribute (original depth)
   let lastSemanticLevel = 2 // Start after h2 (the "Readme" section heading)
-  renderer.heading = function ({ tokens, depth }: Tokens.Heading) {
-    // Calculate the target semantic level based on document structure
-    // Start at h3 (since page h1 + section h2 already exist)
-    // But ensure we never skip levels - can only go down by 1 or stay same/go up
+
+  // Shared heading processing for both markdown and HTML headings
+  function processHeading(depth: number, plainText: string) {
     const semanticLevel = calculateSemanticDepth(depth, lastSemanticLevel)
     lastSemanticLevel = semanticLevel
-    const text = this.parser.parseInline(tokens)
 
-    // Generate GitHub-style slug for anchor links
-    let slug = slugify(text)
-    if (!slug) slug = 'heading' // Fallback for empty headings
+    let slug = slugify(plainText)
+    if (!slug) slug = 'heading'
 
-    // Handle duplicate slugs (GitHub-style: foo, foo-1, foo-2)
     const count = usedSlugs.get(slug) ?? 0
     usedSlugs.set(slug, count + 1)
     const uniqueSlug = count === 0 ? slug : `${slug}-${count}`
-
-    // Prefix with 'user-content-' to avoid collisions with page IDs
-    // (e.g., #install, #dependencies, #versions are used by the package page)
     const id = `user-content-${uniqueSlug}`
 
-    // Collect TOC item with plain text (HTML stripped, entities decoded)
-    const plainText = decodeHtmlEntities(stripHtmlTags(text).trim())
     if (plainText) {
       toc.push({ text: plainText, id, depth })
     }
 
-    /** The link href uses the unique slug WITHOUT the 'user-content-' prefix, because that will later be added for all links. */
     return `<h${semanticLevel} id="${id}" data-level="${depth}"><a href="#${uniqueSlug}">${plainText}</a></h${semanticLevel}>\n`
   }
 
+  renderer.heading = function ({ tokens, depth }: Tokens.Heading) {
+    const text = this.parser.parseInline(tokens)
+    const plainText = decodeHtmlEntities(stripHtmlTags(text).trim())
+    return processHeading(depth, plainText)
+  }
+
+  // Intercept HTML headings so they get id, TOC entry, and correct semantic level.
+  // Also intercept raw HTML <a> tags so playground links are collected in the same pass.
+  const htmlHeadingRe = /<h([1-6])(\s[^>]*)?>([\s\S]*?)<\/h\1>/gi
+  const htmlAnchorRe = /<a\s[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi
+  renderer.html = function ({ text }: Tokens.HTML) {
+    let result = text.replace(htmlHeadingRe, (_, level, _attrs, inner) => {
+      const depth = parseInt(level)
+      const plainText = decodeHtmlEntities(stripHtmlTags(inner).trim())
+      return processHeading(depth, plainText).trimEnd()
+    })
+    // Process raw HTML <a> tags for playground link collection and URL resolution
+    result = result.replace(htmlAnchorRe, (_full, href, inner) => {
+      const label = decodeHtmlEntities(stripHtmlTags(inner).trim())
+      const { resolvedHref, extraAttrs } = processLink(href, label)
+      return `<a href="${resolvedHref}"${extraAttrs}>${inner}</a>`
+    })
+    return result
+  }
+
   // Syntax highlighting for code blocks (uses shared highlighter)
   renderer.code = ({ text, lang }: Tokens.Code) => {
     const html = highlightCodeSync(shiki, text, lang || 'text')
@@ -433,7 +452,35 @@ ${html}
     return `<img src="${resolvedHref}"${altAttr}${titleAttr}>`
   }
 
+  // Helper: resolve a link href, collect playground links, and build <a> attributes.
+  // Used by both the markdown renderer.link and the HTML <a> interceptor so that
+  // all link processing happens in a single pass during marked rendering.
+  function processLink(href: string, label: string): { resolvedHref: string; extraAttrs: string } {
+    const resolvedHref = resolveUrl(href, packageName, repoInfo)
+
+    // Collect playground links
+    const provider = matchPlaygroundProvider(resolvedHref)
+    if (provider && !seenUrls.has(resolvedHref)) {
+      seenUrls.add(resolvedHref)
+      collectedLinks.push({
+        url: resolvedHref,
+        provider: provider.id,
+        providerName: provider.name,
+        label: decodeHtmlEntities(label || provider.name),
+      })
+    }
+
+    // Security attributes for external links
+    let extraAttrs = ''
+    if (resolvedHref && hasProtocol(resolvedHref, { acceptRelative: true })) {
+      extraAttrs = ' rel="nofollow noreferrer noopener" target="_blank"'
+    }
+
+    return { resolvedHref, extraAttrs }
+  }
+
   // Resolve link URLs, add security attributes, and collect playground links
+  // — all in a single pass during marked rendering (no deferred processing)
   renderer.link = function ({ href, title, tokens }: Tokens.Link) {
     const text = this.parser.parseInline(tokens)
     const titleAttr = title ? ` title="${title}"` : ''
@@ -444,10 +491,9 @@ ${html}
       plainText = tokens[0].text
     }
 
-    const intermediateTitleAttr =
-      plainText || title ? ` data-title-intermediate="${plainText || title}"` : ''
+    const { resolvedHref, extraAttrs } = processLink(href, plainText || title || '')
 
-    return `<a href="${href}"${titleAttr}${intermediateTitleAttr}>${text}</a>`
+    return `<a href="${resolvedHref}"${titleAttr}${extraAttrs}>${text}</a>`
   }
 
   // GitHub-style callouts: > [!NOTE], > [!TIP], etc.
@@ -475,26 +521,32 @@ ${html}
     allowedSchemes: ['http', 'https', 'mailto'],
     // Transform img src URLs (GitHub blob → raw, relative → GitHub raw)
     transformTags: {
+      // Headings are already processed to correct semantic levels by processHeading()
+      // during the marked rendering pass. The sanitizer just needs to preserve them.
+      // For any stray headings that didn't go through processHeading (shouldn't happen),
+      // we still apply a safe fallback shift.
       h1: (_, attribs) => {
+        if (attribs['data-level']) return { tagName: 'h1', attribs }
         return { tagName: 'h3', attribs: { ...attribs, 'data-level': '1' } }
       },
       h2: (_, attribs) => {
+        if (attribs['data-level']) return { tagName: 'h2', attribs }
         return { tagName: 'h4', attribs: { ...attribs, 'data-level': '2' } }
       },
       h3: (_, attribs) => {
-        if (attribs['data-level']) return { tagName: 'h3', attribs: attribs }
+        if (attribs['data-level']) return { tagName: 'h3', attribs }
         return { tagName: 'h5', attribs: { ...attribs, 'data-level': '3' } }
       },
       h4: (_, attribs) => {
-        if (attribs['data-level']) return { tagName: 'h4', attribs: attribs }
+        if (attribs['data-level']) return { tagName: 'h4', attribs }
         return { tagName: 'h6', attribs: { ...attribs, 'data-level': '4' } }
       },
       h5: (_, attribs) => {
-        if (attribs['data-level']) return { tagName: 'h5', attribs: attribs }
+        if (attribs['data-level']) return { tagName: 'h5', attribs }
         return { tagName: 'h6', attribs: { ...attribs, 'data-level': '5' } }
       },
       h6: (_, attribs) => {
-        if (attribs['data-level']) return { tagName: 'h6', attribs: attribs }
+        if (attribs['data-level']) return { tagName: 'h6', attribs }
         return { tagName: 'h6', attribs: { ...attribs, 'data-level': '6' } }
       },
       img: (tagName, attribs) => {
@@ -522,31 +574,34 @@ ${html}
         }
         return { tagName, attribs }
       },
+      // Markdown links are fully processed in renderer.link (single-pass).
+      // However, inline HTML <a> tags inside paragraphs are NOT seen by
+      // renderer.html (marked parses them as paragraph tokens, not html tokens).
+      // So we still need to collect playground links here for those cases.
+      // The seenUrls set ensures no duplicates across both paths.
       a: (tagName, attribs) => {
         if (!attribs.href) {
           return { tagName, attribs }
         }
 
         const resolvedHref = resolveUrl(attribs.href, packageName, repoInfo)
 
+        // Collect playground links from inline HTML <a> tags that weren't
+        // caught by renderer.link or renderer.html
         const provider = matchPlaygroundProvider(resolvedHref)
         if (provider && !seenUrls.has(resolvedHref)) {
           seenUrls.add(resolvedHref)
-
           collectedLinks.push({
             url: resolvedHref,
             provider: provider.id,
             providerName: provider.name,
-            /**
-             * We need to set some data attribute before hand because `transformTags` doesn't
-             * provide the text of the element. This will automatically be removed, because there
-             * is an allow list for link attributes.
-             * */
-            label: decodeHtmlEntities(attribs['data-title-intermediate'] || provider.name),
+            // sanitize-html transformTags doesn't provide element text content,
+            // so we fall back to the provider name for the label
+            label: provider.name,
           })
         }
 
-        // Add security attributes for external links
+        // Add security attributes for external links (idempotent)
         if (resolvedHref && hasProtocol(resolvedHref, { acceptRelative: true })) {
           attribs.rel = 'nofollow noreferrer noopener'
           attribs.target = '_blank'
diff --git a/test/unit/server/utils/readme.spec.ts b/test/unit/server/utils/readme.spec.ts
@@ -475,4 +475,185 @@ describe('HTML output', () => {
 <p>Some <strong>bold</strong> text and a <a href="https://example.com" rel="nofollow noreferrer noopener" target="_blank">link</a>.</p>
 `)
   })
+
+  it('adds id to raw HTML headings', async () => {
+    const result = await renderReadmeHtml('<h1>Title</h1>', 'test-pkg')
+    expect(result.html).toContain('id="user-content-title"')
+    expect(result.toc).toHaveLength(1)
+    expect(result.toc[0]).toMatchObject({ text: 'Title', depth: 1 })
+  })
+
+  it('adds id to HTML heading in multi-element token', async () => {
+    const md = '<h1 align="center">My Package</h1>\n<p align="center">A description</p>'
+    const result = await renderReadmeHtml(md, 'test-pkg')
+    expect(result.html).toContain('id="user-content-my-package"')
+    expect(result.toc[0]).toMatchObject({ text: 'My Package', depth: 1 })
+  })
+
+  it('handles duplicate raw HTML heading slugs', async () => {
+    const md = '<h2>API</h2>\n\n<h2>API</h2>'
+    const result = await renderReadmeHtml(md, 'test-pkg')
+    expect(result.html).toContain('id="user-content-api"')
+    expect(result.html).toContain('id="user-content-api-1"')
+  })
+})
+
+/**
+ * Tests for issue #1323: single-pass markdown rendering behavior.
+ *
+ * The core concern is that mixing markdown headings and raw HTML headings
+ * must produce TOC entries, heading IDs, and duplicate-slug suffixes in
+ * exact document order — the same as GitHub does.
+ *
+ * If the implementation processes markdown headings in one pass and HTML
+ * headings in a separate (later) pass, the ordering will be wrong.
+ */
+describe('Issue #1323 — single-pass rendering correctness', () => {
+  describe('mixed markdown + HTML headings: TOC order and IDs', () => {
+    it('produces TOC entries in document order when markdown and HTML headings are interleaved', async () => {
+      // This is the core scenario from the issue: HTML headings appear
+      // between markdown headings. A two-pass approach would collect all
+      // markdown headings first, then HTML headings — scrambling the order.
+      const md = [
+        '# First (markdown)',
+        '',
+        '<h2>Second (html)</h2>',
+        '',
+        '## Third (markdown)',
+        '',
+        '<h2>Fourth (html)</h2>',
+        '',
+        '## Fifth (markdown)',
+      ].join('\n')
+
+      const result = await renderReadmeHtml(md, 'test-pkg')
+
+      // TOC must reflect exact document order
+      expect(result.toc).toHaveLength(5)
+      expect(result.toc[0]!.text).toBe('First (markdown)')
+      expect(result.toc[1]!.text).toBe('Second (html)')
+      expect(result.toc[2]!.text).toBe('Third (markdown)')
+      expect(result.toc[3]!.text).toBe('Fourth (html)')
+      expect(result.toc[4]!.text).toBe('Fifth (markdown)')
+    })
+
+    it('assigns duplicate-slug suffixes in document order across mixed heading types', async () => {
+      // Two markdown "API" headings with an HTML "API" heading in between.
+      // Correct: api, api-1, api-2 in that order.
+      // If HTML headings are processed in a separate pass, the HTML one
+      // could get suffix -2 while the last markdown one gets -1.
+      const md = ['## API', '', '<h2>API</h2>', '', '## API'].join('\n')
+
+      const result = await renderReadmeHtml(md, 'test-pkg')
+
+      expect(result.toc).toHaveLength(3)
+      expect(result.toc[0]!.id).toBe('user-content-api')
+      expect(result.toc[1]!.id).toBe('user-content-api-1')
+      expect(result.toc[2]!.id).toBe('user-content-api-2')
+
+      // The HTML output must also have these IDs in order
+      const ids = Array.from(result.html.matchAll(/id="(user-content-api(?:-\d+)?)"/g), m => m[1])
+      expect(ids).toEqual(['user-content-api', 'user-content-api-1', 'user-content-api-2'])
+    })
+
+    it('heading semantic levels are sequential even when mixing heading types', async () => {
+      // h1 (md) → h3, h3 (html) → should be h4 (max = lastSemantic + 1),
+      // not jump to h5 or h6 because it was processed in a later pass.
+      const md = ['# Title', '', '<h3>Subsection</h3>', '', '#### Deep'].join('\n')
+
+      const result = await renderReadmeHtml(md, 'test-pkg')
+
+      // Extract semantic tags in order from the HTML
+      const tags = Array.from(result.html.matchAll(/<h(\d)/g), m => Number(m[1]))
+      // h1→h3, h3→h4 (sequential after h3), h4→h5 (sequential after h4)
+      expect(tags).toEqual([3, 4, 5])
+    })
+  })
+
+  describe('HTML heading between markdown headings — ID and href consistency', () => {
+    it('heading id and its anchor href point to the same slug', async () => {
+      const md = ['# Introduction', '', '<h2>Getting Started</h2>', '', '## Installation'].join(
+        '\n',
+      )
+
+      const result = await renderReadmeHtml(md, 'test-pkg')
+
+      // For every heading, the slug used in id="user-content-{slug}" must
+      // match the slug in the child anchor href="#user-content-{slug}"
+      // (resolveUrl prefixes # anchors with user-content-).
+      const headingPairs = [
+        ...result.html.matchAll(/id="user-content-([^"]+)"[^>]*><a href="#user-content-([^"]+)">/g),
+      ]
+      expect(headingPairs.length).toBeGreaterThan(0)
+      for (const match of headingPairs) {
+        // slug portion must be identical
+        expect(match[1]).toBe(match[2])
+      }
+    })
+  })
+
+  describe('playground links collected from HTML <a> tags in single pass', () => {
+    it('collects playground links from raw HTML anchor tags', async () => {
+      // Some READMEs (like eslint's sponsor section) use raw HTML <a> tags
+      // instead of markdown link syntax. These must also be picked up.
+      const md = [
+        '# My Package',
+        '',
+        '<a href="https://stackblitz.com/edit/my-demo">Open in StackBlitz</a>',
+        '',
+        'Some text with a [CodeSandbox link](https://codesandbox.io/s/example)',
+      ].join('\n')
+
+      const result = await renderReadmeHtml(md, 'test-pkg')
+
+      // Both playground links should be collected regardless of syntax
+      const providers = result.playgroundLinks.map(l => l.provider)
+      expect(providers).toContain('stackblitz')
+      expect(providers).toContain('codesandbox')
+    })
+  })
+
+  describe('complex real-world interleaving (atproxy-like)', () => {
+    it('handles a README with HTML h1 followed by markdown h2 and mixed content', async () => {
+      // Simulates a pattern like atproxy's README where h1 is HTML
+      // and subsequent headings are markdown
+      const md = [
+        '<h1 align="center">atproxy</h1>',
+        '<p align="center">A cool proxy library</p>',
+        '',
+        '## Features',
+        '',
+        '- Fast',
+        '- Simple',
+        '',
+        '## Installation',
+        '',
+        '```bash',
+        'npm install atproxy',
+        '```',
+        '',
+        '<h2>Advanced Usage</h2>',
+        '',
+        '## API',
+        '',
+        '### Methods',
+      ].join('\n')
+
+      const result = await renderReadmeHtml(md, 'test-pkg')
+
+      // TOC order must be: atproxy, Features, Installation, Advanced Usage, API, Methods
+      expect(result.toc.map(t => t.text)).toEqual([
+        'atproxy',
+        'Features',
+        'Installation',
+        'Advanced Usage',
+        'API',
+        'Methods',
+      ])
+
+      // All IDs should be unique
+      const ids = result.toc.map(t => t.id)
+      expect(new Set(ids).size).toBe(ids.length)
+    })
+  })
 })