Skip to content

Commit a697f15

Browse files
RYGRITghostdevv
andauthored
fix: change readme md renderer behavior (#1776)
Co-authored-by: Willow (GHOST) <git@willow.sh>
1 parent d2b21b1 commit a697f15

File tree

2 files changed

+427
-37
lines changed

2 files changed

+427
-37
lines changed

server/utils/readme.ts

Lines changed: 148 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ const ALLOWED_ATTR: Record<string, string[]> = {
210210
* - Collapse multiple hyphens
211211
*/
212212
function slugify(text: string): string {
213-
return stripHtmlTags(text)
213+
return decodeHtmlEntities(stripHtmlTags(text))
214214
.toLowerCase()
215215
.trim()
216216
.replace(/\s+/g, '-') // Spaces to hyphens
@@ -219,6 +219,14 @@ function slugify(text: string): string {
219219
.replace(/^-|-$/g, '') // Trim leading/trailing hyphens
220220
}
221221

222+
function getHeadingPlainText(text: string): string {
223+
return decodeHtmlEntities(stripHtmlTags(text).trim())
224+
}
225+
226+
function getHeadingSlugSource(text: string): string {
227+
return stripHtmlTags(text).trim()
228+
}
229+
222230
/**
223231
* Lazy ATX heading extension for marked: allows headings without a space after `#`.
224232
*
@@ -273,6 +281,30 @@ const reservedPathsNpmJs = [
273281

274282
const npmJsHosts = new Set(['www.npmjs.com', 'npmjs.com', 'www.npmjs.org', 'npmjs.org'])
275283

284+
const USER_CONTENT_PREFIX = 'user-content-'
285+
286+
function withUserContentPrefix(value: string): string {
287+
return value.startsWith(USER_CONTENT_PREFIX) ? value : `${USER_CONTENT_PREFIX}${value}`
288+
}
289+
290+
function toUserContentId(value: string): string {
291+
return `${USER_CONTENT_PREFIX}${value}`
292+
}
293+
294+
function toUserContentHash(value: string): string {
295+
return `#${withUserContentPrefix(value)}`
296+
}
297+
298+
function normalizePreservedAnchorAttrs(attrs: string): string {
299+
const cleanedAttrs = attrs
300+
.replace(/\s+href\s*=\s*("[^"]*"|'[^']*'|[^\s>]+)/gi, '')
301+
.replace(/\s+rel\s*=\s*("[^"]*"|'[^']*'|[^\s>]+)/gi, '')
302+
.replace(/\s+target\s*=\s*("[^"]*"|'[^']*'|[^\s>]+)/gi, '')
303+
.trim()
304+
305+
return cleanedAttrs ? ` ${cleanedAttrs}` : ''
306+
}
307+
276308
const isNpmJsUrlThatCanBeRedirected = (url: URL) => {
277309
if (!npmJsHosts.has(url.host)) {
278310
return false
@@ -298,8 +330,11 @@ function resolveUrl(url: string, packageName: string, repoInfo?: RepositoryInfo)
298330
if (!url) return url
299331
if (url.startsWith('#')) {
300332
// Prefix anchor links to match heading IDs (avoids collision with page IDs)
301-
return `#user-content-${url.slice(1)}`
333+
// Idempotent: don't double-prefix if already prefixed
334+
return toUserContentHash(url.slice(1))
302335
}
336+
// Absolute paths (e.g. /package/foo from a previous npmjs redirect) are already resolved
337+
if (url.startsWith('/')) return url
303338
if (hasProtocol(url, { acceptRelative: true })) {
304339
try {
305340
const parsed = new URL(url, 'https://example.com')
@@ -388,8 +423,8 @@ function resolveImageUrl(url: string, packageName: string, repoInfo?: Repository
388423

389424
// Helper to prefix id attributes with 'user-content-'
390425
function prefixId(tagName: string, attribs: sanitizeHtml.Attributes) {
391-
if (attribs.id && !attribs.id.startsWith('user-content-')) {
392-
attribs.id = `user-content-${attribs.id}`
426+
if (attribs.id) {
427+
attribs.id = withUserContentPrefix(attribs.id)
393428
}
394429
return { tagName, attribs }
395430
}
@@ -428,35 +463,71 @@ export async function renderReadmeHtml(
428463
// So README starts at h3, and we ensure no levels are skipped
429464
// Visual styling preserved via data-level attribute (original depth)
430465
let lastSemanticLevel = 2 // Start after h2 (the "Readme" section heading)
431-
renderer.heading = function ({ tokens, depth }: Tokens.Heading) {
432-
// Calculate the target semantic level based on document structure
433-
// Start at h3 (since page h1 + section h2 already exist)
434-
// But ensure we never skip levels - can only go down by 1 or stay same/go up
466+
467+
// Shared heading processing for both markdown and HTML headings
468+
function processHeading(
469+
depth: number,
470+
displayHtml: string,
471+
plainText: string,
472+
slugSource: string,
473+
preservedAttrs = '',
474+
) {
435475
const semanticLevel = calculateSemanticDepth(depth, lastSemanticLevel)
436476
lastSemanticLevel = semanticLevel
437-
const text = this.parser.parseInline(tokens)
438477

439-
// Generate GitHub-style slug for anchor links
440-
let slug = slugify(text)
441-
if (!slug) slug = 'heading' // Fallback for empty headings
478+
let slug = slugify(slugSource)
479+
if (!slug) slug = 'heading'
442480

443-
// Handle duplicate slugs (GitHub-style: foo, foo-1, foo-2)
444481
const count = usedSlugs.get(slug) ?? 0
445482
usedSlugs.set(slug, count + 1)
446483
const uniqueSlug = count === 0 ? slug : `${slug}-${count}`
484+
const id = toUserContentId(uniqueSlug)
447485

448-
// Prefix with 'user-content-' to avoid collisions with page IDs
449-
// (e.g., #install, #dependencies, #versions are used by the package page)
450-
const id = `user-content-${uniqueSlug}`
451-
452-
// Collect TOC item with plain text (HTML stripped, entities decoded)
453-
const plainText = decodeHtmlEntities(stripHtmlTags(text).trim())
454486
if (plainText) {
455487
toc.push({ text: plainText, id, depth })
456488
}
457489

458-
/** The link href uses the unique slug WITHOUT the 'user-content-' prefix, because that will later be added for all links. */
459-
return `<h${semanticLevel} id="${id}" data-level="${depth}"><a href="#${uniqueSlug}">${plainText}</a></h${semanticLevel}>\n`
490+
return `<h${semanticLevel} id="${id}" data-level="${depth}"${preservedAttrs}><a href="#${id}">${displayHtml}</a></h${semanticLevel}>\n`
491+
}
492+
493+
renderer.heading = function ({ tokens, depth }: Tokens.Heading) {
494+
const displayHtml = this.parser.parseInline(tokens)
495+
const plainText = getHeadingPlainText(displayHtml)
496+
const slugSource = getHeadingSlugSource(displayHtml)
497+
return processHeading(depth, displayHtml, plainText, slugSource)
498+
}
499+
500+
// Extract and preserve allowed attributes from HTML heading tags
501+
function extractHeadingAttrs(attrsString: string): string {
502+
if (!attrsString) return ''
503+
const preserved: string[] = []
504+
const alignMatch = /\balign=(["']?)([^"'\s>]+)\1/i.exec(attrsString)
505+
if (alignMatch?.[2]) {
506+
preserved.push(`align="${alignMatch[2]}"`)
507+
}
508+
return preserved.length > 0 ? ` ${preserved.join(' ')}` : ''
509+
}
510+
511+
// Intercept HTML headings so they get id, TOC entry, and correct semantic level.
512+
// Also intercept raw HTML <a> tags so playground links are collected in the same pass.
513+
const htmlHeadingRe = /<h([1-6])(\s[^>]*)?>([\s\S]*?)<\/h\1>/gi
514+
const htmlAnchorRe = /<a(\s[^>]*?)href=(["'])([^"']*)\2([^>]*)>([\s\S]*?)<\/a>/gi
515+
renderer.html = function ({ text }: Tokens.HTML) {
516+
let result = text.replace(htmlHeadingRe, (_, level, attrs = '', inner) => {
517+
const depth = parseInt(level)
518+
const plainText = getHeadingPlainText(inner)
519+
const slugSource = getHeadingSlugSource(inner)
520+
const preservedAttrs = extractHeadingAttrs(attrs)
521+
return processHeading(depth, inner, plainText, slugSource, preservedAttrs).trimEnd()
522+
})
523+
// Process raw HTML <a> tags for playground link collection and URL resolution
524+
result = result.replace(htmlAnchorRe, (_full, beforeHref, _quote, href, afterHref, inner) => {
525+
const label = decodeHtmlEntities(stripHtmlTags(inner).trim())
526+
const { resolvedHref, extraAttrs } = processLink(href, label)
527+
const preservedAttrs = normalizePreservedAnchorAttrs(`${beforeHref ?? ''}${afterHref ?? ''}`)
528+
return `<a${preservedAttrs} href="${resolvedHref}"${extraAttrs}>${inner}</a>`
529+
})
530+
return result
460531
}
461532

462533
// Syntax highlighting for code blocks (uses shared highlighter)
@@ -480,7 +551,35 @@ ${html}
480551
return `<img src="${resolvedHref}"${altAttr}${titleAttr}>`
481552
}
482553

554+
// Helper: resolve a link href, collect playground links, and build <a> attributes.
555+
// Used by both the markdown renderer.link and the HTML <a> interceptor so that
556+
// all link processing happens in a single pass during marked rendering.
557+
function processLink(href: string, label: string): { resolvedHref: string; extraAttrs: string } {
558+
const resolvedHref = resolveUrl(href, packageName, repoInfo)
559+
560+
// Collect playground links
561+
const provider = matchPlaygroundProvider(resolvedHref)
562+
if (provider && !seenUrls.has(resolvedHref)) {
563+
seenUrls.add(resolvedHref)
564+
collectedLinks.push({
565+
url: resolvedHref,
566+
provider: provider.id,
567+
providerName: provider.name,
568+
label: decodeHtmlEntities(label || provider.name),
569+
})
570+
}
571+
572+
// Security attributes for external links
573+
let extraAttrs =
574+
resolvedHref && hasProtocol(resolvedHref, { acceptRelative: true })
575+
? ' rel="nofollow noreferrer noopener" target="_blank"'
576+
: ''
577+
578+
return { resolvedHref, extraAttrs }
579+
}
580+
483581
// Resolve link URLs, add security attributes, and collect playground links
582+
// — all in a single pass during marked rendering (no deferred processing)
484583
renderer.link = function ({ href, title, tokens }: Tokens.Link) {
485584
const text = this.parser.parseInline(tokens)
486585
const titleAttr = title ? ` title="${title}"` : ''
@@ -491,10 +590,9 @@ ${html}
491590
plainText = tokens[0].text
492591
}
493592

494-
const intermediateTitleAttr =
495-
plainText || title ? ` data-title-intermediate="${plainText || title}"` : ''
593+
const { resolvedHref, extraAttrs } = processLink(href, plainText || title || '')
496594

497-
return `<a href="${href}"${titleAttr}${intermediateTitleAttr}>${text}</a>`
595+
return `<a href="${resolvedHref}"${titleAttr}${extraAttrs}>${text}</a>`
498596
}
499597

500598
// GitHub-style callouts: > [!NOTE], > [!TIP], etc.
@@ -514,34 +612,44 @@ ${html}
514612

515613
marked.setOptions({ renderer })
516614

517-
const rawHtml = marked.parse(content) as string
615+
// Strip trailing whitespace (tabs/spaces) from code block closing fences.
616+
// While marky-markdown handles these gracefully, marked fails to recognize
617+
// the end of a code block if the closing fences are followed by unexpected whitespaces.
618+
const normalizedContent = content.replace(/^( {0,3}(?:`{3,}|~{3,}))\s*$/gm, '$1')
619+
const rawHtml = marked.parse(normalizedContent) as string
518620

519621
const sanitized = sanitizeHtml(rawHtml, {
520622
allowedTags: ALLOWED_TAGS,
521623
allowedAttributes: ALLOWED_ATTR,
522624
allowedSchemes: ['http', 'https', 'mailto'],
523625
// Transform img src URLs (GitHub blob → raw, relative → GitHub raw)
524626
transformTags: {
627+
// Headings are already processed to correct semantic levels by processHeading()
628+
// during the marked rendering pass. The sanitizer just needs to preserve them.
629+
// For any stray headings that didn't go through processHeading (shouldn't happen),
630+
// we still apply a safe fallback shift.
525631
h1: (_, attribs) => {
632+
if (attribs['data-level']) return { tagName: 'h1', attribs }
526633
return { tagName: 'h3', attribs: { ...attribs, 'data-level': '1' } }
527634
},
528635
h2: (_, attribs) => {
636+
if (attribs['data-level']) return { tagName: 'h2', attribs }
529637
return { tagName: 'h4', attribs: { ...attribs, 'data-level': '2' } }
530638
},
531639
h3: (_, attribs) => {
532-
if (attribs['data-level']) return { tagName: 'h3', attribs: attribs }
640+
if (attribs['data-level']) return { tagName: 'h3', attribs }
533641
return { tagName: 'h5', attribs: { ...attribs, 'data-level': '3' } }
534642
},
535643
h4: (_, attribs) => {
536-
if (attribs['data-level']) return { tagName: 'h4', attribs: attribs }
644+
if (attribs['data-level']) return { tagName: 'h4', attribs }
537645
return { tagName: 'h6', attribs: { ...attribs, 'data-level': '4' } }
538646
},
539647
h5: (_, attribs) => {
540-
if (attribs['data-level']) return { tagName: 'h5', attribs: attribs }
648+
if (attribs['data-level']) return { tagName: 'h5', attribs }
541649
return { tagName: 'h6', attribs: { ...attribs, 'data-level': '5' } }
542650
},
543651
h6: (_, attribs) => {
544-
if (attribs['data-level']) return { tagName: 'h6', attribs: attribs }
652+
if (attribs['data-level']) return { tagName: 'h6', attribs }
545653
return { tagName: 'h6', attribs: { ...attribs, 'data-level': '6' } }
546654
},
547655
img: (tagName, attribs) => {
@@ -569,31 +677,34 @@ ${html}
569677
}
570678
return { tagName, attribs }
571679
},
680+
// Markdown links are fully processed in renderer.link (single-pass).
681+
// However, inline HTML <a> tags inside paragraphs are NOT seen by
682+
// renderer.html (marked parses them as paragraph tokens, not html tokens).
683+
// So we still need to collect playground links here for those cases.
684+
// The seenUrls set ensures no duplicates across both paths.
572685
a: (tagName, attribs) => {
573686
if (!attribs.href) {
574687
return { tagName, attribs }
575688
}
576689

577690
const resolvedHref = resolveUrl(attribs.href, packageName, repoInfo)
578691

692+
// Collect playground links from inline HTML <a> tags that weren't
693+
// caught by renderer.link or renderer.html
579694
const provider = matchPlaygroundProvider(resolvedHref)
580695
if (provider && !seenUrls.has(resolvedHref)) {
581696
seenUrls.add(resolvedHref)
582-
583697
collectedLinks.push({
584698
url: resolvedHref,
585699
provider: provider.id,
586700
providerName: provider.name,
587-
/**
588-
* We need to set some data attribute before hand because `transformTags` doesn't
589-
* provide the text of the element. This will automatically be removed, because there
590-
* is an allow list for link attributes.
591-
* */
592-
label: decodeHtmlEntities(attribs['data-title-intermediate'] || provider.name),
701+
// sanitize-html transformTags doesn't provide element text content,
702+
// so we fall back to the provider name for the label
703+
label: provider.name,
593704
})
594705
}
595706

596-
// Add security attributes for external links
707+
// Add security attributes for external links (idempotent)
597708
if (resolvedHref && hasProtocol(resolvedHref, { acceptRelative: true })) {
598709
attribs.rel = 'nofollow noreferrer noopener'
599710
attribs.target = '_blank'

0 commit comments

Comments
 (0)