From 775741a3f8588cce6cb9cd35826ea968a6f898d9 Mon Sep 17 00:00:00 2001 From: Naman Jain Date: Tue, 19 May 2026 08:19:26 +0000 Subject: [PATCH 1/2] Drivers: hv: mshv_vtl: use folio-aware inserters for huge VTL0 mappings Since v6.15 (aed877c2b425, d3f7922b929a), GUP no longer takes a pgmap reference for ZONE_DEVICE pages and walks huge entries through the unified folio path. With vmf_insert_pfn_{pmd,pud}() the mapping holds no folio reference, so a zap racing with pin_user_pages_fast() can briefly drop the folio refcount to 0 and trigger a WARN in try_grab_folio() with the I/O failing as -ENOMEM. Switch the PMD/PUD fault paths to vmf_insert_folio_{pmd,pud}(), mirroring drivers/dax/device.c. Each map takes folio_get(); the matching folio_put() in zap keeps the refcount above 0. Gate the huge inserters on pfn_valid() + ZONE_DEVICE + MEMORY_DEVICE_GENERIC via mshv_vtl_low_resolve_page(); fall back to VM_FAULT_FALLBACK when the folio order does not match PMD_ORDER/PUD_ORDER or the PFN is not yet pgmap-backed, so the core can retry at smaller order. Add VM_DONTEXPAND to the VMA to block mremap() growth past the pgmap. Signed-off-by: Naman Jain --- drivers/hv/mshv_vtl_main.c | 122 ++++++++++++++++++++++++++++++++++--- 1 file changed, 113 insertions(+), 9 deletions(-) diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c index 21c04a24acec..c7e30c268c11 100644 --- a/drivers/hv/mshv_vtl_main.c +++ b/drivers/hv/mshv_vtl_main.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -1154,6 +1155,9 @@ static int vtl_set_vp_register(struct hv_register_assoc *reg) #define DECRYPTED_MASK (1ul << 51) +/* Identity token tagged on every mshv_vtl pgmap; only its address matters. 
*/ +static const u8 mshv_vtl_pgmap_token; + static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void __user *arg) { struct mshv_vtl_ram_disposition vtl0_mem; @@ -1182,6 +1186,7 @@ static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void __user *arg) pgmap->ranges[0].end = PFN_PHYS(vtl0_mem.last_pfn) - 1; pgmap->nr_range = 1; pgmap->type = MEMORY_DEVICE_GENERIC; + pgmap->owner = (void *)&mshv_vtl_pgmap_token; if (decrypted) pgmap->flags = PGMAP_DECRYPTED; @@ -3640,6 +3645,18 @@ static struct miscdevice mshv_vtl_hvcall_dev = { .minor = MISC_DYNAMIC_MINOR, }; +/* + * Mirror drivers/dax/device.c: once the fault path publishes folio->mapping + * to this inode's address_space, writeback-side helpers (e.g. + * folio_mark_dirty() called from bio_set_pages_dirty() after direct I/O into + * a GUP'd VTL0 buffer) will dispatch through mapping->a_ops->dirty_folio. + * The default empty_aops leaves dirty_folio NULL, so install noop_dirty_folio + * to keep that dispatch safe; nothing here participates in real writeback. + */ +static const struct address_space_operations mshv_vtl_low_aops = { + .dirty_folio = noop_dirty_folio, +}; + static int mshv_vtl_low_open(struct inode *inodep, struct file *filp) { pid_t pid = task_pid_vnr(current); @@ -3650,6 +3667,7 @@ static int mshv_vtl_low_open(struct inode *inodep, struct file *filp) if (capable(CAP_SYS_ADMIN)) { filp->private_data = inodep; + inodep->i_mapping->a_ops = &mshv_vtl_low_aops; } else { pr_err("%s: VTL low open failed: CAP_SYS_ADMIN required. task group %d, uid %d", __func__, pid, uid); @@ -3678,26 +3696,102 @@ static bool can_fault(struct vm_fault *vmf, unsigned long size, unsigned long *p return is_valid; } +/* + * Resolve a user-supplied PFN to a page owned by an mshv_vtl pgmap, or NULL. 
+ * Look up the pgmap via get_dev_pagemap() rather than page_pgmap(): the pgmap + * is published in pgmap_array before per-page state is initialized, so a + * concurrent MSHV_ADD_VTL0_MEMORY can leave folio->pgmap unset while pfn_valid + * and is_zone_device_page already return true. The owner check additionally + * rejects foreign MEMORY_DEVICE_GENERIC pgmaps (e.g. DAX). + */ +static struct page *mshv_vtl_low_resolve_page(unsigned long pfn) +{ + struct dev_pagemap *pgmap; + struct page *page; + + pgmap = get_dev_pagemap(pfn); + if (!pgmap) + return NULL; + page = NULL; + if (pgmap->type == MEMORY_DEVICE_GENERIC && + pgmap->owner == &mshv_vtl_pgmap_token) + page = pfn_to_page(pfn); + /* Safe to drop here: mshv_vtl pgmaps are never released for the life of the module. */ + put_dev_pagemap(pgmap); + return page; +} + +/* + * Mirror dax_set_mapping(): rmap walkers locate a file-rmapped folio via + * folio->mapping/index. ZONE_DEVICE init only fills ->pgmap, so set the + * file-mapping fields here before each insert that adds file rmap. + * Idempotent: only the head folio carries mapping/index, and once set the + * fields persist for the lifetime of the (never-released) pgmap. + */ +static void mshv_vtl_low_set_mapping(struct vm_fault *vmf, struct folio *folio, + unsigned long fault_size) +{ + if (folio->mapping) + return; + + folio->mapping = vmf->vma->vm_file->f_mapping; + folio->index = linear_page_index(vmf->vma, + ALIGN_DOWN(vmf->address, fault_size)); +} + +/* + * Note on rmap/RSS accounting for huge VTL0 mappings: + * vmf_insert_folio_{pmd,pud}() takes a folio reference, adds a file rmap, + * and bumps mm RSS, but the matching teardown is skipped at zap/split time + * because vma_is_special_huge() is true (VM_MIXEDMAP) while vma_is_dax() is + * false (CONFIG_FS_DAX is not set in OHCL). 
The drift is theoretical for + * OpenVMM/OpenHCL: VTL0 memory is mapped once per partition and held for + * its lifetime - there is no map/unmap cycling, no partial munmap, and the + * driver is not unloaded. Stale refs land on ZONE_DEVICE folios whose + * pgmap is intentionally never released, no real bytes are leaked, and the + * mm's inflated RSS is discarded with the mm at process exit. + */ static vm_fault_t mshv_vtl_low_huge_fault(struct vm_fault *vmf, unsigned int order) { unsigned long pfn = vmf->pgoff & ~DECRYPTED_MASK; - vm_fault_t ret = VM_FAULT_FALLBACK; + bool write = vmf->flags & FAULT_FLAG_WRITE; + struct page *page; + struct folio *folio; switch (order) { case 0: - /* __pfn_to_pfn_t ? */ + /* pte_special path; GUP bails before try_grab_folio() so the WARN cannot fire here. */ return vmf_insert_mixed(vmf->vma, vmf->address, pfn); case PMD_ORDER: - if (can_fault(vmf, PMD_SIZE, &pfn)) - ret = vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); - return ret; + if (!can_fault(vmf, PMD_SIZE, &pfn)) + return VM_FAULT_FALLBACK; + page = mshv_vtl_low_resolve_page(pfn); + if (!page) + return VM_FAULT_FALLBACK; + folio = page_folio(page); + /* + * vmf_insert_folio_pmd() needs an exact-order folio; let core + * retry smaller on mismatch. + */ + if (folio_order(folio) != PMD_ORDER) + return VM_FAULT_FALLBACK; + mshv_vtl_low_set_mapping(vmf, folio, PMD_SIZE); + return vmf_insert_folio_pmd(vmf, folio, write); #if defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) case PUD_ORDER: - if (can_fault(vmf, PUD_SIZE, &pfn)) - ret = vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); - return ret; + if (!can_fault(vmf, PUD_SIZE, &pfn)) + return VM_FAULT_FALLBACK; + page = mshv_vtl_low_resolve_page(pfn); + if (!page) + return VM_FAULT_FALLBACK; + folio = page_folio(page); + /* Same exact-order requirement as the PMD case above. 
*/ + if (folio_order(folio) != PUD_ORDER) + return VM_FAULT_FALLBACK; + mshv_vtl_low_set_mapping(vmf, folio, PUD_SIZE); + return vmf_insert_folio_pud(vmf, folio, write); #endif default: @@ -3717,8 +3811,18 @@ static const struct vm_operations_struct mshv_vtl_low_vm_ops = { static int mshv_vtl_low_mmap(struct file *filp, struct vm_area_struct *vma) { + /* + * Reject MAP_PRIVATE: the fault path installs PTEs via + * vmf_insert_{page,folio}_{,pmd,pud}() and bypasses core-mm COW, so + * MAP_PRIVATE writes would land on the underlying VTL0/device page + * instead of a private copy. Mirror device-dax (drivers/dax/device.c). + */ + if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) + return -EINVAL; + vma->vm_ops = &mshv_vtl_low_vm_ops; - vm_flags_set(vma, VM_HUGEPAGE | VM_MIXEDMAP); + /* VM_MIXEDMAP for the 4K pte_special path; VM_DONTEXPAND pins size to the pgmap. */ + vm_flags_set(vma, VM_HUGEPAGE | VM_MIXEDMAP | VM_DONTEXPAND); if (vma->vm_pgoff & DECRYPTED_MASK) vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); From c79bbfd2a889304a133be002ac0aa6ebf88c988a Mon Sep 17 00:00:00 2001 From: Naman Jain Date: Wed, 3 Jun 2026 05:24:26 +0000 Subject: [PATCH 2/2] Drivers: hv: mshv_vtl: fix GUP into VTL0 mappings on the 4K fault path Extend the folio-aware fault path to the 4K case so GUP into /dev/mshv_vtl_low works after MSHV_ADD_VTL0_MEMORY has registered the range. With the previous vmf_insert_mixed() path the PTE was always pte_special, vm_normal_page() returned NULL during pin_user_pages*(), follow_pfn_pte() returned -EEXIST, and io_uring O_DIRECT surfaced it as "disk io error: io error: File exists (os error 17)" on the first DMA into a freshly-registered VTL0 chunk. The 4K path now resolves the PFN via mshv_vtl_low_resolve_page(): when backed by an mshv_vtl pgmap the PTE is installed with vmf_insert_page_mkwrite(), giving GUP a normal pinnable page; otherwise it falls back to vmf_insert_mixed() so early CPU accesses (e.g. 
the VTL2 guest-memory self test reading GPA 0 before any add_vtl0_mem ioctl) still succeed instead of SIGBUSing. Such fallback PTEs would persist across registration and break later GUP. Capture the cdev's address_space on first open and, on successful MSHV_ADD_VTL0_MEMORY, invalidate the file-offset range via unmap_mapping_pages() for both the encrypted (pfn) and decrypted (pfn | DECRYPTED_MASK) aliases that mshv_vtl_low_mmap() exposes. The next access re-faults into the folio path and GUP works. Signed-off-by: Naman Jain --- drivers/hv/mshv_vtl_main.c | 40 +++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c index c7e30c268c11..6fc958a6365f 100644 --- a/drivers/hv/mshv_vtl_main.c +++ b/drivers/hv/mshv_vtl_main.c @@ -1155,6 +1155,12 @@ static int vtl_set_vp_register(struct hv_register_assoc *reg) #define DECRYPTED_MASK (1ul << 51) +/* + * /dev/mshv_vtl_low address_space, captured on first open. + * Used by add_vtl0_mem() to zap stale 4K PTEs. + */ +static struct address_space *mshv_vtl_low_mapping; + /* Identity token tagged on every mshv_vtl pgmap; only its address matters. */ static const u8 mshv_vtl_pgmap_token; @@ -1210,6 +1216,20 @@ static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void __user *arg) return PTR_ERR(addr); } + /* + * Zap stale pte_special PTEs the 4K fallback installed before this + * range had a pgmap, so the next access re-faults into the folio path. + * Both encrypted (pfn) and decrypted (pfn | DECRYPTED_MASK) aliases. 
+ */ + if (READ_ONCE(mshv_vtl_low_mapping)) { + pgoff_t start = vtl0_mem.start_pfn; + pgoff_t nr = vtl0_mem.last_pfn - vtl0_mem.start_pfn; + + unmap_mapping_pages(mshv_vtl_low_mapping, start, nr, true); + unmap_mapping_pages(mshv_vtl_low_mapping, + start | DECRYPTED_MASK, nr, true); + } + /* Don't free pgmap, since it has to stick around until the memory * is unmapped, which will never happen as there is no scenario * where VTL0 can be released/shutdown without bringing down VTL2. @@ -3667,6 +3687,9 @@ static int mshv_vtl_low_open(struct inode *inodep, struct file *filp) if (capable(CAP_SYS_ADMIN)) { filp->private_data = inodep; + /* All opens share one inode; first one publishes the address_space. */ + if (!READ_ONCE(mshv_vtl_low_mapping)) + cmpxchg(&mshv_vtl_low_mapping, NULL, inodep->i_mapping); inodep->i_mapping->a_ops = &mshv_vtl_low_aops; } else { pr_err("%s: VTL low open failed: CAP_SYS_ADMIN required. task group %d, uid %d", @@ -3760,8 +3783,19 @@ static vm_fault_t mshv_vtl_low_huge_fault(struct vm_fault *vmf, unsigned int ord switch (order) { case 0: - /* pte_special path; GUP bails before try_grab_folio() so the WARN cannot fire here. */ - return vmf_insert_mixed(vmf->vma, vmf->address, pfn); + page = mshv_vtl_low_resolve_page(pfn); + if (!page) { + /* + * No pgmap yet: install pte_special so CPU access succeeds. + * The unmap_mapping_pages() in add_vtl0_mem() invalidates this + * PTE on registration so a later GUP-bound access re-faults + * into the folio path below. + */ + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); + } + /* Inserter operates on the compound-head folio per PTE; refcounts stay balanced. 
*/ + mshv_vtl_low_set_mapping(vmf, page_folio(page), PAGE_SIZE); + return vmf_insert_page_mkwrite(vmf, page, write); case PMD_ORDER: if (!can_fault(vmf, PMD_SIZE, &pfn)) @@ -3821,7 +3855,7 @@ static int mshv_vtl_low_mmap(struct file *filp, struct vm_area_struct *vma) return -EINVAL; vma->vm_ops = &mshv_vtl_low_vm_ops; - /* VM_MIXEDMAP for the 4K pte_special path; VM_DONTEXPAND pins size to the pgmap. */ + /* VM_MIXEDMAP for pte_special 4K fallback; VM_DONTEXPAND pins size to pgmap. */ vm_flags_set(vma, VM_HUGEPAGE | VM_MIXEDMAP | VM_DONTEXPAND); if (vma->vm_pgoff & DECRYPTED_MASK)