Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 148 additions & 10 deletions drivers/hv/mshv_vtl_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/user-return-notifier.h>
#include <linux/vmalloc.h>
#include <asm/boot.h>
Expand Down Expand Up @@ -1154,6 +1155,15 @@ static int vtl_set_vp_register(struct hv_register_assoc *reg)

/*
 * Bit of the mmap page offset that selects the decrypted alias of a PFN.
 * The fault path masks it off to recover the PFN, and mmap switches the
 * VMA to a decrypted page protection when it is set in vm_pgoff.
 */
#define DECRYPTED_MASK (1ul << 51)

/*
 * /dev/mshv_vtl_low address_space, captured on first open.
 * Used by add_vtl0_mem() to zap stale 4K PTEs.
 * Stays NULL until mshv_vtl_low_open() publishes it with cmpxchg();
 * readers use READ_ONCE() and must tolerate NULL.
 */
static struct address_space *mshv_vtl_low_mapping;

/*
 * Identity token tagged on every mshv_vtl pgmap; only its address matters.
 * It is stored in pgmap->owner and compared in mshv_vtl_low_resolve_page().
 */
static const u8 mshv_vtl_pgmap_token;

static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void __user *arg)
{
struct mshv_vtl_ram_disposition vtl0_mem;
Expand Down Expand Up @@ -1182,6 +1192,7 @@ static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void __user *arg)
pgmap->ranges[0].end = PFN_PHYS(vtl0_mem.last_pfn) - 1;
pgmap->nr_range = 1;
pgmap->type = MEMORY_DEVICE_GENERIC;
pgmap->owner = (void *)&mshv_vtl_pgmap_token;
if (decrypted)
pgmap->flags = PGMAP_DECRYPTED;

Expand All @@ -1205,6 +1216,20 @@ static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void __user *arg)
return PTR_ERR(addr);
}

/*
* Zap stale pte_special PTEs the 4K fallback installed before this
* range had a pgmap, so the next access re-faults into the folio path.
* Both encrypted (pfn) and decrypted (pfn | DECRYPTED_MASK) aliases.
*/
if (READ_ONCE(mshv_vtl_low_mapping)) {
pgoff_t start = vtl0_mem.start_pfn;
pgoff_t nr = vtl0_mem.last_pfn - vtl0_mem.start_pfn;

unmap_mapping_pages(mshv_vtl_low_mapping, start, nr, true);
unmap_mapping_pages(mshv_vtl_low_mapping,
start | DECRYPTED_MASK, nr, true);
}

/* Don't free pgmap, since it has to stick around until the memory
* is unmapped, which will never happen as there is no scenario
* where VTL0 can be released/shutdown without bringing down VTL2.
Expand Down Expand Up @@ -3640,6 +3665,18 @@ static struct miscdevice mshv_vtl_hvcall_dev = {
.minor = MISC_DYNAMIC_MINOR,
};

/*
 * Address-space operations for the /dev/mshv_vtl_low inode.
 *
 * Mirror drivers/dax/device.c: once the fault path publishes folio->mapping
 * to this inode's address_space, writeback-side helpers (e.g.
 * folio_mark_dirty() called from bio_set_pages_dirty() after direct I/O into
 * a GUP'd VTL0 buffer) will dispatch through mapping->a_ops->dirty_folio.
 * The default empty_aops leaves dirty_folio NULL, so install noop_dirty_folio
 * to keep that dispatch safe; nothing here participates in real writeback.
 *
 * Installed on the inode's i_mapping by mshv_vtl_low_open().
 */
static const struct address_space_operations mshv_vtl_low_aops = {
.dirty_folio = noop_dirty_folio,
};

static int mshv_vtl_low_open(struct inode *inodep, struct file *filp)
{
pid_t pid = task_pid_vnr(current);
Expand All @@ -3650,6 +3687,10 @@ static int mshv_vtl_low_open(struct inode *inodep, struct file *filp)

if (capable(CAP_SYS_ADMIN)) {
filp->private_data = inodep;
/* All opens share one inode; first one publishes the address_space. */
if (!READ_ONCE(mshv_vtl_low_mapping))
cmpxchg(&mshv_vtl_low_mapping, NULL, inodep->i_mapping);
inodep->i_mapping->a_ops = &mshv_vtl_low_aops;
} else {
pr_err("%s: VTL low open failed: CAP_SYS_ADMIN required. task group %d, uid %d",
__func__, pid, uid);
Expand Down Expand Up @@ -3678,26 +3719,113 @@ static bool can_fault(struct vm_fault *vmf, unsigned long size, unsigned long *p
return is_valid;
}

/*
* Resolve a user-supplied PFN to a page owned by an mshv_vtl pgmap, or NULL.
* Look up the pgmap via get_dev_pagemap() rather than page_pgmap(): the pgmap
* is published in pgmap_array before per-page state is initialized, so a
* concurrent MSHV_ADD_VTL0_MEMORY can leave folio->pgmap unset while pfn_valid
* and is_zone_device_page already return true. The owner check additionally
* rejects foreign MEMORY_DEVICE_GENERIC pgmaps (e.g. DAX).
*/
static struct page *mshv_vtl_low_resolve_page(unsigned long pfn)
{
struct dev_pagemap *pgmap;
struct page *page;

pgmap = get_dev_pagemap(pfn);
if (!pgmap)
return NULL;
page = NULL;
if (pgmap->type == MEMORY_DEVICE_GENERIC &&
pgmap->owner == &mshv_vtl_pgmap_token)
page = pfn_to_page(pfn);
/* Safe to drop here: mshv_vtl pgmaps are never released for the life of the module. */
put_dev_pagemap(pgmap);
return page;
}
Comment thread
namancse marked this conversation as resolved.

/*
* Mirror dax_set_mapping(): rmap walkers locate a file-rmapped folio via
* folio->mapping/index. ZONE_DEVICE init only fills ->pgmap, so set the
* file-mapping fields here before each insert that adds file rmap.
* Idempotent: only the head folio carries mapping/index, and once set the
* fields persist for the lifetime of the (never-released) pgmap.
*/
static void mshv_vtl_low_set_mapping(struct vm_fault *vmf, struct folio *folio,
unsigned long fault_size)
{
if (folio->mapping)
return;

folio->mapping = vmf->vma->vm_file->f_mapping;
folio->index = linear_page_index(vmf->vma,
ALIGN_DOWN(vmf->address, fault_size));
}

/*
* Note on rmap/RSS accounting for huge VTL0 mappings:
* vmf_insert_folio_{pmd,pud}() takes a folio reference, adds a file rmap,
* and bumps mm RSS, but the matching teardown is skipped at zap/split time
* because vma_is_special_huge() is true (VM_MIXEDMAP) while vma_is_dax() is
* false (CONFIG_FS_DAX is not set in OHCL). The drift is theoretical for
* OpenVMM/OpenHCL: VTL0 memory is mapped once per partition and held for
* its lifetime - there is no map/unmap cycling, no partial munmap, and the
* driver is not unloaded. Stale refs land on ZONE_DEVICE folios whose
* pgmap is intentionally never released, no real bytes are leaked, and the
* mm's inflated RSS is discarded with the mm at process exit.
*/
static vm_fault_t mshv_vtl_low_huge_fault(struct vm_fault *vmf, unsigned int order)
{
unsigned long pfn = vmf->pgoff & ~DECRYPTED_MASK;
vm_fault_t ret = VM_FAULT_FALLBACK;
bool write = vmf->flags & FAULT_FLAG_WRITE;
struct page *page;
struct folio *folio;

switch (order) {
case 0:
/* __pfn_to_pfn_t ? */
return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
page = mshv_vtl_low_resolve_page(pfn);
if (!page) {
/*
* No pgmap yet: install pte_special so CPU access succeeds.
* The unmap_mapping_pages() calls in add_vtl0_mem() invalidate this
* PTE on registration so a later GUP-bound access re-faults
* into the folio path below.
*/
return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
}
/* Inserter operates on the compound-head folio per PTE; refcounts stay balanced. */
mshv_vtl_low_set_mapping(vmf, page_folio(page), PAGE_SIZE);
return vmf_insert_page_mkwrite(vmf, page, write);

case PMD_ORDER:
if (can_fault(vmf, PMD_SIZE, &pfn))
ret = vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
return ret;
if (!can_fault(vmf, PMD_SIZE, &pfn))
return VM_FAULT_FALLBACK;
page = mshv_vtl_low_resolve_page(pfn);
if (!page)
return VM_FAULT_FALLBACK;
folio = page_folio(page);
/*
* vmf_insert_folio_pmd() needs an exact-order folio; let core
* retry smaller on mismatch.
*/
if (folio_order(folio) != PMD_ORDER)
return VM_FAULT_FALLBACK;
mshv_vtl_low_set_mapping(vmf, folio, PMD_SIZE);
return vmf_insert_folio_pmd(vmf, folio, write);

#if defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
case PUD_ORDER:
if (can_fault(vmf, PUD_SIZE, &pfn))
ret = vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
return ret;
if (!can_fault(vmf, PUD_SIZE, &pfn))
return VM_FAULT_FALLBACK;
page = mshv_vtl_low_resolve_page(pfn);
if (!page)
return VM_FAULT_FALLBACK;
folio = page_folio(page);
/* Same exact-order requirement as the PMD case above. */
if (folio_order(folio) != PUD_ORDER)
return VM_FAULT_FALLBACK;
mshv_vtl_low_set_mapping(vmf, folio, PUD_SIZE);
return vmf_insert_folio_pud(vmf, folio, write);
#endif

default:
Expand All @@ -3717,8 +3845,18 @@ static const struct vm_operations_struct mshv_vtl_low_vm_ops = {

static int mshv_vtl_low_mmap(struct file *filp, struct vm_area_struct *vma)
{
/*
* Reject MAP_PRIVATE: the fault path installs PTEs via
* vmf_insert_{page,folio}_{,pmd,pud}() and bypasses core-mm COW, so
* MAP_PRIVATE writes would land on the underlying VTL0/device page
* instead of a private copy. Mirror device-dax (drivers/dax/device.c).
*/
if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE)
return -EINVAL;

vma->vm_ops = &mshv_vtl_low_vm_ops;
vm_flags_set(vma, VM_HUGEPAGE | VM_MIXEDMAP);
/* VM_MIXEDMAP for pte_special 4K fallback; VM_DONTEXPAND pins size to pgmap. */
vm_flags_set(vma, VM_HUGEPAGE | VM_MIXEDMAP | VM_DONTEXPAND);

if (vma->vm_pgoff & DECRYPTED_MASK)
vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
Expand Down
Loading