From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
KVM can use memfd-provided memory for guest memory. For normal userspace
accessible memory, KVM userspace (e.g. QEMU) mmaps the memfd into its
virtual address space and then tells KVM to use the virtual address to
set up the mapping in the secondary page table (e.g. EPT).
With confidential computing technologies like Intel TDX, the
memfd-provided memory may be encrypted with a special key for a special
software domain (e.g. a KVM guest) and is not expected to be directly
accessed by userspace. More precisely, userspace access to such encrypted
memory may lead to a host crash, so it should be prevented.
The userspace inaccessible memfd provides the semantics required for KVM
guest private (encrypted) memory support: a file descriptor with the new
memfd flag set is going to be used as the source of guest memory in
confidential computing environments such as Intel TDX/AMD SEV.
KVM userspace is still in charge of the lifecycle of the memfd. It
should pass the opened fd to KVM. KVM uses the kernel APIs newly added
in this patch to obtain the physical memory address and then populate
the secondary page table entries.
The userspace inaccessible memfd can be fallocate-ed and hole-punched
from userspace. When hole-punching happens, KVM can get notified through
inaccessible_notifier; it then gets a chance to remove any mapped entries
of the range in the secondary page tables.
The userspace inaccessible memfd itself is implemented as a shim layer
on top of real memory file systems like tmpfs/hugetlbfs, but this patch
only implements tmpfs. The allocated memory is currently marked as
unmovable and unevictable; this is required for current confidential
usage, but it might be changed in the future.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
include/linux/memfd.h | 24 ++++
include/uapi/linux/magic.h | 1 +
include/uapi/linux/memfd.h | 1 +
mm/Makefile | 2 +-
mm/memfd.c | 25 ++++-
mm/memfd_inaccessible.c | 219 +++++++++++++++++++++++++++++++++++++
6 files changed, 270 insertions(+), 2 deletions(-)
create mode 100644 mm/memfd_inaccessible.c
diff --git a/include/linux/memfd.h b/include/linux/memfd.h
index 4f1600413f91..334ddff08377 100644
--- a/include/linux/memfd.h
+++ b/include/linux/memfd.h
@@ -3,6 +3,7 @@
#define __LINUX_MEMFD_H
#include <linux/file.h>
+#include <linux/pfn_t.h>
#ifdef CONFIG_MEMFD_CREATE
extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long
arg);
@@ -13,4 +14,27 @@ static inline long memfd_fcntl(struct file *f, unsigned int
c, unsigned long a)
}
#endif
+struct inaccessible_notifier;
+
+/*
+ * Callback table attached to an inaccessible_notifier.
+ *
+ * @invalidate: receives the notifier and a page-offset range (@start, @end).
+ *   Per the patch description, this is how a user such as KVM learns that
+ *   a range was hole-punched so it can remove its mappings of that range.
+ *   NOTE(review): whether @end is inclusive is not visible here -- confirm.
+ */
+struct inaccessible_notifier_ops {
+ void (*invalidate)(struct inaccessible_notifier *notifier,
+ pgoff_t start, pgoff_t end);
+};
+
+/*
+ * Notifier object handed to inaccessible_register_notifier() and
+ * inaccessible_unregister_notifier() together with the file it applies to.
+ */
+struct inaccessible_notifier {
+ struct list_head list; /* list linkage; presumably a per-file notifier list -- confirm */
+ const struct inaccessible_notifier_ops *ops; /* callbacks for this notifier */
+};
+
+void inaccessible_register_notifier(struct file *file,
+ struct inaccessible_notifier *notifier);
+void inaccessible_unregister_notifier(struct file *file,
+ struct inaccessible_notifier *notifier);
+
+int inaccessible_get_pfn(struct file *file, pgoff_t offset, pfn_t *pfn,
+ int *order);
+void inaccessible_put_pfn(struct file *file, pfn_t pfn);
+
+struct file *memfd_mkinaccessible(struct file *memfd);
+
#endif /* __LINUX_MEMFD_H */
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 6325d1d0e90f..9d066be3d7e8 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -101,5 +101,6 @@
#define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */
#define DEVMEM_MAGIC 0x454d444d /* "DMEM" */
#define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
+#define INACCESSIBLE_MAGIC 0x494e4143 /* "INAC" */
+
+/*
+ * inaccessible_get_pfn - report the pfn backing a page offset of @file.
+ * @file:   file whose f_mapping->private_data is the inaccessible_data
+ * @offset: page offset handed to shmem_getpage()
+ * @pfn:    out: page_to_pfn_t() of the page that was found
+ * @order:  out: thp_order() of that page's compound head
+ *
+ * The page is looked up on the inode of the backing memfd with
+ * shmem_getpage(SGP_WRITE). On success it is marked uptodate and unlocked
+ * before returning. The outputs are written only on success.
+ *
+ * No put_page() is done here; presumably the caller releases the page
+ * with inaccessible_put_pfn() -- confirm.
+ *
+ * Return: 0 on success, otherwise the error from shmem_getpage().
+ */
+int inaccessible_get_pfn(struct file *file, pgoff_t offset, pfn_t *pfn,
+			 int *order)
+{
+	struct inaccessible_data *data = file->f_mapping->private_data;
+	struct inode *backing_inode = file_inode(data->memfd);
+	struct page *page;
+	int err = shmem_getpage(backing_inode, offset, &page, SGP_WRITE);
+
+	if (err)
+		return err;
+
+	*order = thp_order(compound_head(page));
+	*pfn = page_to_pfn_t(page);
+
+	/* Mark the page uptodate before dropping the page lock. */
+	SetPageUptodate(page);
+	unlock_page(page);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(inaccessible_get_pfn);
+
+/*
+ * inaccessible_put_pfn - drop a page reference identified by @pfn.
+ * @file: not used by this function
+ * @pfn:  pfn converted back to a page with pfn_t_to_page()
+ *
+ * Warns once and does nothing if @pfn does not convert to a page;
+ * otherwise calls put_page() on it.
+ */
+void inaccessible_put_pfn(struct file *file, pfn_t pfn)
+{
+	struct page *page = pfn_t_to_page(pfn);
+
+	/* WARN_ON_ONCE() evaluates to its condition, so it doubles as the guard. */
+	if (!WARN_ON_ONCE(!page))
+		put_page(page);
+}
+EXPORT_SYMBOL_GPL(inaccessible_put_pfn);