From: Jike Song
Subject: Re: [Qemu-devel] [PATCH v7 3/4] vfio iommu: Add support for mediated devices
Date: Thu, 29 Sep 2016 10:17:23 +0800
User-agent: Mozilla/5.0 (X11; Linux i686 on x86_64; rv:17.0) Gecko/20130801 Thunderbird/17.0.8

+Guangrong

On 08/25/2016 11:53 AM, Kirti Wankhede wrote:
> VFIO IOMMU drivers are designed for devices which are IOMMU capable.
> A mediated device only uses IOMMU APIs; the underlying hardware can be
> managed by an IOMMU domain.
> 
> Aim of this change is:
> - To use most of the code of TYPE1 IOMMU driver for mediated devices
> - To support direct assigned device and mediated device in single module
> 
> Added two new callback functions to struct vfio_iommu_driver_ops. A backend
> IOMMU module that supports pinning and unpinning pages for mdev devices
> should provide these functions.
> Added APIs for pinning and unpinning pages to the VFIO module. These call
> back into the backend iommu module to actually pin and unpin pages.
> 
> This change adds pin and unpin support for mediated devices to the TYPE1
> IOMMU backend module. More details:
> - When the iommu_group of a mediated device is attached, the task structure
>   is cached and used later for pinning pages and page accounting.
> - It keeps track of pinned pages for the mediated domain. This data is used
>   to verify unpinning requests and to unpin any remaining pages while
>   detaching.
> - Uses the existing mechanism for page accounting. If an iommu capable
>   domain exists in the container then all pages are already pinned and
>   accounted. Accounting for mdev devices is only done if there is no iommu
>   capable domain in the container.
> 
> Tested by assigning below combinations of devices to a single VM:
> - GPU pass through only
> - vGPU device only
> - One GPU pass through and one vGPU device
> - two GPU pass through
> 
> Signed-off-by: Kirti Wankhede <address@hidden>
> Signed-off-by: Neo Jia <address@hidden>
> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
> Reviewed-on: http://git-master/r/1175707
> Reviewed-by: Automatic_Commit_Validation_User
> ---
>  drivers/vfio/vfio.c             | 117 ++++++++++
>  drivers/vfio/vfio_iommu_type1.c | 498 ++++++++++++++++++++++++++++++++++++----
>  include/linux/vfio.h            |  13 +-
>  3 files changed, 580 insertions(+), 48 deletions(-)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 6fd6fa5469de..e3e342861e04 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -1782,6 +1782,123 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
>  }
>  EXPORT_SYMBOL_GPL(vfio_info_cap_shift);
>  
> +static struct vfio_group *vfio_group_from_dev(struct device *dev)
> +{
> +     struct vfio_device *device;
> +     struct vfio_group *group;
> +     int ret;
> +
> +     device = vfio_device_get_from_dev(dev);
> +     if (!device)
> +             return ERR_PTR(-EINVAL);
> +
> +     group = device->group;
> +     if (!atomic_inc_not_zero(&group->container_users)) {
> +             ret = -EINVAL;
> +             goto err_ret;
> +     }
> +
> +     if (group->noiommu) {
> +             atomic_dec(&group->container_users);
> +             ret = -EPERM;
> +             goto err_ret;
> +     }
> +
> +     if (!group->container->iommu_driver ||
> +         !vfio_group_viable(group)) {
> +             atomic_dec(&group->container_users);
> +             ret = -EINVAL;
> +             goto err_ret;
> +     }
> +
> +     vfio_device_put(device);
> +     return group;
> +
> +err_ret:
> +     vfio_device_put(device);
> +     return ERR_PTR(ret);
> +}
> +
> +/*
> + * Pin a set of guest PFNs and return their associated host PFNs for local
> + * domain only.
> + * @dev [in] : device
> + * @user_pfn [in]: array of user/guest PFNs
> + * @npage [in]: count of array elements
> + * @prot [in] : protection flags
> + * @phys_pfn[out] : array of host PFNs
> + */
> +long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
> +                 long npage, int prot, unsigned long *phys_pfn)
> +{
> +     struct vfio_container *container;
> +     struct vfio_group *group;
> +     struct vfio_iommu_driver *driver;
> +     ssize_t ret = -EINVAL;
> +
> +     if (!dev || !user_pfn || !phys_pfn)
> +             return -EINVAL;
> +
> +     group = vfio_group_from_dev(dev);
> +     if (IS_ERR(group))
> +             return PTR_ERR(group);
> +
> +     container = group->container;
> +     if (IS_ERR(container))
> +             return PTR_ERR(container);
> +
> +     down_read(&container->group_lock);
> +
> +     driver = container->iommu_driver;
> +     if (likely(driver && driver->ops->pin_pages))
> +             ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
> +                                          npage, prot, phys_pfn);
> +
> +     up_read(&container->group_lock);
> +     vfio_group_try_dissolve_container(group);
> +
> +     return ret;
> +
> +}
> +EXPORT_SYMBOL(vfio_pin_pages);
> +
> +/*
> + * Unpin set of host PFNs for local domain only.
> + * @dev [in] : device
> + * @pfn [in] : array of host PFNs to be unpinned.
> + * @npage [in] :count of elements in array, that is number of pages.
> + */
> +long vfio_unpin_pages(struct device *dev, unsigned long *pfn, long npage)
> +{
> +     struct vfio_container *container;
> +     struct vfio_group *group;
> +     struct vfio_iommu_driver *driver;
> +     ssize_t ret = -EINVAL;
> +
> +     if (!dev || !pfn)
> +             return -EINVAL;
> +
> +     group = vfio_group_from_dev(dev);
> +     if (IS_ERR(group))
> +             return PTR_ERR(group);
> +
> +     container = group->container;
> +     if (IS_ERR(container))
> +             return PTR_ERR(container);
> +
> +     down_read(&container->group_lock);
> +
> +     driver = container->iommu_driver;
> +     if (likely(driver && driver->ops->unpin_pages))
> +             ret = driver->ops->unpin_pages(container->iommu_data, pfn,
> +                                            npage);
> +
> +     up_read(&container->group_lock);
> +     vfio_group_try_dissolve_container(group);
> +     return ret;
> +}
> +EXPORT_SYMBOL(vfio_unpin_pages);
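
[ For reference: the consumer of these two exports is expected to be a vendor
  driver for an mdev device. A minimal usage sketch (not from this patch; the
  device pointer, prot flags and error handling are illustrative only) looks
  roughly like:

      unsigned long gfn = gpa >> PAGE_SHIFT;  /* guest pfn of the DMA target */
      unsigned long hpfn;
      long ret;

      /* translate and pin one guest page, getting the host pfn back */
      ret = vfio_pin_pages(dev, &gfn, 1, IOMMU_READ | IOMMU_WRITE, &hpfn);
      if (ret != 1)
              return ret < 0 ? ret : -EFAULT;

      /* ... program device DMA to (hpfn << PAGE_SHIFT) ... */

      /* drop the pin once the DMA target is torn down */
      vfio_unpin_pages(dev, &hpfn, 1);
]
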
> +
>  /**
>   * Module/class support
>   */
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 2ba19424e4a1..d52d75fd0f04 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -55,18 +55,26 @@ MODULE_PARM_DESC(disable_hugepages,
>  
>  struct vfio_iommu {
>       struct list_head        domain_list;
> +     struct vfio_domain      *local_domain;
>       struct mutex            lock;
>       struct rb_root          dma_list;
>       bool                    v2;
>       bool                    nesting;
>  };
>  
> +struct local_addr_space {
> +     struct task_struct      *task;
> +     struct rb_root          pfn_list;       /* pinned Host pfn list */
> +     struct mutex            pfn_list_lock;  /* mutex for pfn_list */
> +};
> +
>  struct vfio_domain {
>       struct iommu_domain     *domain;
>       struct list_head        next;
>       struct list_head        group_list;
>       int                     prot;           /* IOMMU_CACHE */
>       bool                    fgsp;           /* Fine-grained super pages */
> +     struct local_addr_space *local_addr_space;
>  };
>  
>  struct vfio_dma {
> @@ -83,6 +91,22 @@ struct vfio_group {
>  };
>  
>  /*
> + * Guest RAM pinning working set or DMA target
> + */
> +struct vfio_pfn {
> +     struct rb_node          node;
> +     unsigned long           vaddr;          /* virtual addr */
> +     dma_addr_t              iova;           /* IOVA */
> +     unsigned long           pfn;            /* Host pfn */
> +     size_t                  prot;
> +     atomic_t                ref_count;
> +};
> +
> +
> +#define IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)  \
> +                      (list_empty(&iommu->domain_list) ? false : true)
> +
> +/*
>   * This code handles mapping and unmapping of user data buffers
>   * into DMA'ble space using the IOMMU
>   */
> @@ -130,6 +154,84 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>       rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> +/*
> + * Helper Functions for host pfn list
> + */
> +
> +static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
> +                                   unsigned long pfn)
> +{
> +     struct rb_node *node;
> +     struct vfio_pfn *vpfn, *ret = NULL;
> +
> +     node = domain->local_addr_space->pfn_list.rb_node;
> +
> +     while (node) {
> +             vpfn = rb_entry(node, struct vfio_pfn, node);
> +
> +             if (pfn < vpfn->pfn)
> +                     node = node->rb_left;
> +             else if (pfn > vpfn->pfn)
> +                     node = node->rb_right;
> +             else {
> +                     ret = vpfn;
> +                     break;
> +             }
> +     }
> +
> +     return ret;
> +}
> +
> +static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
> +{
> +     struct rb_node **link, *parent = NULL;
> +     struct vfio_pfn *vpfn;
> +
> +     link = &domain->local_addr_space->pfn_list.rb_node;
> +     while (*link) {
> +             parent = *link;
> +             vpfn = rb_entry(parent, struct vfio_pfn, node);
> +
> +             if (new->pfn < vpfn->pfn)
> +                     link = &(*link)->rb_left;
> +             else
> +                     link = &(*link)->rb_right;
> +     }
> +
> +     rb_link_node(&new->node, parent, link);
> +     rb_insert_color(&new->node, &domain->local_addr_space->pfn_list);
> +}
> +
> +static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
> +{
> +     rb_erase(&old->node, &domain->local_addr_space->pfn_list);
> +}
> +
> +static int vfio_add_to_pfn_list(struct vfio_domain *domain, unsigned long vaddr,
> +                             dma_addr_t iova, unsigned long pfn, size_t prot)
> +{
> +     struct vfio_pfn *vpfn;
> +
> +     vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
> +     if (!vpfn)
> +             return -ENOMEM;
> +
> +     vpfn->vaddr = vaddr;
> +     vpfn->iova = iova;
> +     vpfn->pfn = pfn;
> +     vpfn->prot = prot;
> +     atomic_set(&vpfn->ref_count, 1);
> +     vfio_link_pfn(domain, vpfn);
> +     return 0;
> +}
> +
> +static void vfio_remove_from_pfn_list(struct vfio_domain *domain,
> +                                   struct vfio_pfn *vpfn)
> +{
> +     vfio_unlink_pfn(domain, vpfn);
> +     kfree(vpfn);
> +}
> +
>  struct vwork {
>       struct mm_struct        *mm;
>       long                    npage;
> @@ -150,17 +252,17 @@ static void vfio_lock_acct_bg(struct work_struct *work)
>       kfree(vwork);
>  }
>  
> -static void vfio_lock_acct(long npage)
> +static void vfio_lock_acct(struct task_struct *task, long npage)
>  {
>       struct vwork *vwork;
>       struct mm_struct *mm;
>  
> -     if (!current->mm || !npage)
> +     if (!task->mm || !npage)
>               return; /* process exited or nothing to do */
>  
> -     if (down_write_trylock(&current->mm->mmap_sem)) {
> -             current->mm->locked_vm += npage;
> -             up_write(&current->mm->mmap_sem);
> +     if (down_write_trylock(&task->mm->mmap_sem)) {
> +             task->mm->locked_vm += npage;
> +             up_write(&task->mm->mmap_sem);
>               return;
>       }
>  
> @@ -172,7 +274,7 @@ static void vfio_lock_acct(long npage)
>       vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
>       if (!vwork)
>               return;
> -     mm = get_task_mm(current);
> +     mm = get_task_mm(task);
>       if (!mm) {
>               kfree(vwork);
>               return;
> @@ -228,20 +330,31 @@ static int put_pfn(unsigned long pfn, int prot)
>       return 0;
>  }
>  
> -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
> +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> +                      int prot, unsigned long *pfn)
>  {
>       struct page *page[1];
>       struct vm_area_struct *vma;
> +     struct mm_struct *local_mm = (mm ? mm : current->mm);
>       int ret = -EFAULT;
>  
> -     if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
> +     if (mm) {
> +             down_read(&local_mm->mmap_sem);
> +             ret = get_user_pages_remote(NULL, local_mm, vaddr, 1,
> +                                     !!(prot & IOMMU_WRITE), 0, page, NULL);
> +             up_read(&local_mm->mmap_sem);
> +     } else
> +             ret = get_user_pages_fast(vaddr, 1,
> +                                       !!(prot & IOMMU_WRITE), page);
> +
> +     if (ret == 1) {
>               *pfn = page_to_pfn(page[0]);
>               return 0;
>       }
>  
> -     down_read(&current->mm->mmap_sem);
> +     down_read(&local_mm->mmap_sem);
>  
> -     vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
> +     vma = find_vma_intersection(local_mm, vaddr, vaddr + 1);
>  
>       if (vma && vma->vm_flags & VM_PFNMAP) {
>               *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> @@ -249,7 +362,7 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>                       ret = 0;
>       }
>  
> -     up_read(&current->mm->mmap_sem);
> +     up_read(&local_mm->mmap_sem);
>  
>       return ret;
>  }
> @@ -259,8 +372,8 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
>   * the iommu can only map chunks of consecutive pfns anyway, so get the
>   * first page and all consecutive pages with the same locking.
>   */
> -static long vfio_pin_pages(unsigned long vaddr, long npage,
> -                        int prot, unsigned long *pfn_base)
> +static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
> +                                 int prot, unsigned long *pfn_base)
>  {
>       unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>       bool lock_cap = capable(CAP_IPC_LOCK);
> @@ -270,7 +383,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>       if (!current->mm)
>               return -ENODEV;
>  
> -     ret = vaddr_get_pfn(vaddr, prot, pfn_base);
> +     ret = vaddr_get_pfn(NULL, vaddr, prot, pfn_base);
>       if (ret)
>               return ret;
>  
> @@ -285,7 +398,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  
>       if (unlikely(disable_hugepages)) {
>               if (!rsvd)
> -                     vfio_lock_acct(1);
> +                     vfio_lock_acct(current, 1);
>               return 1;
>       }
>  
> @@ -293,7 +406,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>       for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
>               unsigned long pfn = 0;
>  
> -             ret = vaddr_get_pfn(vaddr, prot, &pfn);
> +             ret = vaddr_get_pfn(NULL, vaddr, prot, &pfn);
>               if (ret)
>                       break;
>  
> @@ -313,13 +426,13 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>       }
>  
>       if (!rsvd)
> -             vfio_lock_acct(i);
> +             vfio_lock_acct(current, i);
>  
>       return i;
>  }
>  
> -static long vfio_unpin_pages(unsigned long pfn, long npage,
> -                          int prot, bool do_accounting)
> +static long __vfio_unpin_pages_remote(unsigned long pfn, long npage, int prot,
> +                                   bool do_accounting)
>  {
>       unsigned long unlocked = 0;
>       long i;
> @@ -328,7 +441,188 @@ static long vfio_unpin_pages(unsigned long pfn, long npage,
>               unlocked += put_pfn(pfn++, prot);
>  
>       if (do_accounting)
> -             vfio_lock_acct(-unlocked);
> +             vfio_lock_acct(current, -unlocked);
> +     return unlocked;
> +}
> +
> +static long __vfio_pin_pages_local(struct vfio_domain *domain,
> +                                unsigned long vaddr, int prot,
> +                                unsigned long *pfn_base,
> +                                bool do_accounting)
> +{
> +     unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +     bool lock_cap = capable(CAP_IPC_LOCK);
> +     long ret;
> +     bool rsvd;
> +     struct task_struct *task = domain->local_addr_space->task;
> +
> +     if (!task->mm)
> +             return -ENODEV;
> +
> +     ret = vaddr_get_pfn(task->mm, vaddr, prot, pfn_base);
> +     if (ret)
> +             return ret;
> +
> +     rsvd = is_invalid_reserved_pfn(*pfn_base);
> +
> +     if (!rsvd && !lock_cap && task->mm->locked_vm + 1 > limit) {
> +             put_pfn(*pfn_base, prot);
> +             pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> +                     limit << PAGE_SHIFT);
> +             return -ENOMEM;
> +     }
> +
> +     if (!rsvd && do_accounting)
> +             vfio_lock_acct(task, 1);
> +
> +     return 1;
> +}
> +
> +static void __vfio_unpin_pages_local(struct vfio_domain *domain,
> +                                  unsigned long pfn, int prot,
> +                                  bool do_accounting)
> +{
> +     put_pfn(pfn, prot);
> +
> +     if (do_accounting)
> +             vfio_lock_acct(domain->local_addr_space->task, -1);
> +}
> +
> +static int vfio_unpin_pfn(struct vfio_domain *domain,
> +                       struct vfio_pfn *vpfn, bool do_accounting)
> +{
> +     __vfio_unpin_pages_local(domain, vpfn->pfn, vpfn->prot,
> +                              do_accounting);
> +
> +     if (atomic_dec_and_test(&vpfn->ref_count))
> +             vfio_remove_from_pfn_list(domain, vpfn);
> +
> +     return 1;
> +}
> +
> +static long vfio_iommu_type1_pin_pages(void *iommu_data,
> +                                    unsigned long *user_pfn,
> +                                    long npage, int prot,
> +                                    unsigned long *phys_pfn)
> +{
> +     struct vfio_iommu *iommu = iommu_data;
> +     struct vfio_domain *domain;
> +     int i, j, ret;
> +     long retpage;
> +     unsigned long remote_vaddr;
> +     unsigned long *pfn = phys_pfn;
> +     struct vfio_dma *dma;
> +     bool do_accounting = false;
> +
> +     if (!iommu || !user_pfn || !phys_pfn)
> +             return -EINVAL;
> +
> +     mutex_lock(&iommu->lock);
> +
> +     if (!iommu->local_domain) {
> +             ret = -EINVAL;
> +             goto pin_done;
> +     }
> +
> +     domain = iommu->local_domain;
> +
> +     /*
> +      * If an iommu capable domain exists in the container then all pages are
> +      * already pinned and accounted. Accounting should be done if there is no
> +      * iommu capable domain in the container.
> +      */
> +     do_accounting = !IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu);
> +
> +     for (i = 0; i < npage; i++) {
> +             struct vfio_pfn *p;
> +             dma_addr_t iova;
> +
> +             iova = user_pfn[i] << PAGE_SHIFT;
> +
> +             dma = vfio_find_dma(iommu, iova, 0);
> +             if (!dma) {
> +                     ret = -EINVAL;
> +                     goto pin_unwind;
> +             }
> +
> +             remote_vaddr = dma->vaddr + iova - dma->iova;
> +
> +             retpage = __vfio_pin_pages_local(domain, remote_vaddr, prot,
> +                                              &pfn[i], do_accounting);

Hi Kirti,

Here you call __vfio_pin_pages_local() -> vaddr_get_pfn() -> GUP regardless of
whether the vaddr is already pinned or not. That probably means that if the
caller calls vfio_pin_pages() with the same GPA multiple times, you get memory leaks.

GUP always increases the page refcnt.

FWIW, I would like to have the pfn_list implemented with key == iova,
so you can always try to find the PFN for a given iova, and pin it only if
it is not found.
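
(Roughly, the repeated-pin flow as posted is:

     __vfio_pin_pages_local()         /* GUP takes a new page reference */
     vfio_find_pfn()                  /* entry for this pfn already exists */
     atomic_inc(&p->ref_count)        /* only the counter is bumped */

so the extra get_user_pages reference taken by the second and later pins is
only dropped again if the caller issues a matching unpin for every pin.)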

--
Thanks,
Jike


> +             if (retpage <= 0) {
> +                     WARN_ON(!retpage);
> +                     ret = (int)retpage;
> +                     goto pin_unwind;
> +             }
> +
> +             mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +
> +             /* search if pfn exist */
> +             p = vfio_find_pfn(domain, pfn[i]);
> +             if (p) {
> +                     atomic_inc(&p->ref_count);
> +                     mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +                     continue;
> +             }
> +
> +             ret = vfio_add_to_pfn_list(domain, remote_vaddr, iova,
> +                                        pfn[i], prot);
> +             mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +
> +             if (ret) {
> +                     __vfio_unpin_pages_local(domain, pfn[i], prot,
> +                                              do_accounting);
> +                     goto pin_unwind;
> +             }
> +     }
> +
> +     ret = i;
> +     goto pin_done;
> +
> +pin_unwind:
> +     pfn[i] = 0;
> +     mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +     for (j = 0; j < i; j++) {
> +             struct vfio_pfn *p;
> +
> +             p = vfio_find_pfn(domain, pfn[j]);
> +             if (p)
> +                     vfio_unpin_pfn(domain, p, do_accounting);
> +
> +             pfn[j] = 0;
> +     }
> +     mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +
> +pin_done:
> +     mutex_unlock(&iommu->lock);
> +     return ret;
> +}
> +
> +static long vfio_iommu_type1_unpin_pages(void *iommu_data, unsigned long *pfn,
> +                                      long npage)
> +{
> +     struct vfio_iommu *iommu = iommu_data;
> +     struct vfio_domain *domain = NULL;
> +     long unlocked = 0;
> +     int i;
> +
> +     if (!iommu || !pfn)
> +             return -EINVAL;
> +
> +     domain = iommu->local_domain;
> +
> +     for (i = 0; i < npage; i++) {
> +             struct vfio_pfn *p;
> +
> +             mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +
> +             /* verify if pfn exist in pfn_list */
> +             p = vfio_find_pfn(domain, pfn[i]);
> +             if (p)
> +                     unlocked += vfio_unpin_pfn(domain, p, true);
> +
> +             mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +     }
>  
>       return unlocked;
>  }
> @@ -341,6 +635,9 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  
>       if (!dma->size)
>               return;
> +
> +     if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> +             return;
>       /*
>        * We use the IOMMU to track the physical addresses, otherwise we'd
>        * need a much more complicated tracking system.  Unfortunately that
> @@ -382,15 +679,15 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>               if (WARN_ON(!unmapped))
>                       break;
>  
> -             unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
> -                                          unmapped >> PAGE_SHIFT,
> -                                          dma->prot, false);
> +             unlocked += __vfio_unpin_pages_remote(phys >> PAGE_SHIFT,
> +                                                   unmapped >> PAGE_SHIFT,
> +                                                   dma->prot, false);
>               iova += unmapped;
>  
>               cond_resched();
>       }
>  
> -     vfio_lock_acct(-unlocked);
> +     vfio_lock_acct(current, -unlocked);
>  }
>  
>  static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> @@ -611,10 +908,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>       /* Insert zero-sized and grow as we map chunks of it */
>       vfio_link_dma(iommu, dma);
>  
> +     /* Don't pin and map if container doesn't contain IOMMU capable domain*/
> +     if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu)) {
> +             dma->size = size;
> +             goto map_done;
> +     }
> +
>       while (size) {
>               /* Pin a contiguous chunk of memory */
> -             npage = vfio_pin_pages(vaddr + dma->size,
> -                                    size >> PAGE_SHIFT, prot, &pfn);
> +             npage = __vfio_pin_pages_remote(vaddr + dma->size,
> +                                             size >> PAGE_SHIFT, prot, &pfn);
>               if (npage <= 0) {
>                       WARN_ON(!npage);
>                       ret = (int)npage;
> @@ -624,7 +927,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>               /* Map it! */
>               ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
>               if (ret) {
> -                     vfio_unpin_pages(pfn, npage, prot, true);
> +                     __vfio_unpin_pages_remote(pfn, npage, prot, true);
>                       break;
>               }
>  
> @@ -635,6 +938,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>       if (ret)
>               vfio_remove_dma(iommu, dma);
>  
> +map_done:
>       mutex_unlock(&iommu->lock);
>       return ret;
>  }
> @@ -734,11 +1038,24 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain)
>       __free_pages(pages, order);
>  }
>  
> +static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
> +                                struct iommu_group *iommu_group)
> +{
> +     struct vfio_group *g;
> +
> +     list_for_each_entry(g, &domain->group_list, next) {
> +             if (g->iommu_group == iommu_group)
> +                     return g;
> +     }
> +
> +     return NULL;
> +}
> +
>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>                                        struct iommu_group *iommu_group)
>  {
>       struct vfio_iommu *iommu = iommu_data;
> -     struct vfio_group *group, *g;
> +     struct vfio_group *group;
>       struct vfio_domain *domain, *d;
>       struct bus_type *bus = NULL;
>       int ret;
> @@ -746,10 +1063,14 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>       mutex_lock(&iommu->lock);
>  
>       list_for_each_entry(d, &iommu->domain_list, next) {
> -             list_for_each_entry(g, &d->group_list, next) {
> -                     if (g->iommu_group != iommu_group)
> -                             continue;
> +             if (find_iommu_group(d, iommu_group)) {
> +                     mutex_unlock(&iommu->lock);
> +                     return -EINVAL;
> +             }
> +     }
>  
> +     if (iommu->local_domain) {
> +             if (find_iommu_group(iommu->local_domain, iommu_group)) {
>                       mutex_unlock(&iommu->lock);
>                       return -EINVAL;
>               }
> @@ -769,6 +1090,33 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>       if (ret)
>               goto out_free;
>  
> +     if (IS_ENABLED(CONFIG_VFIO_MDEV) && !iommu_present(bus) &&
> +         (bus == &mdev_bus_type)) {
> +             if (iommu->local_domain) {
> +                     list_add(&group->next,
> +                              &iommu->local_domain->group_list);
> +                     kfree(domain);
> +                     mutex_unlock(&iommu->lock);
> +                     return 0;
> +             }
> +
> +             domain->local_addr_space = kzalloc(sizeof(*domain->local_addr_space),
> +                                                GFP_KERNEL);
> +             if (!domain->local_addr_space) {
> +                     ret = -ENOMEM;
> +                     goto out_free;
> +             }
> +
> +             domain->local_addr_space->task = current;
> +             INIT_LIST_HEAD(&domain->group_list);
> +             list_add(&group->next, &domain->group_list);
> +             domain->local_addr_space->pfn_list = RB_ROOT;
> +             mutex_init(&domain->local_addr_space->pfn_list_lock);
> +             iommu->local_domain = domain;
> +             mutex_unlock(&iommu->lock);
> +             return 0;
> +     }
> +
>       domain->domain = iommu_domain_alloc(bus);
>       if (!domain->domain) {
>               ret = -EIO;
> @@ -859,6 +1207,18 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
>               vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
>  }
>  
> +static void vfio_local_unpin_all(struct vfio_domain *domain)
> +{
> +     struct rb_node *node;
> +
> +     mutex_lock(&domain->local_addr_space->pfn_list_lock);
> +     while ((node = rb_first(&domain->local_addr_space->pfn_list))) {
> +             vfio_unpin_pfn(domain,
> +                             rb_entry(node, struct vfio_pfn, node), false);
> +     }
> +     mutex_unlock(&domain->local_addr_space->pfn_list_lock);
> +}
> +
>  static void vfio_iommu_type1_detach_group(void *iommu_data,
>                                         struct iommu_group *iommu_group)
>  {
> @@ -868,31 +1228,52 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
>  
>       mutex_lock(&iommu->lock);
>  
> -     list_for_each_entry(domain, &iommu->domain_list, next) {
> -             list_for_each_entry(group, &domain->group_list, next) {
> -                     if (group->iommu_group != iommu_group)
> -                             continue;
> +     if (iommu->local_domain) {
> +             domain = iommu->local_domain;
> +             group = find_iommu_group(domain, iommu_group);
> +             if (group) {
> +                     list_del(&group->next);
> +                     kfree(group);
>  
> +                     if (list_empty(&domain->group_list)) {
> +                             vfio_local_unpin_all(domain);
> +                             if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> +                                     vfio_iommu_unmap_unpin_all(iommu);
> +                             kfree(domain);
> +                             iommu->local_domain = NULL;
> +                     }
> +                     goto detach_group_done;
> +             }
> +     }
> +
> +     if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> +             goto detach_group_done;
> +
> +     list_for_each_entry(domain, &iommu->domain_list, next) {
> +             group = find_iommu_group(domain, iommu_group);
> +             if (group) {
>                       iommu_detach_group(domain->domain, iommu_group);
>                       list_del(&group->next);
>                       kfree(group);
>                       /*
>                        * Group ownership provides privilege, if the group
>                        * list is empty, the domain goes away.  If it's the
> -                      * last domain, then all the mappings go away too.
> +                      * last domain with iommu and local domain doesn't
> +                      * exist, then all the mappings go away too.
>                        */
>                       if (list_empty(&domain->group_list)) {
> -                             if (list_is_singular(&iommu->domain_list))
> +                             if (list_is_singular(&iommu->domain_list) &&
> +                                (!iommu->local_domain))
>                                       vfio_iommu_unmap_unpin_all(iommu);
>                               iommu_domain_free(domain->domain);
>                               list_del(&domain->next);
>                               kfree(domain);
>                       }
> -                     goto done;
> +                     break;
>               }
>       }
>  
> -done:
> +detach_group_done:
>       mutex_unlock(&iommu->lock);
>  }
>  
> @@ -924,27 +1305,48 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>       return iommu;
>  }
>  
> +static void vfio_release_domain(struct vfio_domain *domain)
> +{
> +     struct vfio_group *group, *group_tmp;
> +
> +     list_for_each_entry_safe(group, group_tmp,
> +                              &domain->group_list, next) {
> +             if (!domain->local_addr_space)
> +                     iommu_detach_group(domain->domain, group->iommu_group);
> +             list_del(&group->next);
> +             kfree(group);
> +     }
> +
> +     if (domain->local_addr_space)
> +             vfio_local_unpin_all(domain);
> +     else
> +             iommu_domain_free(domain->domain);
> +}
> +
>  static void vfio_iommu_type1_release(void *iommu_data)
>  {
>       struct vfio_iommu *iommu = iommu_data;
>       struct vfio_domain *domain, *domain_tmp;
> -     struct vfio_group *group, *group_tmp;
> +
> +     if (iommu->local_domain) {
> +             vfio_release_domain(iommu->local_domain);
> +             kfree(iommu->local_domain);
> +             iommu->local_domain = NULL;
> +     }
>  
>       vfio_iommu_unmap_unpin_all(iommu);
>  
> +     if (!IS_IOMMU_CAPABLE_DOMAIN_IN_CONTAINER(iommu))
> +             goto release_exit;
> +
>       list_for_each_entry_safe(domain, domain_tmp,
>                                &iommu->domain_list, next) {
> -             list_for_each_entry_safe(group, group_tmp,
> -                                      &domain->group_list, next) {
> -                     iommu_detach_group(domain->domain, group->iommu_group);
> -                     list_del(&group->next);
> -                     kfree(group);
> -             }
> -             iommu_domain_free(domain->domain);
> +             vfio_release_domain(domain);
>               list_del(&domain->next);
>               kfree(domain);
>       }
>  
> +release_exit:
>       kfree(iommu);
>  }
>  
> @@ -1048,6 +1450,8 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
>       .ioctl          = vfio_iommu_type1_ioctl,
>       .attach_group   = vfio_iommu_type1_attach_group,
>       .detach_group   = vfio_iommu_type1_detach_group,
> +     .pin_pages      = vfio_iommu_type1_pin_pages,
> +     .unpin_pages    = vfio_iommu_type1_unpin_pages,
>  };
>  
>  static int __init vfio_iommu_type1_init(void)
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 0ecae0b1cd34..0bd25ba6223d 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -17,6 +17,7 @@
>  #include <linux/workqueue.h>
>  #include <linux/poll.h>
>  #include <uapi/linux/vfio.h>
> +#include <linux/mdev.h>
>  
>  /**
>   * struct vfio_device_ops - VFIO bus driver device callbacks
> @@ -75,7 +76,11 @@ struct vfio_iommu_driver_ops {
>                                       struct iommu_group *group);
>       void            (*detach_group)(void *iommu_data,
>                                       struct iommu_group *group);
> -
> +     long            (*pin_pages)(void *iommu_data, unsigned long *user_pfn,
> +                                  long npage, int prot,
> +                                  unsigned long *phys_pfn);
> +     long            (*unpin_pages)(void *iommu_data, unsigned long *pfn,
> +                                    long npage);
>  };
>  
>  extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
> @@ -127,6 +132,12 @@ static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
>  }
>  #endif /* CONFIG_EEH */
>  
> +extern long vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
> +                        long npage, int prot, unsigned long *phys_pfn);
> +
> +extern long vfio_unpin_pages(struct device *dev, unsigned long *pfn,
> +                          long npage);
> +
>  /*
>   * IRQfd - generic
>   */
> 



