From: Alex Williamson
Subject: Re: [Qemu-devel] [PATCH v11 09/22] vfio iommu type1: Add task structure to vfio_dma
Date: Mon, 7 Nov 2016 14:03:48 -0700

On Sat, 5 Nov 2016 02:40:43 +0530
Kirti Wankhede <address@hidden> wrote:

> Add a task structure to vfio_dma.
> Add an address space structure. Each vfio_dma structure points to the
> address space of the task that mapped it.
> A list of address spaces is maintained in the vfio_iommu structure.
> On a DMA_MAP call, if the address space already exists in the address
> space list, vfio_dma points to it. If it doesn't exist, allocate an
> address space, save the mm pointer in it, and point vfio_dma to it.
> Two tasks can share the same address space, so the address space
> structure has to be kept separate from the task in vfio_dma; vfio_dma
> keeps a pointer to its corresponding address space.
> During DMA_UNMAP, the task that mapped the range, or another task
> sharing the same address space, is allowed to unmap it; otherwise the
> unmap fails.
> QEMU maps a few iova ranges initially, then spawns threads, and a child
> thread calls DMA_UNMAP on a previously mapped iova. Since the child
> shares the same address space, the DMA_UNMAP succeeds.
> This address space structure is used in later changes to track pages
> pinned by an external user.
> 
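For illustration, a hypothetical userspace sketch of the scenario just
described (not part of the patch): the main thread maps an iova range
and a child thread, which shares the same mm, unmaps it. It assumes
"container" is an already-configured VFIO container fd and "buf" a
page-aligned buffer; all setup and error handling are elided.

#include <pthread.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int container;   /* assumed: VFIO container fd, already set up */

static void *unmap_thread(void *arg)
{
        struct vfio_iommu_type1_dma_unmap unmap = {
                .argsz = sizeof(unmap),
                .iova  = 0x100000,
                .size  = 4096,
        };

        /* Succeeds with this patch: the thread shares the mapper's mm. */
        ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);
        return NULL;
}

static void map_then_unmap_from_thread(void *buf)
{
        struct vfio_iommu_type1_dma_map map = {
                .argsz = sizeof(map),
                .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
                .vaddr = (unsigned long)buf,
                .iova  = 0x100000,
                .size  = 4096,
        };
        pthread_t t;

        ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
        pthread_create(&t, NULL, unmap_thread, NULL);
        pthread_join(t, NULL);
}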
> Signed-off-by: Kirti Wankhede <address@hidden>
> Signed-off-by: Neo Jia <address@hidden>
> Change-Id: I7600f1bea6b384fd589fa72421ccf031bcfd9ac5
> ---
>  drivers/vfio/vfio_iommu_type1.c | 182 +++++++++++++++++++++++++++++-----------
>  1 file changed, 134 insertions(+), 48 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 422c8d198abb..8d64528dcc22 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -55,12 +55,20 @@ MODULE_PARM_DESC(disable_hugepages,
>  
>  struct vfio_iommu {
>       struct list_head        domain_list;
> +     struct list_head        addr_space_list;
>       struct mutex            lock;
>       struct rb_root          dma_list;
>       bool                    v2;
>       bool                    nesting;
>  };
>  
> +/* address space */
> +struct vfio_addr_space {
> +     struct mm_struct        *mm;
> +     struct list_head        next;
> +     atomic_t                ref_count;
> +};
> +
>  struct vfio_domain {
>       struct iommu_domain     *domain;
>       struct list_head        next;
> @@ -75,6 +83,9 @@ struct vfio_dma {
>       unsigned long           vaddr;          /* Process virtual addr */
>       size_t                  size;           /* Map size (bytes) */
>       int                     prot;           /* IOMMU_READ/WRITE */
> +     struct vfio_addr_space  *addr_space;
> +     struct task_struct      *task;
> +     bool                    mlock_cap;
>  };
>  
>  struct vfio_group {
> @@ -130,6 +141,18 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>       rb_erase(&old->node, &iommu->dma_list);
>  }
>  
> +static struct vfio_addr_space *vfio_find_addr_space(struct vfio_iommu *iommu,
> +                                                 struct mm_struct *mm)
> +{
> +     struct vfio_addr_space *as;
> +
> +     list_for_each_entry(as, &iommu->addr_space_list, next) {
> +             if (as->mm == mm)
> +                     return as;
> +     }
> +     return NULL;
> +}
> +
>  struct vwork {
>       struct mm_struct        *mm;
>       long                    npage;
> @@ -273,24 +296,24 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>   * the iommu can only map chunks of consecutive pfns anyway, so get the
>   * first page and all consecutive pages with the same locking.
>   */
> -static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
> -                                 int prot, unsigned long *pfn_base)
> +static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> +                                 long npage, int prot,
> +                                 unsigned long *pfn_base)
>  {
> -     unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> -     bool lock_cap = capable(CAP_IPC_LOCK);
> +     struct task_struct *task = dma->task;
> +     unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +     bool lock_cap = dma->mlock_cap;
> +     struct mm_struct *mm = dma->addr_space->mm;
>       long ret, i;
>       bool rsvd;
>  
> -     if (!current->mm)
> -             return -ENODEV;
> -
> -     ret = vaddr_get_pfn(current->mm, vaddr, prot, pfn_base);
> +     ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
>       if (ret)
>               return ret;
>  
>       rsvd = is_invalid_reserved_pfn(*pfn_base);
>  
> -     if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
> +     if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
>               put_pfn(*pfn_base, prot);
>               pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
>                       limit << PAGE_SHIFT);
> @@ -299,7 +322,7 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
>  
>       if (unlikely(disable_hugepages)) {
>               if (!rsvd)
> -                     vfio_lock_acct(current->mm, 1);
> +                     vfio_lock_acct(mm, 1);
>               return 1;
>       }
>  
> @@ -307,7 +330,7 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
>       for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
>               unsigned long pfn = 0;
>  
> -             ret = vaddr_get_pfn(current->mm, vaddr, prot, &pfn);
> +             ret = vaddr_get_pfn(mm, vaddr, prot, &pfn);
>               if (ret)
>                       break;
>  
> @@ -318,7 +341,7 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
>               }
>  
>               if (!rsvd && !lock_cap &&
> -                 current->mm->locked_vm + i + 1 > limit) {
> +                 mm->locked_vm + i + 1 > limit) {
>                       put_pfn(pfn, prot);
>                       pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>                               __func__, limit << PAGE_SHIFT);
> @@ -327,13 +350,13 @@ static long __vfio_pin_pages_remote(unsigned long vaddr, long npage,
>       }
>  
>       if (!rsvd)
> -             vfio_lock_acct(current->mm, i);
> +             vfio_lock_acct(mm, i);
>  
>       return i;
>  }
>  
> -static long __vfio_unpin_pages_remote(unsigned long pfn, long npage,
> -                                   int prot, bool do_accounting)
> +static long __vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
> +                                   long npage, int prot, bool do_accounting)
>  {
>       unsigned long unlocked = 0;
>       long i;
> @@ -342,7 +365,7 @@ static long __vfio_unpin_pages_remote(unsigned long pfn, long npage,
>               unlocked += put_pfn(pfn++, prot);
>  
>       if (do_accounting)
> -             vfio_lock_acct(current->mm, -unlocked);
> +             vfio_lock_acct(dma->addr_space->mm, -unlocked);
>  
>       return unlocked;
>  }
> @@ -396,7 +419,7 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>               if (WARN_ON(!unmapped))
>                       break;
>  
> -             unlocked += __vfio_unpin_pages_remote(phys >> PAGE_SHIFT,
> +             unlocked += __vfio_unpin_pages_remote(dma, phys >> PAGE_SHIFT,
>                                                     unmapped >> PAGE_SHIFT,
>                                                     dma->prot, false);
>               iova += unmapped;
> @@ -404,13 +427,20 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>               cond_resched();
>       }
>  
> -     vfio_lock_acct(current->mm, -unlocked);
> +     vfio_lock_acct(dma->addr_space->mm, -unlocked);
>  }
>  
>  static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
>  {
>       vfio_unmap_unpin(iommu, dma);
>       vfio_unlink_dma(iommu, dma);
> +
> +     if (atomic_dec_and_test(&dma->addr_space->ref_count)) {
> +             mmput(dma->addr_space->mm);
> +             put_task_struct(dma->task);
> +             list_del(&dma->addr_space->next);
> +             kfree(dma->addr_space);
> +     }
>       kfree(dma);
>  }
>  
> @@ -506,6 +536,12 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>       while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
>               if (!iommu->v2 && unmap->iova > dma->iova)
>                       break;
> +             /*
> +              * Task with same address space who mapped this iova range is
> +              * allowed to unmap the iova range.
> +              */
> +             if (dma->task->mm != current->mm)
> +                     break;
>               unmapped += dma->size;
>               vfio_remove_dma(iommu, dma);
>       }
> @@ -572,17 +608,58 @@ unwind:
>       return ret;
>  }
>  
> +static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> +                         size_t map_size)
> +{
> +     dma_addr_t iova = dma->iova;
> +     unsigned long vaddr = dma->vaddr;
> +     size_t size = map_size;
> +     long npage;
> +     unsigned long pfn;
> +     int ret = 0;
> +
> +     while (size) {
> +             /* Pin a contiguous chunk of memory */
> +             npage = __vfio_pin_pages_remote(dma, vaddr + dma->size,
> +                                             size >> PAGE_SHIFT, dma->prot,
> +                                             &pfn);
> +             if (npage <= 0) {
> +                     WARN_ON(!npage);
> +                     ret = (int)npage;
> +                     break;
> +             }
> +
> +             /* Map it! */
> +             ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
> +                                  dma->prot);
> +             if (ret) {
> +                     __vfio_unpin_pages_remote(dma, pfn, npage, dma->prot,
> +                                               true);
> +                     break;
> +             }
> +
> +             size -= npage << PAGE_SHIFT;
> +             dma->size += npage << PAGE_SHIFT;
> +     }
> +
> +     if (ret)
> +             vfio_remove_dma(iommu, dma);
> +
> +     return ret;
> +}
> +
>  static int vfio_dma_do_map(struct vfio_iommu *iommu,
>                          struct vfio_iommu_type1_dma_map *map)
>  {
>       dma_addr_t iova = map->iova;
>       unsigned long vaddr = map->vaddr;
>       size_t size = map->size;
> -     long npage;
>       int ret = 0, prot = 0;
>       uint64_t mask;
>       struct vfio_dma *dma;
> -     unsigned long pfn;
> +     struct vfio_addr_space *addr_space;
> +     struct mm_struct *mm;
> +     bool free_addr_space_on_err = false;
>  
>       /* Verify that none of our __u64 fields overflow */
>       if (map->size != size || map->vaddr != vaddr || map->iova != iova)
> @@ -608,47 +685,56 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>       mutex_lock(&iommu->lock);
>  
>       if (vfio_find_dma(iommu, iova, size)) {
> -             mutex_unlock(&iommu->lock);
> -             return -EEXIST;
> +             ret = -EEXIST;
> +             goto do_map_err;
> +     }
> +
> +     mm = get_task_mm(current);
> +     if (!mm) {
> +             ret = -ENODEV;

-EFAULT?

> +             goto do_map_err;
> +     }
> +
> +     addr_space = vfio_find_addr_space(iommu, mm);
> +     if (addr_space) {
> +             atomic_inc(&addr_space->ref_count);
> +             mmput(mm);
> +     } else {
> +             addr_space = kzalloc(sizeof(*addr_space), GFP_KERNEL);
> +             if (!addr_space) {
> +                     ret = -ENOMEM;
> +                     goto do_map_err;
> +             }
> +             addr_space->mm = mm;
> +             atomic_set(&addr_space->ref_count, 1);
> +             list_add(&addr_space->next, &iommu->addr_space_list);
> +             free_addr_space_on_err = true;
>       }
>  
>       dma = kzalloc(sizeof(*dma), GFP_KERNEL);
>       if (!dma) {
> -             mutex_unlock(&iommu->lock);
> -             return -ENOMEM;
> +             if (free_addr_space_on_err) {
> +                     mmput(mm);
> +                     list_del(&addr_space->next);
> +                     kfree(addr_space);
> +             }
> +             ret = -ENOMEM;
> +             goto do_map_err;
>       }
>  
>       dma->iova = iova;
>       dma->vaddr = vaddr;
>       dma->prot = prot;
> +     dma->addr_space = addr_space;
> +     get_task_struct(current);
> +     dma->task = current;
> +     dma->mlock_cap = capable(CAP_IPC_LOCK);


How do you reason we can cache this?  Does the fact that the process
had this capability at the time that it did a DMA_MAP imply that it
necessarily still has this capability when an external user (vendor
driver) tries to pin pages?  I don't see how we can make that
assumption.
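
(A hypothetical alternative sketch, not from the patch: re-evaluate the
capability against the saved task each time pages are pinned, using the
kernel's has_capability() helper for querying a task other than current,
rather than caching the result at DMA_MAP time.)

#include <linux/capability.h>

/* Hypothetical: check CAP_IPC_LOCK against the mapping task at pin
 * time instead of relying on a value cached in vfio_dma at map time. */
static bool vfio_dma_lock_cap(struct vfio_dma *dma)
{
        return has_capability(dma->task, CAP_IPC_LOCK);
}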


>  
>       /* Insert zero-sized and grow as we map chunks of it */
>       vfio_link_dma(iommu, dma);
>  
> -     while (size) {
> -             /* Pin a contiguous chunk of memory */
> -             npage = __vfio_pin_pages_remote(vaddr + dma->size,
> -                                             size >> PAGE_SHIFT, prot, &pfn);
> -             if (npage <= 0) {
> -                     WARN_ON(!npage);
> -                     ret = (int)npage;
> -                     break;
> -             }
> -
> -             /* Map it! */
> -             ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
> -             if (ret) {
> -                     __vfio_unpin_pages_remote(pfn, npage, prot, true);
> -                     break;
> -             }
> -
> -             size -= npage << PAGE_SHIFT;
> -             dma->size += npage << PAGE_SHIFT;
> -     }
> -
> -     if (ret)
> -             vfio_remove_dma(iommu, dma);
> -
> +     ret = vfio_pin_map_dma(iommu, dma, size);
> +do_map_err:
>       mutex_unlock(&iommu->lock);
>       return ret;
>  }