RDMA/ucontext: Fix regression with disassociate
authorJason Gunthorpe <jgg@mellanox.com>
Tue, 16 Apr 2019 11:07:28 +0000 (14:07 +0300)
committerJason Gunthorpe <jgg@mellanox.com>
Wed, 24 Apr 2019 16:32:25 +0000 (13:32 -0300)
When this code was consolidated the intention was that the VMA would
become backed by anonymous zero pages after the zap_vma_pte - however this
very subtly relied on setting the vm_ops = NULL and clearing the VM_SHARED
bits to transform the VMA into an anonymous VMA. Since the vm_ops was
removed this broke.

Now userspace gets a SIGBUS if it touches the vma after disassociation.

Instead of converting the VMA to anonymous provide a fault handler that
puts a zero'd page into the VMA when user-space touches it after
disassociation.

Cc: stable@vger.kernel.org
Suggested-by: Andrea Arcangeli <aarcange@redhat.com>
Fixes: 5f9794dc94f5 ("RDMA/ucontext: Add a core API for mmaping driver IO memory")
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
drivers/infiniband/core/uverbs.h
drivers/infiniband/core/uverbs_main.c

index ea0bc6885517b30da5c7a20d6e3805c582f3824b..32cc8fe7902f13dd5f6289ca36f5a8aca84a172b 100644 (file)
@@ -160,6 +160,7 @@ struct ib_uverbs_file {
 
        struct mutex umap_lock;
        struct list_head umaps;
+       struct page *disassociate_page;
 
        struct idr              idr;
        /* spinlock protects write access to idr */
index 70b7d80431a9b935b9a7ffa6fa50be6601f9c4a0..db20b6e0f253c92402d422c8080fc5b1385d47fc 100644 (file)
@@ -208,6 +208,9 @@ void ib_uverbs_release_file(struct kref *ref)
                kref_put(&file->async_file->ref,
                         ib_uverbs_release_async_event_file);
        put_device(&file->device->dev);
+
+       if (file->disassociate_page)
+               __free_pages(file->disassociate_page, 0);
        kfree(file);
 }
 
@@ -877,9 +880,50 @@ static void rdma_umap_close(struct vm_area_struct *vma)
        kfree(priv);
 }
 
+/*
+ * Once the zap_vma_ptes has been called touches to the VMA will come here and
+ * we return a dummy writable zero page for all the pfns.
+ */
+static vm_fault_t rdma_umap_fault(struct vm_fault *vmf)
+{
+       struct ib_uverbs_file *ufile = vmf->vma->vm_file->private_data;
+       struct rdma_umap_priv *priv = vmf->vma->vm_private_data;
+       vm_fault_t ret = 0;
+
+       if (!priv)
+               return VM_FAULT_SIGBUS;
+
+       /* Read only pages can just use the system zero page. */
+       if (!(vmf->vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) {
+               vmf->page = ZERO_PAGE(vmf->vm_start);
+               get_page(vmf->page);
+               return 0;
+       }
+
+       mutex_lock(&ufile->umap_lock);
+       if (!ufile->disassociate_page)
+               ufile->disassociate_page =
+                       alloc_pages(vmf->gfp_mask | __GFP_ZERO, 0);
+
+       if (ufile->disassociate_page) {
+               /*
+                * This VMA is forced to always be shared so this doesn't have
+                * to worry about COW.
+                */
+               vmf->page = ufile->disassociate_page;
+               get_page(vmf->page);
+       } else {
+               ret = VM_FAULT_SIGBUS;
+       }
+       mutex_unlock(&ufile->umap_lock);
+
+       return ret;
+}
+
 static const struct vm_operations_struct rdma_umap_ops = {
        .open = rdma_umap_open,
        .close = rdma_umap_close,
+       .fault = rdma_umap_fault,
 };
 
 static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext,
@@ -889,6 +933,9 @@ static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext,
        struct ib_uverbs_file *ufile = ucontext->ufile;
        struct rdma_umap_priv *priv;
 
+       if (!(vma->vm_flags & VM_SHARED))
+               return ERR_PTR(-EINVAL);
+
        if (vma->vm_end - vma->vm_start != size)
                return ERR_PTR(-EINVAL);
 
@@ -992,7 +1039,7 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
                 * at a time to get the lock ordering right. Typically there
                 * will only be one mm, so no big deal.
                 */
-               down_write(&mm->mmap_sem);
+               down_read(&mm->mmap_sem);
                mutex_lock(&ufile->umap_lock);
                list_for_each_entry_safe (priv, next_priv, &ufile->umaps,
                                          list) {
@@ -1004,10 +1051,9 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
 
                        zap_vma_ptes(vma, vma->vm_start,
                                     vma->vm_end - vma->vm_start);
-                       vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
                }
                mutex_unlock(&ufile->umap_lock);
-               up_write(&mm->mmap_sem);
+               up_read(&mm->mmap_sem);
                mmput(mm);
        }
 }