RDMA/ucontext: Add a core API for mmaping driver IO memory
authorJason Gunthorpe <jgg@mellanox.com>
Sun, 16 Sep 2018 17:43:08 +0000 (20:43 +0300)
committerDoug Ledford <dledford@redhat.com>
Thu, 20 Sep 2018 20:19:30 +0000 (16:19 -0400)
To support disassociation and PCI hot unplug, we have to track all the
VMAs that refer to the device IO memory. When disassociation occurs the
VMAs have to be revised to point to the zero page, not the IO memory, to
allow the physical HW to be unplugged.

The three drivers supporting this implemented three different versions
of this algorithm, all leaving something to be desired. This new common
implementation has a few differences from the driver versions:

- Track all VMAs, including splitting/truncating/etc. Tie the lifetime of
  the private data allocation to the lifetime of the vma. This avoids any
  tricks with setting vm_ops which Linus didn't like. (see link)
- Support multiple mms, and support properly tracking mmaps triggered by
  processes other than the one first opening the uverbs fd. This makes
  fork behavior of disassociation enabled drivers the same as fork support
  in normal drivers.
- Don't use crazy get_task stuff.
- Simplify the approach to racing between vm_ops close and
  disassociation, fixing the related bugs most of the driver
  implementations had. Since we are in core code the tracking list can be
  placed in struct ib_uverbs_ufile, which has a lifetime strictly longer
  than any VMAs created by mmap on the uverbs FD.

Link: https://www.spinics.net/lists/stable/msg248747.html
Link: https://lkml.kernel.org/r/CA+55aFxJTV_g46AQPoPXen-UPiqR1HGMZictt7VpC-SMFbm3Cw@mail.gmail.com
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
drivers/infiniband/core/rdma_core.c
drivers/infiniband/core/rdma_core.h
drivers/infiniband/core/uverbs.h
drivers/infiniband/core/uverbs_main.c
include/rdma/ib_verbs.h

index c4118bcd5103565e3b20b6a970e67322061d195f..06d31fe5667740042f0c6b6f0d23e724e7fcfda9 100644 (file)
@@ -842,8 +842,10 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
        struct ib_ucontext *ucontext = ufile->ucontext;
        int ret;
 
-       if (reason == RDMA_REMOVE_DRIVER_REMOVE)
+       if (reason == RDMA_REMOVE_DRIVER_REMOVE) {
+               uverbs_user_mmap_disassociate(ufile);
                ufile_disassociate_ucontext(ucontext);
+       }
 
        put_pid(ucontext->tgid);
        ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device,
index f962f2a593bafcbb794cf5857575ed893b68539a..4886d2bba7c7f44475c412ac45e954560dbe7410 100644 (file)
@@ -160,5 +160,6 @@ void uverbs_disassociate_api(struct uverbs_api *uapi);
 void uverbs_destroy_api(struct uverbs_api *uapi);
 void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm,
                              unsigned int num_attrs);
+void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile);
 
 #endif /* RDMA_CORE_H */
index 24369eb66c67903a5048df830ca5827602b253ce..c97935a0c7c6ef2f9beb9c011bdd840dcbdcc27f 100644 (file)
@@ -158,6 +158,9 @@ struct ib_uverbs_file {
        spinlock_t              uobjects_lock;
        struct list_head        uobjects;
 
+       struct mutex umap_lock;
+       struct list_head umaps;
+
        u64 uverbs_cmd_mask;
        u64 uverbs_ex_cmd_mask;
 
index db6de915766878bc5263b1e0c70bb2f484784183..8d56773aac564e1fada6a1594e29f4d9d59cc44b 100644 (file)
@@ -45,6 +45,7 @@
 #include <linux/cdev.h>
 #include <linux/anon_inodes.h>
 #include <linux/slab.h>
+#include <linux/sched/mm.h>
 
 #include <linux/uaccess.h>
 
@@ -811,6 +812,226 @@ out:
        return ret;
 }
 
+/*
+ * Each time we map IO memory into user space this keeps track of the mapping.
+ * When the device is hot-unplugged we 'zap' the mmaps in user space to point
+ * to the zero page and allow the hot unplug to proceed.
+ *
+ * This is necessary for cases like PCI physical hot unplug as the actual BAR
+ * memory may vanish after this and access to it from userspace could MCE.
+ *
+ * RDMA drivers supporting disassociation must have their user space designed
+ * to cope in some way with their IO pages going to the zero page.
+ */
+/* Per-VMA tracking node; its lifetime is tied to the vma via vm_private_data */
+struct rdma_umap_priv {
+       struct vm_area_struct *vma;     /* the vma this node tracks */
+       struct list_head list;          /* entry on ufile->umaps, under umap_lock */
+};
+
+static const struct vm_operations_struct rdma_umap_ops;
+
+/*
+ * Attach tracking state to @vma and link it onto the ufile's umaps list so
+ * uverbs_user_mmap_disassociate() can later find and zap the mapping.
+ */
+static void rdma_umap_priv_init(struct rdma_umap_priv *priv,
+                               struct vm_area_struct *vma)
+{
+       struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+
+       priv->vma = vma;
+       vma->vm_private_data = priv;
+       vma->vm_ops = &rdma_umap_ops;
+
+       mutex_lock(&ufile->umap_lock);
+       list_add(&priv->list, &ufile->umaps);
+       mutex_unlock(&ufile->umap_lock);
+}
+
+/*
+ * The VMA has been dup'd, initialize the vm_private_data with a new tracking
+ * struct
+ */
+static void rdma_umap_open(struct vm_area_struct *vma)
+{
+       struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+       struct rdma_umap_priv *opriv = vma->vm_private_data;
+       struct rdma_umap_priv *priv;
+
+       /* A vma that was already zapped carries no private data (see out_zap) */
+       if (!opriv)
+               return;
+
+       /* We are racing with disassociation */
+       if (!down_read_trylock(&ufile->hw_destroy_rwsem))
+               goto out_zap;
+       /*
+        * Disassociation already completed, the VMA should already be zapped.
+        */
+       if (!ufile->ucontext)
+               goto out_unlock;
+
+       /*
+        * NOTE(review): this vm_ops callback runs with the new mm's mmap_sem
+        * held (see the lock-ordering comment in
+        * uverbs_user_mmap_disassociate), so umap_lock taken inside
+        * rdma_umap_priv_init() nests under mmap_sem here.
+        */
+       priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               goto out_unlock;
+       rdma_umap_priv_init(priv, vma);
+
+       up_read(&ufile->hw_destroy_rwsem);
+       return;
+
+out_unlock:
+       up_read(&ufile->hw_destroy_rwsem);
+out_zap:
+       /*
+        * We can't allow the VMA to be created with the actual IO pages, that
+        * would break our API contract, and it can't be stopped at this
+        * point, so zap it.
+        */
+       vma->vm_private_data = NULL;
+       zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+}
+
+/* Tear down the tracking state attached by rdma_umap_priv_init() */
+static void rdma_umap_close(struct vm_area_struct *vma)
+{
+       struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+       struct rdma_umap_priv *priv = vma->vm_private_data;
+
+       /* Nothing to do for a vma that was zapped or never tracked */
+       if (!priv)
+               return;
+
+       /*
+        * The vma holds a reference on the struct file that created it, which
+        * in turn means that the ib_uverbs_file is guaranteed to exist at
+        * this point.
+        */
+       mutex_lock(&ufile->umap_lock);
+       list_del(&priv->list);
+       mutex_unlock(&ufile->umap_lock);
+       kfree(priv);
+}
+
+static const struct vm_operations_struct rdma_umap_ops = {
+       .open = rdma_umap_open,
+       .close = rdma_umap_close,
+};
+
+/*
+ * Validate a driver mmap request and allocate the tracking node for it.
+ *
+ * Returns the node on success or an ERR_PTR on failure.  On success the
+ * caller must either hand the node to rdma_umap_priv_init() once the
+ * mapping is established, or kfree() it if the mapping fails.
+ */
+static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext,
+                                                struct vm_area_struct *vma,
+                                                unsigned long size)
+{
+       struct ib_uverbs_file *ufile = ucontext->ufile;
+       struct rdma_umap_priv *priv;
+
+       /* The user's requested vma length must match the driver's size */
+       if (vma->vm_end - vma->vm_start != size)
+               return ERR_PTR(-EINVAL);
+
+       /* Driver is using this wrong, must be called by ib_uverbs_mmap */
+       if (WARN_ON(!vma->vm_file ||
+                   vma->vm_file->private_data != ufile))
+               return ERR_PTR(-EINVAL);
+       lockdep_assert_held(&ufile->device->disassociate_srcu);
+
+       priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return ERR_PTR(-ENOMEM);
+       return priv;
+}
+
+/*
+ * Map IO memory into a process. This is to be called by drivers as part of
+ * their mmap() functions if they wish to send something like PCI-E BAR memory
+ * to userspace.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
+                     unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+       struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size);
+
+       if (IS_ERR(priv))
+               return PTR_ERR(priv);
+
+       vma->vm_page_prot = prot;
+       if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) {
+               kfree(priv);
+               return -EAGAIN;
+       }
+
+       /* Only start tracking the vma once the mapping is actually in place */
+       rdma_umap_priv_init(priv, vma);
+       return 0;
+}
+EXPORT_SYMBOL(rdma_user_mmap_io);
+
+/*
+ * The page case is here for a slightly different reason, the driver expects
+ * to be able to free the page it is sharing to user space when it destroys
+ * its ucontext, which means we need to zap the user space references.
+ *
+ * We could handle this differently by providing an API to allocate a shared
+ * page and then only freeing the shared page when the last ufile is
+ * destroyed.
+ *
+ * Unlike rdma_user_mmap_io() this uses the caller-provided
+ * vma->vm_page_prot rather than taking an explicit pgprot_t.
+ * Returns 0 on success or a negative errno.
+ */
+int rdma_user_mmap_page(struct ib_ucontext *ucontext,
+                       struct vm_area_struct *vma, struct page *page,
+                       unsigned long size)
+{
+       struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size);
+
+       if (IS_ERR(priv))
+               return PTR_ERR(priv);
+
+       if (remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size,
+                           vma->vm_page_prot)) {
+               kfree(priv);
+               return -EAGAIN;
+       }
+
+       /* Only start tracking the vma once the mapping is actually in place */
+       rdma_umap_priv_init(priv, vma);
+       return 0;
+}
+EXPORT_SYMBOL(rdma_user_mmap_page);
+
+/*
+ * Zap every mmap tracked on @ufile so the underlying IO memory can be
+ * released; userspace is left pointing at the zero page (see the comment
+ * above struct rdma_umap_priv).  Caller must hold hw_destroy_rwsem.
+ */
+void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
+{
+       struct rdma_umap_priv *priv, *next_priv;
+
+       lockdep_assert_held(&ufile->hw_destroy_rwsem);
+
+       while (1) {
+               struct mm_struct *mm = NULL;
+
+               /* Get an arbitrary mm pointer that hasn't been cleaned yet */
+               mutex_lock(&ufile->umap_lock);
+               while (!list_empty(&ufile->umaps)) {
+                       priv = list_first_entry(&ufile->umaps,
+                                               struct rdma_umap_priv, list);
+                       mm = priv->vma->vm_mm;
+                       /*
+                        * The owning process may already be exiting, with
+                        * mm_users at zero; exit_mmap() will run
+                        * rdma_umap_close() and unlink the entry itself.  A
+                        * plain mmget() would resurrect a dying mm, so use
+                        * mmget_not_zero() and skip such entries.
+                        */
+                       if (mmget_not_zero(mm))
+                               break;
+                       list_del_init(&priv->list);
+                       mm = NULL;
+               }
+               mutex_unlock(&ufile->umap_lock);
+               if (!mm)
+                       return;
+
+               /*
+                * The umap_lock is nested under mmap_sem since it used within
+                * the vma_ops callbacks, so we have to clean the list one mm
+                * at a time to get the lock ordering right. Typically there
+                * will only be one mm, so no big deal.
+                */
+               down_write(&mm->mmap_sem);
+               mutex_lock(&ufile->umap_lock);
+               list_for_each_entry_safe (priv, next_priv, &ufile->umaps,
+                                         list) {
+                       struct vm_area_struct *vma = priv->vma;
+
+                       /* Entries for other mms get their own outer-loop pass */
+                       if (vma->vm_mm != mm)
+                               continue;
+                       list_del_init(&priv->list);
+
+                       zap_vma_ptes(vma, vma->vm_start,
+                                    vma->vm_end - vma->vm_start);
+                       vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
+               }
+               mutex_unlock(&ufile->umap_lock);
+               up_write(&mm->mmap_sem);
+               mmput(mm);
+       }
+}
+
 /*
  * ib_uverbs_open() does not need the BKL:
  *
@@ -872,6 +1093,8 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
        spin_lock_init(&file->uobjects_lock);
        INIT_LIST_HEAD(&file->uobjects);
        init_rwsem(&file->hw_destroy_rwsem);
+       mutex_init(&file->umap_lock);
+       INIT_LIST_HEAD(&file->umaps);
 
        filp->private_data = file;
        list_add_tail(&file->list, &dev->uverbs_file_list);
index e463d3007a356e9de7c99ab0cdc44bae52308632..a66238d8a2a3db92c1bbb7e8f25143cc570e460e 100644 (file)
@@ -2646,6 +2646,28 @@ void *ib_get_client_data(struct ib_device *device, struct ib_client *client);
 void  ib_set_client_data(struct ib_device *device, struct ib_client *client,
                         void *data);
 
+/*
+ * Core helpers for drivers mapping IO memory / pages to user space
+ * (implemented in uverbs_main.c).  The stubs below fail with -EINVAL when
+ * userspace verbs support is compiled out.
+ */
+#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
+int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
+                     unsigned long pfn, unsigned long size, pgprot_t prot);
+int rdma_user_mmap_page(struct ib_ucontext *ucontext,
+                       struct vm_area_struct *vma, struct page *page,
+                       unsigned long size);
+#else
+static inline int rdma_user_mmap_io(struct ib_ucontext *ucontext,
+                                   struct vm_area_struct *vma,
+                                   unsigned long pfn, unsigned long size,
+                                   pgprot_t prot)
+{
+       return -EINVAL;
+}
+static inline int rdma_user_mmap_page(struct ib_ucontext *ucontext,
+                               struct vm_area_struct *vma, struct page *page,
+                               unsigned long size)
+{
+       return -EINVAL;
+}
+#endif
+
 static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)
 {
        return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0;