.invalidate_range_end = ib_umem_notifier_invalidate_range_end,
};
-struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context,
- unsigned long addr, size_t size)
+static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp)
+{
+ struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
+ struct ib_umem *umem = &umem_odp->umem;
+
+ down_write(&per_mm->umem_rwsem);
+ if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+ rbt_ib_umem_insert(&umem_odp->interval_tree,
+ &per_mm->umem_tree);
+
+ if (likely(!atomic_read(&per_mm->notifier_count)))
+ umem_odp->mn_counters_active = true;
+ else
+ list_add(&umem_odp->no_private_counters,
+ &per_mm->no_private_counters);
+ up_write(&per_mm->umem_rwsem);
+}
+
+static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp)
+{
+ struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
+ struct ib_umem *umem = &umem_odp->umem;
+
+ down_write(&per_mm->umem_rwsem);
+ if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+ rbt_ib_umem_remove(&umem_odp->interval_tree,
+ &per_mm->umem_tree);
+ if (!umem_odp->mn_counters_active) {
+ list_del(&umem_odp->no_private_counters);
+ complete_all(&umem_odp->notifier_completion);
+ }
+
+ up_write(&per_mm->umem_rwsem);
+}
+
+static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx,
+ struct mm_struct *mm)
{
struct ib_ucontext_per_mm *per_mm;
+ int ret;
+
+ per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL);
+ if (!per_mm)
+ return ERR_PTR(-ENOMEM);
+
+ per_mm->context = ctx;
+ per_mm->mm = mm;
+ per_mm->umem_tree = RB_ROOT_CACHED;
+ init_rwsem(&per_mm->umem_rwsem);
+ INIT_LIST_HEAD(&per_mm->no_private_counters);
+
+ rcu_read_lock();
+ per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
+ rcu_read_unlock();
+
+ WARN_ON(mm != current->mm);
+
+ per_mm->mn.ops = &ib_umem_notifiers;
+ ret = mmu_notifier_register(&per_mm->mn, per_mm->mm);
+ if (ret) {
+ dev_err(&ctx->device->dev,
+ "Failed to register mmu_notifier %d\n", ret);
+ goto out_pid;
+ }
+
+ list_add(&per_mm->ucontext_list, &ctx->per_mm_list);
+ return per_mm;
+
+out_pid:
+ put_pid(per_mm->tgid);
+ kfree(per_mm);
+ return ERR_PTR(ret);
+}
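
For orientation, the fields of struct ib_ucontext_per_mm that alloc_per_mm() and the helpers above touch are roughly the following. This is a reader's sketch reconstructed from the hunks in this patch, not the authoritative definition, which lives in the ib_umem_odp header.

/* Sketch only: field order and exact types are assumptions. */
struct ib_ucontext_per_mm {
	struct ib_ucontext *context;	/* owning uverbs context */
	struct mm_struct *mm;		/* mm this tracker is bound to */
	struct pid *tgid;		/* pid of the registering group leader */

	struct rb_root_cached umem_tree; /* interval tree of ODP umems */
	struct rw_semaphore umem_rwsem;	/* protects umem_tree and counters */

	struct mmu_notifier mn;		/* registered with ib_umem_notifiers */
	atomic_t notifier_count;	/* invalidations currently in flight */
	struct list_head no_private_counters;
	unsigned int odp_mrs_count;	/* umem_odps referencing this per_mm */

	struct list_head ucontext_list;	/* entry on ctx->per_mm_list */
};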
+
+static int get_per_mm(struct ib_umem_odp *umem_odp)
+{
+ struct ib_ucontext *ctx = umem_odp->umem.context;
+ struct ib_ucontext_per_mm *per_mm;
+
+ /*
+ * Generally speaking we expect only one or two per_mm in this list,
+ * so no reason to optimize this search today.
+ */
+ mutex_lock(&ctx->per_mm_list_lock);
+ list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) {
+ if (per_mm->mm == umem_odp->umem.owning_mm)
+ goto found;
+ }
+
+ per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm);
+ if (IS_ERR(per_mm)) {
+ mutex_unlock(&ctx->per_mm_list_lock);
+ return PTR_ERR(per_mm);
+ }
+
+found:
+ umem_odp->per_mm = per_mm;
+ per_mm->odp_mrs_count++;
+ mutex_unlock(&ctx->per_mm_list_lock);
+
+ return 0;
+}
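
A minimal usage sketch for this helper (hypothetical caller, not part of the patch): get_per_mm() takes an odp_mrs_count reference on the matching, or newly allocated, per_mm, and every successful call must be balanced by put_per_mm(), exactly as ib_umem_odp_get() and ib_umem_odp_release() do further down.

/* Hypothetical caller, for illustration only. */
static int example_attach_odp(struct ib_umem_odp *umem_odp)
{
	int ret;

	ret = get_per_mm(umem_odp);	/* find or create the per_mm */
	if (ret)
		return ret;

	add_umem_to_per_mm(umem_odp);	/* make it visible to invalidations */
	return 0;
}

static void example_detach_odp(struct ib_umem_odp *umem_odp)
{
	remove_umem_from_per_mm(umem_odp);
	put_per_mm(umem_odp);		/* frees the per_mm on last release */
}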
+
+void put_per_mm(struct ib_umem_odp *umem_odp)
+{
+ struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
+ struct ib_ucontext *ctx = umem_odp->umem.context;
+ bool need_free;
+
+ mutex_lock(&ctx->per_mm_list_lock);
+ umem_odp->per_mm = NULL;
+ per_mm->odp_mrs_count--;
+ need_free = per_mm->odp_mrs_count == 0;
+ if (need_free)
+ list_del(&per_mm->ucontext_list);
+ mutex_unlock(&ctx->per_mm_list_lock);
+
+ if (!need_free)
+ return;
+
+ mmu_notifier_unregister(&per_mm->mn, per_mm->mm);
+ put_pid(per_mm->tgid);
+ kfree(per_mm);
+}
+
+struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm,
+ unsigned long addr, size_t size)
+{
+ struct ib_ucontext *ctx = per_mm->context;
struct ib_umem_odp *odp_data;
struct ib_umem *umem;
int pages = size >> PAGE_SHIFT;
	odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
	if (!odp_data)
return ERR_PTR(-ENOMEM);
umem = &odp_data->umem;
- umem->context = context;
+ umem->context = ctx;
umem->length = size;
umem->address = addr;
umem->page_shift = PAGE_SHIFT;
umem->writable = 1;
umem->is_odp = 1;
- odp_data->per_mm = per_mm = &context->per_mm;
+ odp_data->per_mm = per_mm;
mutex_init(&odp_data->umem_mutex);
init_completion(&odp_data->notifier_completion);
goto out_page_list;
}
- down_write(&per_mm->umem_rwsem);
+ /*
+ * Caller must ensure that the umem_odp that the per_mm came from
+ * cannot be freed during the call to ib_alloc_odp_umem.
+ */
+ mutex_lock(&ctx->per_mm_list_lock);
per_mm->odp_mrs_count++;
- rbt_ib_umem_insert(&odp_data->interval_tree, &per_mm->umem_tree);
- if (likely(!atomic_read(&per_mm->notifier_count)))
- odp_data->mn_counters_active = true;
- else
- list_add(&odp_data->no_private_counters,
- &per_mm->no_private_counters);
- up_write(&per_mm->umem_rwsem);
+ mutex_unlock(&ctx->per_mm_list_lock);
+ add_umem_to_per_mm(odp_data);
return odp_data;
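
The lifetime comment above can be made concrete with a hypothetical caller (names assumed, not from this patch): the per_mm handed in is borrowed from an umem_odp the caller already owns, so the per_mm cannot disappear during the call.

/* Hypothetical caller sketch; the parent umem_odp pins the per_mm. */
static struct ib_umem_odp *example_alloc_child_umem(struct ib_umem_odp *parent,
						    unsigned long addr,
						    size_t size)
{
	/*
	 * parent is held by the caller, so parent->per_mm stays valid
	 * across ib_alloc_odp_umem().
	 */
	return ib_alloc_odp_umem(parent->per_mm, addr, size);
}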
int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
{
- struct ib_ucontext *context = umem_odp->umem.context;
struct ib_umem *umem = &umem_odp->umem;
- struct ib_ucontext_per_mm *per_mm;
+ /*
+	 * NOTE: This must be called in a process context where
+	 * umem->owning_mm == current->mm
+ */
+ struct mm_struct *mm = umem->owning_mm;
int ret_val;
- struct pid *our_pid;
- struct mm_struct *mm = get_task_mm(current);
-
- if (!mm)
- return -EINVAL;
if (access & IB_ACCESS_HUGETLB) {
struct vm_area_struct *vma;
umem->hugetlb = 0;
}
- /* Prevent creating ODP MRs in child processes */
- rcu_read_lock();
- our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
- rcu_read_unlock();
- put_pid(our_pid);
- if (context->tgid != our_pid) {
- ret_val = -EINVAL;
- goto out_mm;
- }
-
mutex_init(&umem_odp->umem_mutex);
init_completion(&umem_odp->notifier_completion);
umem_odp->page_list =
vzalloc(array_size(sizeof(*umem_odp->page_list),
ib_umem_num_pages(umem)));
- if (!umem_odp->page_list) {
- ret_val = -ENOMEM;
- goto out_mm;
- }
+ if (!umem_odp->page_list)
+ return -ENOMEM;
umem_odp->dma_list =
vzalloc(array_size(sizeof(*umem_odp->dma_list),
}
}
- /*
- * When using MMU notifiers, we will get a
- * notification before the "current" task (and MM) is
- * destroyed. We use the umem_rwsem semaphore to synchronize.
- */
- umem_odp->per_mm = per_mm = &context->per_mm;
-
- down_write(&per_mm->umem_rwsem);
- per_mm->odp_mrs_count++;
- if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
- rbt_ib_umem_insert(&umem_odp->interval_tree,
- &per_mm->umem_tree);
- if (likely(!atomic_read(&per_mm->notifier_count)) ||
- per_mm->odp_mrs_count == 1)
- umem_odp->mn_counters_active = true;
- else
- list_add(&umem_odp->no_private_counters,
- &per_mm->no_private_counters);
- downgrade_write(&per_mm->umem_rwsem);
+ ret_val = get_per_mm(umem_odp);
+ if (ret_val)
+ goto out_dma_list;
+ add_umem_to_per_mm(umem_odp);
- if (per_mm->odp_mrs_count == 1) {
- /*
- * Note that at this point, no MMU notifier is running
- * for this per_mm!
- */
- atomic_set(&per_mm->notifier_count, 0);
- INIT_HLIST_NODE(&per_mm->mn.hlist);
- per_mm->mn.ops = &ib_umem_notifiers;
- ret_val = mmu_notifier_register(&per_mm->mn, mm);
- if (ret_val) {
- pr_err("Failed to register mmu_notifier %d\n", ret_val);
- ret_val = -EBUSY;
- goto out_mutex;
- }
- }
-
- up_read(&per_mm->umem_rwsem);
-
- /*
- * Note that doing an mmput can cause a notifier for the relevant mm.
- * If the notifier is called while we hold the umem_rwsem, this will
- * cause a deadlock. Therefore, we release the reference only after we
- * released the semaphore.
- */
- mmput(mm);
return 0;
-out_mutex:
- up_read(&per_mm->umem_rwsem);
+out_dma_list:
vfree(umem_odp->dma_list);
out_page_list:
vfree(umem_odp->page_list);
-out_mm:
- mmput(mm);
return ret_val;
}
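
To make the NOTE at the top of ib_umem_odp_get() concrete: the function is assumed to run on the registration path of the owning process, so owning_mm is simply current's mm. A defensive caller could check that explicitly; the following is an illustrative sketch, not part of the patch.

/* Illustrative guard only; the wrapper name is an assumption. */
static int example_odp_get_checked(struct ib_umem_odp *umem_odp, int access)
{
	/* The contract: we must be in the process that owns the mm. */
	if (WARN_ON_ONCE(umem_odp->umem.owning_mm != current->mm))
		return -EINVAL;

	return ib_umem_odp_get(umem_odp, access);
}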
void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
struct ib_umem *umem = &umem_odp->umem;
- struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
/*
* Ensure that no more pages are mapped in the umem.
ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem),
ib_umem_end(umem));
- down_write(&per_mm->umem_rwsem);
- if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
- rbt_ib_umem_remove(&umem_odp->interval_tree,
- &per_mm->umem_tree);
- per_mm->odp_mrs_count--;
- if (!umem_odp->mn_counters_active) {
- list_del(&umem_odp->no_private_counters);
- complete_all(&umem_odp->notifier_completion);
- }
-
- /*
- * Downgrade the lock to a read lock. This ensures that the notifiers
- * (who lock the mutex for reading) will be able to finish, and we
- * will be able to enventually obtain the mmu notifiers SRCU. Note
- * that since we are doing it atomically, no other user could register
- * and unregister while we do the check.
- */
- downgrade_write(&per_mm->umem_rwsem);
- if (!per_mm->odp_mrs_count) {
- struct task_struct *owning_process = NULL;
- struct mm_struct *owning_mm = NULL;
-
- owning_process =
- get_pid_task(umem_odp->umem.context->tgid, PIDTYPE_PID);
- if (owning_process == NULL)
- /*
- * The process is already dead, notifier were removed
- * already.
- */
- goto out;
-
- owning_mm = get_task_mm(owning_process);
- if (owning_mm == NULL)
- /*
- * The process' mm is already dead, notifier were
- * removed already.
- */
- goto out_put_task;
- mmu_notifier_unregister(&per_mm->mn, owning_mm);
-
- mmput(owning_mm);
-
-out_put_task:
- put_task_struct(owning_process);
- }
-out:
- up_read(&per_mm->umem_rwsem);
-
+ remove_umem_from_per_mm(umem_odp);
+ put_per_mm(umem_odp);
vfree(umem_odp->dma_list);
vfree(umem_odp->page_list);
}
{
struct ib_umem *umem = &umem_odp->umem;
struct task_struct *owning_process = NULL;
- struct mm_struct *owning_mm = NULL;
+ struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
struct page **local_page_list = NULL;
u64 page_mask, off;
int j, k, ret = 0, start_idx, npages = 0, page_shift;
user_virt = user_virt & page_mask;
bcnt += off; /* Charge for the first page offset as well. */
- owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID);
- if (owning_process == NULL) {
+ /*
+	 * owning_process is allowed to be NULL; this means the mm somehow
+	 * outlived the originating process. Presumably mmget_not_zero() will
+	 * fail in this case.
+ */
+ owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID);
+ if (WARN_ON(!mmget_not_zero(umem_odp->umem.owning_mm))) {
ret = -EINVAL;
- goto out_no_task;
- }
-
- owning_mm = get_task_mm(owning_process);
- if (owning_mm == NULL) {
- ret = -ENOENT;
goto out_put_task;
}
mmput(owning_mm);
out_put_task:
- put_task_struct(owning_process);
-out_no_task:
+ if (owning_process)
+ put_task_struct(owning_process);
free_page((unsigned long)local_page_list);
return ret;
}
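
The comment about mmget_not_zero() reflects a general pattern: before touching the owning mm (the page-pinning code in the middle of this hunk is elided), a temporary reference is taken and later dropped with mmput(). A standalone sketch of that pattern, under the assumption that the mm may already be exiting:

/* Pattern sketch only; the helper name and error code are assumptions. */
static int example_use_owning_mm(struct ib_umem_odp *umem_odp)
{
	struct mm_struct *mm = umem_odp->umem.owning_mm;

	if (!mmget_not_zero(mm))
		return -EINVAL;	/* mm already torn down; nothing to map */

	/* ... mm can now be used, e.g. for get_user_pages_remote() ... */

	mmput(mm);
	return 0;
}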