drm/gpu-sched: fix force APP kill hang(v4)
authorEmily Deng <Emily.Deng@amd.com>
Mon, 16 Apr 2018 02:07:02 +0000 (10:07 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Tue, 15 May 2018 18:43:17 +0000 (13:43 -0500)
issue:
VMC page faults occur when an application is forcibly killed during a
3dmark test. The cause is that entity_fini() manually signals all jobs
still sitting in the entity's queue, which confuses the sync/dependency
mechanism:

1) A page fault occurs in SDMA's clear job, which operates on a shadow
buffer whose GART table has already been cleared by ttm_bo_release(),
because the fence in the buffer's reservation was fake-signaled by
entity_fini() after SIGKILL was received.

2) A page fault occurs in a GFX job because, while that job is still
running, entity_fini() fake-signals all jobs from its entity. The
unmap/clear-PTE jobs that depend on those result fences are therefore
satisfied, SDMA starts clearing the PTEs, and the GFX job faults
(the pre-patch loop that did this is sketched below).
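
For reference, the pre-patch teardown loop fake-completed every queued
job on the spot. This is a condensed view of the removed lines in the
scheduler hunk below (the surrounding queue-pop loop is paraphrased
from context):

    /* Pre-patch behaviour, condensed: every job still queued on the entity
     * gets its scheduled and finished fences signaled immediately, so
     * anything waiting on those fences (e.g. SDMA PTE-clear jobs) is
     * released while the GPU may still be touching the mappings.
     */
    while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
            struct drm_sched_fence *s_fence = job->s_fence;

            drm_sched_fence_scheduled(s_fence);
            dma_fence_set_error(&s_fence->finished, -ESRCH);
            drm_sched_fence_finished(s_fence);      /* fake completion */
            dma_fence_put(&s_fence->finished);
            sched->ops->free_job(job);
    }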

fix:
1) In entity_fini(), at least wait for all already-scheduled jobs to
complete in the SIGKILL case (sketched below).

2) If a signaled fence clears some entity's dependency, mark that entity
guilty so its jobs do not really run, since the dependency was only
fake-signaled.
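
Point 1 is realized by deferring that error signaling until the fence of
the last job actually handed to the hardware (entity->last_scheduled) has
completed. Condensed from drm_sched_entity_cleanup() and
drm_sched_entity_kill_jobs_cb() in the scheduler hunk below (loop header
paraphrased from context):

    /* Post-patch, condensed: leftover jobs are error-signaled, but the
     * actual finished signal and free are chained behind the fence of the
     * last job given to the hardware, so in-flight work keeps its buffers.
     */
    while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
            struct drm_sched_fence *s_fence = job->s_fence;

            drm_sched_fence_scheduled(s_fence);
            dma_fence_set_error(&s_fence->finished, -ESRCH);
            r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb,
                                       drm_sched_entity_kill_jobs_cb);
            if (r == -ENOENT)       /* last_scheduled already signaled */
                    drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
            else if (r)
                    DRM_ERROR("fence add callback failed (%d)\n", r);
    }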

v2:
Split drm_sched_entity_fini() into two functions:
1) The first one does the waiting, removes the entity from the runqueue
and returns an error when the process was killed (sketched below).
2) The second one then goes over the entity, installs the entity's last
scheduled fence as completion signal for the remaining jobs and signals
all of them with an error code.
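
In the final version the first part looks roughly like this (condensed
from drm_sched_entity_do_release() in the hunk below; fini_status is the
v4 name):

    /* First part, condensed: unless the process is exiting on SIGKILL, wait
     * for the entity's job queue to drain, remember the outcome in
     * entity->fini_status, then detach the entity from its runqueue so no
     * further jobs are picked up.
     */
    if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
            entity->fini_status = -ERESTARTSYS;
    else
            entity->fini_status = wait_event_killable(sched->job_scheduled,
                                    drm_sched_entity_is_idle(entity));
    drm_sched_entity_set_rq(entity, NULL);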

v3:
1) Replace fini1 and fini2 with better names.
2) Call the first part before the VM teardown in
amdgpu_driver_postclose_kms() and the second part after the VM teardown
(the resulting ordering is condensed below).
3) Keep the original function drm_sched_entity_fini() as a wrapper that
calls both parts.
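
The resulting teardown order in amdgpu_driver_postclose_kms(), condensed
from the amdgpu_kms.c hunks below (unrelated calls elided):

    amdgpu_ctx_mgr_entity_fini(&fpriv->ctx_mgr); /* part 1: wait for scheduled jobs, idle entities */
    ...
    amdgpu_vm_fini(adev, &fpriv->vm);            /* VM teardown */
    amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);        /* part 2: error-signal leftovers, free contexts */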

v4:
1) Rename entity->finished to entity->last_scheduled.
2) Rename drm_sched_entity_fini_job_cb() to
drm_sched_entity_kill_jobs_cb().
3) Pass NULL to drm_sched_entity_kill_jobs_cb() if
dma_fence_add_callback() returns -ENOENT.
4) Change the type of entity->fini_status to int.
5) Remove the check on entity->finished.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Signed-off-by: Emily Deng <Emily.Deng@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
drivers/gpu/drm/scheduler/gpu_scheduler.c
include/drm/gpu_scheduler.h

index c25ee750c362b6e59088990dd0af4b2a684d0eb9..ea1b28536bfcb2d4e2c29793ca54392197c309bf 100644 (file)
@@ -681,6 +681,8 @@ int amdgpu_ctx_ioctl(struct drm_device *dev, void *data,
 int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx, unsigned ring_id);
 
 void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr);
+void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr);
+void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr);
 void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
 
 
index 09d35051fdd68689ac00d69504765166e738076f..eb80edfb1b0a7a33e16f4dbc2db5a4795727b9d7 100644 (file)
@@ -111,8 +111,9 @@ failed:
        return r;
 }
 
-static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
+static void amdgpu_ctx_fini(struct kref *ref)
 {
+       struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx, refcount);
        struct amdgpu_device *adev = ctx->adev;
        unsigned i, j;
 
@@ -125,13 +126,11 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
        kfree(ctx->fences);
        ctx->fences = NULL;
 
-       for (i = 0; i < adev->num_rings; i++)
-               drm_sched_entity_fini(&adev->rings[i]->sched,
-                                     &ctx->rings[i].entity);
-
        amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
 
        mutex_destroy(&ctx->lock);
+
+       kfree(ctx);
 }
 
 static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
@@ -170,12 +169,15 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
 static void amdgpu_ctx_do_release(struct kref *ref)
 {
        struct amdgpu_ctx *ctx;
+       u32 i;
 
        ctx = container_of(ref, struct amdgpu_ctx, refcount);
 
-       amdgpu_ctx_fini(ctx);
+       for (i = 0; i < ctx->adev->num_rings; i++)
+               drm_sched_entity_fini(&ctx->adev->rings[i]->sched,
+                       &ctx->rings[i].entity);
 
-       kfree(ctx);
+       amdgpu_ctx_fini(ref);
 }
 
 static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id)
@@ -435,16 +437,62 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr)
        idr_init(&mgr->ctx_handles);
 }
 
+void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
+{
+       struct amdgpu_ctx *ctx;
+       struct idr *idp;
+       uint32_t id, i;
+
+       idp = &mgr->ctx_handles;
+
+       idr_for_each_entry(idp, ctx, id) {
+
+               if (!ctx->adev)
+                       return;
+
+               for (i = 0; i < ctx->adev->num_rings; i++)
+                       if (kref_read(&ctx->refcount) == 1)
+                               drm_sched_entity_do_release(&ctx->adev->rings[i]->sched,
+                                                 &ctx->rings[i].entity);
+                       else
+                               DRM_ERROR("ctx %p is still alive\n", ctx);
+       }
+}
+
+void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr)
+{
+       struct amdgpu_ctx *ctx;
+       struct idr *idp;
+       uint32_t id, i;
+
+       idp = &mgr->ctx_handles;
+
+       idr_for_each_entry(idp, ctx, id) {
+
+               if (!ctx->adev)
+                       return;
+
+               for (i = 0; i < ctx->adev->num_rings; i++)
+                       if (kref_read(&ctx->refcount) == 1)
+                               drm_sched_entity_cleanup(&ctx->adev->rings[i]->sched,
+                                       &ctx->rings[i].entity);
+                       else
+                               DRM_ERROR("ctx %p is still alive\n", ctx);
+       }
+}
+
 void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)
 {
        struct amdgpu_ctx *ctx;
        struct idr *idp;
        uint32_t id;
 
+       amdgpu_ctx_mgr_entity_cleanup(mgr);
+
        idp = &mgr->ctx_handles;
 
        idr_for_each_entry(idp, ctx, id) {
-               if (kref_put(&ctx->refcount, amdgpu_ctx_do_release) != 1)
+               if (kref_put(&ctx->refcount, amdgpu_ctx_fini) != 1)
                        DRM_ERROR("ctx %p is still alive\n", ctx);
        }
 
index bd9e723dbb2b62f8c8bad4ec1add7f82a1425880..1ed379524117f33d18426bd58eb3b059e60686ee 100644 (file)
@@ -913,8 +913,7 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
                return;
 
        pm_runtime_get_sync(dev->dev);
-
-       amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
+       amdgpu_ctx_mgr_entity_fini(&fpriv->ctx_mgr);
 
        if (adev->asic_type != CHIP_RAVEN) {
                amdgpu_uvd_free_handles(adev, file_priv);
@@ -935,6 +934,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
        pd = amdgpu_bo_ref(fpriv->vm.root.base.bo);
 
        amdgpu_vm_fini(adev, &fpriv->vm);
+       amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
+
        if (pasid)
                amdgpu_pasid_free_delayed(pd->tbo.resv, pasid);
        amdgpu_bo_unref(&pd);
index 310275eaf128d32bda2994d07bd86df49f900608..44d21981bf3bc123e133de4e91883ca32fe37558 100644 (file)
@@ -136,6 +136,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
        entity->rq = rq;
        entity->sched = sched;
        entity->guilty = guilty;
+       entity->fini_status = 0;
+       entity->last_scheduled = NULL;
 
        spin_lock_init(&entity->rq_lock);
        spin_lock_init(&entity->queue_lock);
@@ -197,19 +199,30 @@ static bool drm_sched_entity_is_ready(struct drm_sched_entity *entity)
        return true;
 }
 
+static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
+                                   struct dma_fence_cb *cb)
+{
+       struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
+                                                finish_cb);
+       drm_sched_fence_finished(job->s_fence);
+       WARN_ON(job->s_fence->parent);
+       dma_fence_put(&job->s_fence->finished);
+       job->sched->ops->free_job(job);
+}
+
+
 /**
  * Destroy a context entity
  *
  * @sched       Pointer to scheduler instance
  * @entity     The pointer to a valid scheduler entity
  *
- * Cleanup and free the allocated resources.
+ * Splitting drm_sched_entity_fini() into two functions, The first one is does the waiting,
+ * removes the entity from the runqueue and returns an error when the process was killed.
  */
-void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
+void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
                           struct drm_sched_entity *entity)
 {
-       int r;
-
        if (!drm_sched_entity_is_initialized(sched, entity))
                return;
        /**
@@ -217,13 +230,28 @@ void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
         * queued IBs or discard them on SIGKILL
        */
        if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
-               r = -ERESTARTSYS;
+               entity->fini_status = -ERESTARTSYS;
        else
-               r = wait_event_killable(sched->job_scheduled,
+               entity->fini_status = wait_event_killable(sched->job_scheduled,
                                        drm_sched_entity_is_idle(entity));
        drm_sched_entity_set_rq(entity, NULL);
-       if (r) {
+}
+EXPORT_SYMBOL(drm_sched_entity_do_release);
+
+/**
+ * Destroy a context entity
+ *
+ * @sched       Pointer to scheduler instance
+ * @entity     The pointer to a valid scheduler entity
+ *
+ * The second one then goes over the entity and signals all jobs with an error code.
+ */
+void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
+                          struct drm_sched_entity *entity)
+{
+       if (entity->fini_status) {
                struct drm_sched_job *job;
+               int r;
 
                /* Park the kernel for a moment to make sure it isn't processing
                 * our enity.
@@ -241,13 +269,26 @@ void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
                        struct drm_sched_fence *s_fence = job->s_fence;
                        drm_sched_fence_scheduled(s_fence);
                        dma_fence_set_error(&s_fence->finished, -ESRCH);
-                       drm_sched_fence_finished(s_fence);
-                       WARN_ON(s_fence->parent);
-                       dma_fence_put(&s_fence->finished);
-                       sched->ops->free_job(job);
+                       r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb,
+                                                       drm_sched_entity_kill_jobs_cb);
+                       if (r == -ENOENT)
+                               drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
+                       else if (r)
+                               DRM_ERROR("fence add callback failed (%d)\n", r);
                }
+
+               dma_fence_put(entity->last_scheduled);
+               entity->last_scheduled = NULL;
        }
 }
+EXPORT_SYMBOL(drm_sched_entity_cleanup);
+
+void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
+                               struct drm_sched_entity *entity)
+{
+       drm_sched_entity_do_release(sched, entity);
+       drm_sched_entity_cleanup(sched, entity);
+}
 EXPORT_SYMBOL(drm_sched_entity_fini);
 
 static void drm_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb)
@@ -530,6 +571,10 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
                spin_unlock(&sched->job_list_lock);
                fence = sched->ops->run_job(s_job);
                atomic_inc(&sched->hw_rq_count);
+
+               dma_fence_put(s_job->entity->last_scheduled);
+               s_job->entity->last_scheduled = dma_fence_get(&s_fence->finished);
+
                if (fence) {
                        s_fence->parent = dma_fence_get(fence);
                        r = dma_fence_add_callback(fence, &s_fence->cb,
@@ -556,6 +601,7 @@ int drm_sched_job_init(struct drm_sched_job *job,
                       void *owner)
 {
        job->sched = sched;
+       job->entity = entity;
        job->s_priority = entity->rq - sched->sched_rq;
        job->s_fence = drm_sched_fence_create(entity, owner);
        if (!job->s_fence)
@@ -669,6 +715,9 @@ static int drm_sched_main(void *param)
                fence = sched->ops->run_job(sched_job);
                drm_sched_fence_scheduled(s_fence);
 
+               dma_fence_put(entity->last_scheduled);
+               entity->last_scheduled = dma_fence_get(&s_fence->finished);
+
                if (fence) {
                        s_fence->parent = dma_fence_get(fence);
                        r = dma_fence_add_callback(fence, &s_fence->cb,
index c053a32341bfe6156415f424e45516d4eb6881d6..350a62c26b296f628ec1463800c01c853b3af2e5 100644 (file)
@@ -65,6 +65,8 @@ struct drm_sched_entity {
        struct dma_fence                *dependency;
        struct dma_fence_cb             cb;
        atomic_t                        *guilty; /* points to ctx's guilty */
+       int            fini_status;
+       struct dma_fence    *last_scheduled;
 };
 
 /**
@@ -119,6 +121,7 @@ struct drm_sched_job {
        uint64_t                        id;
        atomic_t                        karma;
        enum drm_sched_priority         s_priority;
+       struct drm_sched_entity  *entity;
 };
 
 static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job,
@@ -186,6 +189,10 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
                          struct drm_sched_entity *entity,
                          struct drm_sched_rq *rq,
                          uint32_t jobs, atomic_t *guilty);
+void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
+                          struct drm_sched_entity *entity);
+void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
+                          struct drm_sched_entity *entity);
 void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
                           struct drm_sched_entity *entity);
 void drm_sched_entity_push_job(struct drm_sched_job *sched_job,