drm/amdkfd: fix cp hang in eviction
author Eric Huang <JinhuiEric.Huang@amd.com>
Tue, 9 Jul 2019 19:33:53 +0000 (15:33 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 11 Jul 2019 19:37:24 +0000 (14:37 -0500)
The CP hang occurs in the OCL conformance test only on a Supermicro
platform which has 40 cores, where the test generates 40 threads.
The root cause is a race condition on flags that are updated outside
the lock-protected area.

The fix is to move the updates of the is_evicted and is_active flags
(the latter via the init_mqd() call) into the area protected by
dqm_lock().

Signed-off-by: Eric Huang <JinhuiEric.Huang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

index 584748c23f14a68da105b563bccf83ce4011f09d..e6a4288bfaa6ab5824ca672653875c49144be803 100644 (file)
@@ -1157,12 +1157,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 
        mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
                        q->properties.type)];
-       /*
-        * Eviction state logic: mark all queues as evicted, even ones
-        * not currently active. Restoring inactive queues later only
-        * updates the is_evicted flag but is a no-op otherwise.
-        */
-       q->properties.is_evicted = !!qpd->evicted;
+
        if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
                q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
                dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
@@ -1173,9 +1168,16 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
                retval = -ENOMEM;
                goto out_deallocate_doorbell;
        }
+
+       dqm_lock(dqm);
+       /*
+        * Eviction state logic: mark all queues as evicted, even ones
+        * not currently active. Restoring inactive queues later only
+        * updates the is_evicted flag but is a no-op otherwise.
+        */
+       q->properties.is_evicted = !!qpd->evicted;
        mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
                                &q->gart_mqd_addr, &q->properties);
-       dqm_lock(dqm);
 
        list_add(&q->list, &qpd->queues_list);
        qpd->queue_count++;