drm/amdgpu: Add a new flag to AMDGPU_CTX_OP_QUERY_STATE2
author xinhui pan <xinhui.pan@amd.com>
Mon, 17 Dec 2018 06:31:12 +0000 (14:31 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 19 Mar 2019 20:36:51 +0000 (15:36 -0500)
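Add AMDGPU_CTX_QUERY2_FLAGS_RAS_CE and AMDGPU_CTX_QUERY2_FLAGS_RAS_UE, which
indicate whether any correctable (CE) or uncorrectable (UE) RAS error occurred
between the previous query on this context and the current one.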

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
include/uapi/drm/amdgpu_drm.h

index 7b526593eb77b46050aa435e677752c66ff48512..736ed1d67ec2abf303d70c9a0e8f0bbb429723fe 100644 (file)
@@ -26,6 +26,7 @@
 #include <drm/drm_auth.h>
 #include "amdgpu.h"
 #include "amdgpu_sched.h"
+#include "amdgpu_ras.h"
 
 #define to_amdgpu_ctx_entity(e)        \
        container_of((e), struct amdgpu_ctx_entity, entity)
@@ -344,6 +345,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
 {
        struct amdgpu_ctx *ctx;
        struct amdgpu_ctx_mgr *mgr;
+       uint32_t ras_counter;
 
        if (!fpriv)
                return -EINVAL;
@@ -368,6 +370,21 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
        if (atomic_read(&ctx->guilty))
                out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
 
+       /*query ue count*/
+       ras_counter = amdgpu_ras_query_error_count(adev, false);
+       /*ras counter is monotonic increasing*/
+       if (ras_counter != ctx->ras_counter_ue) {
+               out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
+               ctx->ras_counter_ue = ras_counter;
+       }
+
+       /*query ce count*/
+       ras_counter = amdgpu_ras_query_error_count(adev, true);
+       if (ras_counter != ctx->ras_counter_ce) {
+               out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
+               ctx->ras_counter_ce = ras_counter;
+       }
+
        mutex_unlock(&mgr->lock);
        return 0;
 }
index b3b012c0a7da91a7153b905aa7d25dab1c25b876..8e561daa64cb311ba3cb7fdda90f6ca1dd2c8f3e 100644 (file)
@@ -49,6 +49,8 @@ struct amdgpu_ctx {
        enum drm_sched_priority         override_priority;
        struct mutex                    lock;
        atomic_t                        guilty;
+       uint32_t                        ras_counter_ce;
+       uint32_t                        ras_counter_ue;
 };
 
 struct amdgpu_ctx_mgr {
index e5275d4481f56be14f5de3c8584b5fc32b40a198..722598b25f37ff02adf71c29d03db9c761d3f663 100644 (file)
@@ -210,6 +210,9 @@ union drm_amdgpu_bo_list {
 #define AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST (1<<1)
 /* indicate some job from this context once cause gpu hang */
 #define AMDGPU_CTX_QUERY2_FLAGS_GUILTY   (1<<2)
+/* indicate some errors are detected by RAS */
+#define AMDGPU_CTX_QUERY2_FLAGS_RAS_CE   (1<<3)
+#define AMDGPU_CTX_QUERY2_FLAGS_RAS_UE   (1<<4)
 
 /* Context priority level */
 #define AMDGPU_CTX_PRIORITY_UNSET       -2048