drm/i915/hangcheck: Prevent long walks across full-ppgtt
author Mika Kuoppala <mika.kuoppala@linux.intel.com>
Wed, 2 Mar 2016 14:48:29 +0000 (16:48 +0200)
committer Mika Kuoppala <mika.kuoppala@intel.com>
Fri, 4 Mar 2016 13:17:14 +0000 (15:17 +0200)
With full-ppgtt, it takes the GPU an eon to traverse the entire 256PiB
address space, causing a loop to be detected. Under the current scheme,
if ACTHD walks off the end of a batch buffer and into an empty
address space, we "never" detect the hang. If we always increment the
score as the ACTHD is progressing then we will eventually time out (after
~46.5s (31 * 1.5s) without advancing onto a new batch). To counteract
this, increase the amount we reduce the score for good batches, so that
only a series of almost-bad batches triggers a full reset. DoS detection
suffers slightly but a series of long-running shader tests will benefit.

Based on a patch from Chris Wilson.

Testcase: igt/drv_hangman/hangcheck-unterminated
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: http://patchwork.freedesktop.org/patch/msgid/1456930109-21532-1-git-send-email-mika.kuoppala@intel.com
drivers/gpu/drm/i915/i915_debugfs.c
drivers/gpu/drm/i915/i915_gpu_error.c
drivers/gpu/drm/i915/i915_irq.c
drivers/gpu/drm/i915/intel_ringbuffer.h

index a0f1bd711b533910ce00a2bbe642591419c4d198..15aacd0ee66fd78ccbe60a158df6239e02d05be2 100644 (file)
@@ -1367,8 +1367,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
                seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
                           (long long)ring->hangcheck.acthd,
                           (long long)acthd[i]);
-               seq_printf(m, "\tmax ACTHD = 0x%08llx\n",
-                          (long long)ring->hangcheck.max_acthd);
                seq_printf(m, "\tscore = %d\n", ring->hangcheck.score);
                seq_printf(m, "\taction = %d\n", ring->hangcheck.action);
 
index 3b6bfbf354820b896eff2c415353e48c26988ad9..13b5f3aed01c80b929c0e2dc52bf653c2671d894 100644 (file)
@@ -230,8 +230,6 @@ static const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
                return "wait";
        case HANGCHECK_ACTIVE:
                return "active";
-       case HANGCHECK_ACTIVE_LOOP:
-               return "active (loop)";
        case HANGCHECK_KICK:
                return "kick";
        case HANGCHECK_HUNG:
index d1a46ef5ab3f4b051b42599805266cd5fbd2bc7e..53e5104964b3ae98a66f6db5af47d1fd614c8504 100644 (file)
@@ -3001,12 +3001,7 @@ head_stuck(struct intel_engine_cs *ring, u64 acthd)
                memset(ring->hangcheck.instdone, 0,
                       sizeof(ring->hangcheck.instdone));
 
-               if (acthd > ring->hangcheck.max_acthd) {
-                       ring->hangcheck.max_acthd = acthd;
-                       return HANGCHECK_ACTIVE;
-               }
-
-               return HANGCHECK_ACTIVE_LOOP;
+               return HANGCHECK_ACTIVE;
        }
 
        if (!subunits_stuck(ring))
@@ -3083,6 +3078,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 #define BUSY 1
 #define KICK 5
 #define HUNG 20
+#define ACTIVE_DECAY 15
 
        if (!i915.enable_hangcheck)
                return;
@@ -3151,9 +3147,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
                                switch (ring->hangcheck.action) {
                                case HANGCHECK_IDLE:
                                case HANGCHECK_WAIT:
-                               case HANGCHECK_ACTIVE:
                                        break;
-                               case HANGCHECK_ACTIVE_LOOP:
+                               case HANGCHECK_ACTIVE:
                                        ring->hangcheck.score += BUSY;
                                        break;
                                case HANGCHECK_KICK:
@@ -3172,10 +3167,12 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
                         * attempts across multiple batches.
                         */
                        if (ring->hangcheck.score > 0)
-                               ring->hangcheck.score--;
+                               ring->hangcheck.score -= ACTIVE_DECAY;
+                       if (ring->hangcheck.score < 0)
+                               ring->hangcheck.score = 0;
 
                        /* Clear head and subunit states on seqno movement */
-                       ring->hangcheck.acthd = ring->hangcheck.max_acthd = 0;
+                       ring->hangcheck.acthd = 0;
 
                        memset(ring->hangcheck.instdone, 0,
                               sizeof(ring->hangcheck.instdone));
index dd910d30a380570cafd0107b5a03069571af3fe2..4b1439deb7fe76180865acb15ac49a4c04d768db 100644 (file)
@@ -79,7 +79,6 @@ enum intel_ring_hangcheck_action {
        HANGCHECK_IDLE = 0,
        HANGCHECK_WAIT,
        HANGCHECK_ACTIVE,
-       HANGCHECK_ACTIVE_LOOP,
        HANGCHECK_KICK,
        HANGCHECK_HUNG,
 };
@@ -88,7 +87,6 @@ enum intel_ring_hangcheck_action {
 
 struct intel_ring_hangcheck {
        u64 acthd;
-       u64 max_acthd;
        u32 seqno;
        int score;
        enum intel_ring_hangcheck_action action;