drm/i915/execlists: Preemption!

author Chris Wilson <chris@chris-wilson.co.uk>

Tue, 3 Oct 2017 20:34:52 +0000 (21:34 +0100)

committer Chris Wilson <chris@chris-wilson.co.uk>

Wed, 4 Oct 2017 16:52:46 +0000 (17:52 +0100)
author Chris Wilson <chris@chris-wilson.co.uk>
Tue, 3 Oct 2017 20:34:52 +0000 (21:34 +0100)
committer Chris Wilson <chris@chris-wilson.co.uk>
Wed, 4 Oct 2017 16:52:46 +0000 (17:52 +0100)
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c

index 00633280570226ca255296503d1463739e1ab71d..7614880edad848d708cb9f998b6c828c14e323dd 100644 (file)
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -368,9 +368,16 @@ static int i915_getparam(struct drm_device *dev, void *data,
                 break;
         case I915_PARAM_HAS_SCHEDULER:
                 value = 0;
-               if (dev_priv->engine[RCS] && dev_priv->engine[RCS]->schedule)
+               if (dev_priv->engine[RCS] && dev_priv->engine[RCS]->schedule) {
                         value |= I915_SCHEDULER_CAP_ENABLED;
+
+                       if (INTEL_INFO(dev_priv)->has_logical_ring_preemption &&
+                           i915_modparams.enable_execlists &&
+                           !i915_modparams.enable_guc_submission)
+                               value |= I915_SCHEDULER_CAP_PREEMPTION;
+               }
                 break;
+
         case I915_PARAM_MMAP_VERSION:
                 /* Remember to bump this if the version changes! */
         case I915_PARAM_HAS_GEM:
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c

index e5997e81867365c493a320b41cbead5944e3efcf..de777139f6a1b4cb82f343180c3a95a2656e5856 100644 (file)
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1382,10 +1382,8 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
         bool tasklet = false;
  
         if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift)) {
-               if (port_count(&execlists->port[0])) {
-                       __set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
-                       tasklet = true;
-               }
+               __set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
+               tasklet = true;
         }
  
         if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c

index d3a6fe5e3c10e5bb57391bc60264bc5d78b3e896..745b6a6e018879b7089f81b44adc90884df68355 100644 (file)
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -424,6 +424,7 @@ static const struct intel_device_info intel_cherryview_info __initconst = {
  
  #define GEN9_FEATURES \
         GEN8_FEATURES, \
+       .has_logical_ring_preemption = 1, \
         .has_csr = 1, \
         .has_guc = 1, \
         .has_ipc = 1, \
@@ -477,6 +478,7 @@ static const struct intel_device_info intel_skylake_gt4_info __initconst = {
         .has_rc6 = 1, \
         .has_dp_mst = 1, \
         .has_logical_ring_contexts = 1, \
+       .has_logical_ring_preemption = 1, \
         .has_guc = 1, \
         .has_aliasing_ppgtt = 1, \
         .has_full_ppgtt = 1, \
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c

index 5821762d90078d6b6f5f53f85928ff4a7d38dc15..c5b76082d695be8f466e716ff28ec917f66775d4 100644 (file)
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -208,9 +208,9 @@
  
  /* Typical size of the average request (2 pipecontrols and a MI_BB) */
  #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
-
  #define WA_TAIL_DWORDS 2
  #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
+#define PREEMPT_ID 0x1
  
  static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
                                             struct intel_engine_cs *engine);
@@ -429,6 +429,12 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq)
         return ce->lrc_desc;
  }
  
+static inline void elsp_write(u64 desc, u32 __iomem *elsp)
+{
+       writel(upper_32_bits(desc), elsp);
+       writel(lower_32_bits(desc), elsp);
+}
+
  static void execlists_submit_ports(struct intel_engine_cs *engine)
  {
         struct execlist_port *port = engine->execlists.port;
@@ -454,8 +460,7 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
                         desc = 0;
                 }
  
-               writel(upper_32_bits(desc), elsp);
-               writel(lower_32_bits(desc), elsp);
+               elsp_write(desc, elsp);
         }
  }
  
@@ -488,26 +493,43 @@ static void port_assign(struct execlist_port *port,
         port_set(port, port_pack(i915_gem_request_get(rq), port_count(port)));
  }
  
+static void inject_preempt_context(struct intel_engine_cs *engine)
+{
+       struct intel_context *ce =
+               &engine->i915->preempt_context->engine[engine->id];
+       u32 __iomem *elsp =
+               engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+       unsigned int n;
+
+       GEM_BUG_ON(engine->i915->preempt_context->hw_id != PREEMPT_ID);
+       GEM_BUG_ON(!IS_ALIGNED(ce->ring->size, WA_TAIL_BYTES));
+
+       memset(ce->ring->vaddr + ce->ring->tail, 0, WA_TAIL_BYTES);
+       ce->ring->tail += WA_TAIL_BYTES;
+       ce->ring->tail &= (ce->ring->size - 1);
+       ce->lrc_reg_state[CTX_RING_TAIL+1] = ce->ring->tail;
+
+       for (n = execlists_num_ports(&engine->execlists); --n; )
+               elsp_write(0, elsp);
+
+       elsp_write(ce->lrc_desc, elsp);
+}
+
+static bool can_preempt(struct intel_engine_cs *engine)
+{
+       return INTEL_INFO(engine->i915)->has_logical_ring_preemption;
+}
+
  static void execlists_dequeue(struct intel_engine_cs *engine)
  {
-       struct drm_i915_gem_request *last;
         struct intel_engine_execlists * const execlists = &engine->execlists;
         struct execlist_port *port = execlists->port;
         const struct execlist_port * const last_port =
                 &execlists->port[execlists->port_mask];
+       struct drm_i915_gem_request *last = port_request(port);
         struct rb_node *rb;
         bool submit = false;
  
-       last = port_request(port);
-       if (last)
-               /* WaIdleLiteRestore:bdw,skl
-                * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
-                * as we resubmit the request. See gen8_emit_breadcrumb()
-                * for where we prepare the padding after the end of the
-                * request.
-                */
-               last->tail = last->wa_tail;
-
         /* Hardware submission is through 2 ports. Conceptually each port
          * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
          * static for a context, and unique to each, so we only execute
@@ -532,7 +554,65 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
         spin_lock_irq(&engine->timeline->lock);
         rb = execlists->first;
         GEM_BUG_ON(rb_first(&execlists->queue) != rb);
-       while (rb) {
+       if (!rb)
+               goto unlock;
+
+       if (last) {
+               /*
+                * Don't resubmit or switch until all outstanding
+                * preemptions (lite-restore) are seen. Then we
+                * know the next preemption status we see corresponds
+                * to this ELSP update.
+                */
+               if (port_count(&port[0]) > 1)
+                       goto unlock;
+
+               if (can_preempt(engine) &&
+                   rb_entry(rb, struct i915_priolist, node)->priority >
+                   max(last->priotree.priority, 0)) {
+                       /*
+                        * Switch to our empty preempt context so
+                        * the state of the GPU is known (idle).
+                        */
+                       inject_preempt_context(engine);
+                       execlists->preempt = true;
+                       goto unlock;
+               } else {
+                       /*
+                        * In theory, we could coalesce more requests onto
+                        * the second port (the first port is active, with
+                        * no preemptions pending). However, that means we
+                        * then have to deal with the possible lite-restore
+                        * of the second port (as we submit the ELSP, there
+                        * may be a context-switch) but also we may complete
+                        * the resubmission before the context-switch. Ergo,
+                        * coalescing onto the second port will cause a
+                        * preemption event, but we cannot predict whether
+                        * that will affect port[0] or port[1].
+                        *
+                        * If the second port is already active, we can wait
+                        * until the next context-switch before contemplating
+                        * new requests. The GPU will be busy and we should be
+                        * able to resubmit the new ELSP before it idles,
+                        * avoiding pipeline bubbles (momentary pauses where
+                        * the driver is unable to keep up the supply of new
+                        * work).
+                        */
+                       if (port_count(&port[1]))
+                               goto unlock;
+
+                       /* WaIdleLiteRestore:bdw,skl
+                        * Apply the wa NOOPs to prevent
+                        * ring:HEAD == req:TAIL as we resubmit the
+                        * request. See gen8_emit_breadcrumb() for
+                        * where we prepare the padding after the
+                        * end of the request.
+                        */
+                       last->tail = last->wa_tail;
+               }
+       }
+
+       do {
                 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
                 struct drm_i915_gem_request *rq, *rn;
  
@@ -595,11 +675,12 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
                 INIT_LIST_HEAD(&p->requests);
                 if (p->priority != I915_PRIORITY_NORMAL)
                         kmem_cache_free(engine->i915->priorities, p);
-       }
+       } while (rb);
  done:
         execlists->first = rb;
         if (submit)
                 port_assign(port, last);
+unlock:
         spin_unlock_irq(&engine->timeline->lock);
  
         if (submit)
@@ -680,13 +761,6 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
         spin_unlock_irqrestore(&engine->timeline->lock, flags);
  }
  
-static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
-{
-       const struct execlist_port *port = engine->execlists.port;
-
-       return port_count(&port[0]) + port_count(&port[1]) < 2;
-}
-
  /*
   * Check the unread Context Status Buffers and manage the submission of new
   * contexts to the ELSP accordingly.
@@ -695,7 +769,7 @@ static void intel_lrc_irq_handler(unsigned long data)
  {
         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
         struct intel_engine_execlists * const execlists = &engine->execlists;
-       struct execlist_port *port = execlists->port;
+       struct execlist_port * const port = execlists->port;
         struct drm_i915_private *dev_priv = engine->i915;
  
         /* We can skip acquiring intel_runtime_pm_get() here as it was taken
@@ -780,6 +854,23 @@ static void intel_lrc_irq_handler(unsigned long data)
                         if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
                                 continue;
  
+                       if (status & GEN8_CTX_STATUS_ACTIVE_IDLE &&
+                           buf[2*head + 1] == PREEMPT_ID) {
+                               execlist_cancel_port_requests(execlists);
+
+                               spin_lock_irq(&engine->timeline->lock);
+                               unwind_incomplete_requests(engine);
+                               spin_unlock_irq(&engine->timeline->lock);
+
+                               GEM_BUG_ON(!execlists->preempt);
+                               execlists->preempt = false;
+                               continue;
+                       }
+
+                       if (status & GEN8_CTX_STATUS_PREEMPTED &&
+                           execlists->preempt)
+                               continue;
+
                         /* Check the context/desc id for this event matches */
                         GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
  
@@ -811,7 +902,7 @@ static void intel_lrc_irq_handler(unsigned long data)
                 }
         }
  
-       if (execlists_elsp_ready(engine))
+       if (!execlists->preempt)
                 execlists_dequeue(engine);
  
         intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
@@ -824,7 +915,7 @@ static void insert_request(struct intel_engine_cs *engine,
         struct i915_priolist *p = lookup_priolist(engine, pt, prio);
  
         list_add_tail(&pt->link, &ptr_mask_bits(p, 1)->requests);
-       if (ptr_unmask_bits(p, 1) && execlists_elsp_ready(engine))
+       if (ptr_unmask_bits(p, 1))
                 tasklet_hi_schedule(&engine->execlists.irq_tasklet);
  }
  
@@ -954,8 +1045,6 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
         }
  
         spin_unlock_irq(&engine->timeline->lock);
-
-       /* XXX Do we need to preempt to make room for us and our deps? */
  }
  
  static struct intel_ring *
@@ -1151,6 +1240,8 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
                                        i915_ggtt_offset(engine->scratch) +
                                        2 * CACHELINE_BYTES);
  
+       *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
         /* Pad to end of cacheline */
         while ((unsigned long)batch % CACHELINE_BYTES)
                 *batch++ = MI_NOOP;
@@ -1166,6 +1257,8 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
  
  static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
  {
+       *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
  
@@ -1211,6 +1304,8 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
                 *batch++ = 0;
         }
  
+       *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
         /* Pad to end of cacheline */
         while ((unsigned long)batch % CACHELINE_BYTES)
                 *batch++ = MI_NOOP;
@@ -1364,6 +1459,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
                    GT_CONTEXT_SWITCH_INTERRUPT << engine->irq_shift);
         clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
         execlists->csb_head = -1;
+       execlists->preempt = false;
  
         /* After a GPU reset, we may have requests to replay */
         if (!i915_modparams.enable_guc_submission && execlists->first)
@@ -1659,7 +1755,8 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
   */
  static void gen8_emit_wa_tail(struct drm_i915_gem_request *request, u32 *cs)
  {
-       *cs++ = MI_NOOP;
+       /* Ensure there's always at least one preemption point per-request. */
+       *cs++ = MI_ARB_CHECK;
         *cs++ = MI_NOOP;
         request->wa_tail = intel_ring_offset(request, cs);
  }
@@ -1680,7 +1777,6 @@ static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request, u32 *cs)
  
         gen8_emit_wa_tail(request, cs);
  }
-
  static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
  
  static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
@@ -1708,7 +1804,6 @@ static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
  
         gen8_emit_wa_tail(request, cs);
  }
-
  static const int gen8_emit_breadcrumb_render_sz = 8 + WA_TAIL_DWORDS;
  
  static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h

index 56d7ae9f298b978b240224af3e6786ed1b1e4261..0fedda17488c2074db40fa81fe8566545bdcc445 100644 (file)
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -238,6 +238,11 @@ struct intel_engine_execlists {
  #define EXECLIST_MAX_PORTS 2
         } port[EXECLIST_MAX_PORTS];
  
+       /**
+        * @preempt: are we currently handling a preempting context switch?
+        */
+       bool preempt;
+
         /**
          * @port_mask: number of execlist ports - 1
          */
author	Chris Wilson <chris@chris-wilson.co.uk>
	Tue, 3 Oct 2017 20:34:52 +0000 (21:34 +0100)
committer	Chris Wilson <chris@chris-wilson.co.uk>
	Wed, 4 Oct 2017 16:52:46 +0000 (17:52 +0100)
drivers/gpu/drm/i915/i915_drv.c		patch | blob | history
drivers/gpu/drm/i915/i915_irq.c		patch | blob | history
drivers/gpu/drm/i915/i915_pci.c		patch | blob | history
drivers/gpu/drm/i915/intel_lrc.c		patch | blob | history
drivers/gpu/drm/i915/intel_ringbuffer.h		patch | blob | history