drm/amdgpu: optionally do a writeback but don't invalidate TC for IB fences
author Marek Olšák <marek.olsak@amd.com>
Tue, 3 Apr 2018 17:05:03 +0000 (13:05 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 15 May 2018 18:43:32 +0000 (13:43 -0500)
There is a new IB flag that enables this new behavior.
Full invalidation is unnecessary for RELEASE_MEM and doesn't make sense
when draw calls from two adjacent gfx IBs run in parallel. This will be
the new default for Mesa.

v2: bump the version

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
drivers/gpu/drm/amd/amdgpu/soc15d.h
include/uapi/drm/amdgpu_drm.h

index 5c0567ad1ba71bc82b4b0ddbff6af773f1f5cd81..7c17a0bc2cd2af27f0def8cb9ce47542467c00ee 100644 (file)
  * - 3.23.0 - Add query for VRAM lost counter
  * - 3.24.0 - Add high priority compute support for gfx9
  * - 3.25.0 - Add support for sensor query info (stable pstate sclk/mclk).
+ * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE.
  */
 #define KMS_DRIVER_MAJOR       3
-#define KMS_DRIVER_MINOR       25
+#define KMS_DRIVER_MINOR       26
 #define KMS_DRIVER_PATCHLEVEL  0
 
 int amdgpu_vram_limit = 0;
index 97449e06a242463dc7a71ca65d63661df2dcde97..d09fcab2398ff05ec58772e364fb0e6b19d2bede 100644 (file)
@@ -131,7 +131,8 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
  * Emits a fence command on the requested ring (all asics).
  * Returns 0 on success, -ENOMEM on failure.
  */
-int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f)
+int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
+                     unsigned flags)
 {
        struct amdgpu_device *adev = ring->adev;
        struct amdgpu_fence *fence;
@@ -149,7 +150,7 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f)
                       adev->fence_context + ring->idx,
                       seq);
        amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
-                              seq, AMDGPU_FENCE_FLAG_INT);
+                              seq, flags | AMDGPU_FENCE_FLAG_INT);
 
        ptr = &ring->fence_drv.fences[seq & ring->fence_drv.num_fences_mask];
        /* This function can't be called concurrently anyway, otherwise
index 311589e02d1755f2b0291c95684b0423ff18173e..f70eeed9ed76fa893dabe2218c4c85c4b4aec104 100644 (file)
@@ -127,6 +127,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        struct amdgpu_vm *vm;
        uint64_t fence_ctx;
        uint32_t status = 0, alloc_size;
+       unsigned fence_flags = 0;
 
        unsigned i;
        int r = 0;
@@ -227,7 +228,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 #endif
                amdgpu_asic_invalidate_hdp(adev, ring);
 
-       r = amdgpu_fence_emit(ring, f);
+       if (ib->flags & AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE)
+               fence_flags |= AMDGPU_FENCE_FLAG_TC_WB_ONLY;
+
+       r = amdgpu_fence_emit(ring, f, fence_flags);
        if (r) {
                dev_err(adev->dev, "failed to emit fence (%d)\n", r);
                if (job && job->vmid)
@@ -242,7 +246,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        /* wrap the last IB with fence */
        if (job && job->uf_addr) {
                amdgpu_ring_emit_fence(ring, job->uf_addr, job->uf_sequence,
-                                      AMDGPU_FENCE_FLAG_64BIT);
+                                      fence_flags | AMDGPU_FENCE_FLAG_64BIT);
        }
 
        if (patch_offset != ~0 && ring->funcs->patch_cond_exec)
index 08fcdf6f7b5311a9d087d0da69f1eb7adf98588d..4f8dac2d36a5f93e548c9b2a165193bf86fb49b0 100644 (file)
@@ -42,6 +42,7 @@
 
 #define AMDGPU_FENCE_FLAG_64BIT         (1 << 0)
 #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
+#define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
 
 enum amdgpu_ring_type {
        AMDGPU_RING_TYPE_GFX,
@@ -90,7 +91,8 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
                                   unsigned irq_type);
 void amdgpu_fence_driver_suspend(struct amdgpu_device *adev);
 void amdgpu_fence_driver_resume(struct amdgpu_device *adev);
-int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence);
+int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
+                     unsigned flags);
 int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s);
 void amdgpu_fence_process(struct amdgpu_ring *ring);
 int amdgpu_fence_wait_empty(struct amdgpu_ring *ring);
index 9ec7c1041df2e9b465a2584d6455692ac18bbef2..9c2195a2896d161e7032b5ca91ce5c2ea311477d 100644 (file)
@@ -633,7 +633,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_
                amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
 
        if (vm_flush_needed || pasid_mapping_needed) {
-               r = amdgpu_fence_emit(ring, &fence);
+               r = amdgpu_fence_emit(ring, &fence, 0);
                if (r)
                        return r;
        }
index 6a19e0311a9c471193b65bbed052f9c449a24eaf..05b2d34110b76e634db3fbc6c8d5f801bf690443 100644 (file)
@@ -3775,13 +3775,16 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
 {
        bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
        bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
+       bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
 
        /* RELEASE_MEM - flush caches, send int */
        amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
-       amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
-                                EOP_TC_ACTION_EN |
-                                EOP_TC_WB_ACTION_EN |
-                                EOP_TC_MD_ACTION_EN |
+       amdgpu_ring_write(ring, ((writeback ? (EOP_TC_WB_ACTION_EN |
+                                              EOP_TC_NC_ACTION_EN) :
+                                             (EOP_TCL1_ACTION_EN |
+                                              EOP_TC_ACTION_EN |
+                                              EOP_TC_WB_ACTION_EN |
+                                              EOP_TC_MD_ACTION_EN)) |
                                 EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
                                 EVENT_INDEX(5)));
        amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
index 7f408f85fdb631ee7bccda52ec444a7ec87942f9..839a144c1645e20066fe1095df962cfa23b44c7b 100644 (file)
 #define                EOP_TC_WB_ACTION_EN                     (1 << 15) /* L2 */
 #define                EOP_TCL1_ACTION_EN                      (1 << 16)
 #define                EOP_TC_ACTION_EN                        (1 << 17) /* L2 */
+#define                EOP_TC_NC_ACTION_EN                     (1 << 19)
 #define                EOP_TC_MD_ACTION_EN                     (1 << 21) /* L2 metadata */
 
 #define                DATA_SEL(x)                             ((x) << 29)
index b193e95f1f24adec5c5384863e36f29d020c4e58..78fe828f2f79cb4f864181739ee79463197e2963 100644 (file)
@@ -526,6 +526,10 @@ union drm_amdgpu_cs {
 /* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */
 #define AMDGPU_IB_FLAG_PREEMPT (1<<2)
 
+/* The IB fence should do the L2 writeback but not invalidate any shader
+ * caches (L2/vL1/sL1/I$). */
+#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
+
 struct drm_amdgpu_cs_chunk_ib {
        __u32 _pad;
        /** AMDGPU_IB_FLAG_* */