drm/amdgpu: use ring/hash for fault handling on GMC9 v3

author Christian König <christian.koenig@amd.com>

Wed, 7 Nov 2018 12:55:01 +0000 (13:55 +0100)

committer Alex Deucher <alexander.deucher@amd.com>

Tue, 19 Mar 2019 20:36:58 +0000 (15:36 -0500)
author Christian König <christian.koenig@amd.com>
Wed, 7 Nov 2018 12:55:01 +0000 (13:55 +0100)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 19 Mar 2019 20:36:58 +0000 (15:36 -0500)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c

index 5a32a0d2ad31ad6fdb894d233db3ec11c201745c..250d9212cc389e4830b08b8ed0eb1e937d21398d 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -240,3 +240,58 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc)
         dev_info(adev->dev, "AGP: %lluM 0x%016llX - 0x%016llX\n",
                         mc->agp_size >> 20, mc->agp_start, mc->agp_end);
  }
+
+/**
+ * amdgpu_gmc_filter_faults - filter VM faults
+ *
+ * @adev: amdgpu device structure
+ * @addr: address of the VM fault
+ * @pasid: PASID of the process causing the fault
+ * @timestamp: timestamp of the fault
+ *
+ * Returns:
+ * True if the fault was filtered and should not be processed further.
+ * False if the fault is a new one and needs to be handled.
+ */
+bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
+                             uint16_t pasid, uint64_t timestamp)
+{
+       struct amdgpu_gmc *gmc = &adev->gmc;
+
+       uint64_t stamp, key = addr << 4 | pasid;
+       struct amdgpu_gmc_fault *fault;
+       uint32_t hash;
+
+       /* If we don't have space left in the ring buffer return immediately */
+       stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
+               AMDGPU_GMC_FAULT_TIMEOUT;
+       if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp)
+               return true;
+
+       /* Try to find the fault in the hash */
+       hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
+       fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
+       while (fault->timestamp >= stamp) {
+               uint64_t tmp;
+
+               if (fault->key == key)
+                       return true;
+
+               tmp = fault->timestamp;
+               fault = &gmc->fault_ring[fault->next];
+
+               /* Check if the entry was reused */
+               if (fault->timestamp >= tmp)
+                       break;
+       }
+
+       /* Add the fault to the ring */
+       fault = &gmc->fault_ring[gmc->last_fault];
+       fault->key = key;
+       fault->timestamp = timestamp;
+
+       /* And update the hash */
+       fault->next = gmc->fault_hash[hash].idx;
+       gmc->fault_hash[hash].idx = gmc->last_fault++;
+       return false;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h

index 6ce45664ff872e51652a58651c57dc428936c218..071145ac67b56c2c05c83bc300f07075332c294a 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -43,8 +43,34 @@
   */
  #define AMDGPU_GMC_HOLE_MASK   0x0000ffffffffffffULL
  
+/*
+ * Ring size as power of two for the log of recent faults.
+ */
+#define AMDGPU_GMC_FAULT_RING_ORDER    8
+#define AMDGPU_GMC_FAULT_RING_SIZE     (1 << AMDGPU_GMC_FAULT_RING_ORDER)
+
+/*
+ * Hash size as power of two for the log of recent faults
+ */
+#define AMDGPU_GMC_FAULT_HASH_ORDER    8
+#define AMDGPU_GMC_FAULT_HASH_SIZE     (1 << AMDGPU_GMC_FAULT_HASH_ORDER)
+
+/*
+ * Number of IH timestamp ticks until a fault is considered handled
+ */
+#define AMDGPU_GMC_FAULT_TIMEOUT       5000ULL
+
  struct firmware;
  
+/*
+ * GMC page fault information
+ */
+struct amdgpu_gmc_fault {
+       uint64_t        timestamp;
+       uint64_t        next:AMDGPU_GMC_FAULT_RING_ORDER;
+       uint64_t        key:52;
+};
+
  /*
   * VMHUB structures, functions & helpers
   */
@@ -141,6 +167,12 @@ struct amdgpu_gmc {
         struct kfd_vm_fault_info *vm_fault_info;
         atomic_t                vm_fault_info_updated;
  
+       struct amdgpu_gmc_fault fault_ring[AMDGPU_GMC_FAULT_RING_SIZE];
+       struct {
+               uint64_t        idx:AMDGPU_GMC_FAULT_RING_ORDER;
+       } fault_hash[AMDGPU_GMC_FAULT_HASH_SIZE];
+       uint64_t                last_fault:AMDGPU_GMC_FAULT_RING_ORDER;
+
         const struct amdgpu_gmc_funcs   *gmc_funcs;
  
         struct amdgpu_xgmi xgmi;
@@ -195,5 +227,7 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev,
                               struct amdgpu_gmc *mc);
  void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
                              struct amdgpu_gmc *mc);
+bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
+                             uint16_t pasid, uint64_t timestamp);
  
  #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

index d373e8b6d5419bd11483ab67857780402023bcdf..84a4b31246ce7f89b9c7ac997e66d36a7cab00e7 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -301,62 +301,6 @@ static int gmc_v9_0_vm_fault_interrupt_state(struct amdgpu_device *adev,
         return 0;
  }
  
-/**
- * vega10_ih_prescreen_iv - prescreen an interrupt vector
- *
- * @adev: amdgpu_device pointer
- *
- * Returns true if the interrupt vector should be further processed.
- */
-static bool gmc_v9_0_prescreen_iv(struct amdgpu_device *adev,
-                                 struct amdgpu_iv_entry *entry,
-                                 uint64_t addr)
-{
-       struct amdgpu_vm *vm;
-       u64 key;
-       int r;
-
-       /* No PASID, can't identify faulting process */
-       if (!entry->pasid)
-               return true;
-
-       /* Not a retry fault */
-       if (!(entry->src_data[1] & 0x80))
-               return true;
-
-       /* Track retry faults in per-VM fault FIFO. */
-       spin_lock(&adev->vm_manager.pasid_lock);
-       vm = idr_find(&adev->vm_manager.pasid_idr, entry->pasid);
-       if (!vm) {
-               /* VM not found, process it normally */
-               spin_unlock(&adev->vm_manager.pasid_lock);
-               return true;
-       }
-
-       key = AMDGPU_VM_FAULT(entry->pasid, addr);
-       r = amdgpu_vm_add_fault(vm->fault_hash, key);
-
-       /* Hash table is full or the fault is already being processed,
-        * ignore further page faults
-        */
-       if (r != 0) {
-               spin_unlock(&adev->vm_manager.pasid_lock);
-               return false;
-       }
-       /* No locking required with single writer and single reader */
-       r = kfifo_put(&vm->faults, key);
-       if (!r) {
-               /* FIFO is full. Ignore it until there is space */
-               amdgpu_vm_clear_fault(vm->fault_hash, key);
-               spin_unlock(&adev->vm_manager.pasid_lock);
-               return false;
-       }
-
-       spin_unlock(&adev->vm_manager.pasid_lock);
-       /* It's the first fault for this address, process it normally */
-       return true;
-}
-
  static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
                                 struct amdgpu_irq_src *source,
                                 struct amdgpu_iv_entry *entry)
@@ -369,9 +313,11 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
         addr = (u64)entry->src_data[0] << 12;
         addr |= ((u64)entry->src_data[1] & 0xf) << 44;
  
-       if (!gmc_v9_0_prescreen_iv(adev, entry, addr))
+       if (retry_fault && amdgpu_gmc_filter_faults(adev, addr, entry->pasid,
+                                                   entry->timestamp))
                 return 1; /* This also prevents sending it to KFD */
  
+       /* If it's the first fault for this address, process it normally */
         if (!amdgpu_sriov_vf(adev)) {
                 status = RREG32(hub->vm_l2_pro_fault_status);
                 WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
author	Christian König <christian.koenig@amd.com>
	Wed, 7 Nov 2018 12:55:01 +0000 (13:55 +0100)
committer	Alex Deucher <alexander.deucher@amd.com>
	Tue, 19 Mar 2019 20:36:58 +0000 (15:36 -0500)
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c		patch \| blob \| history