xdp: tracking page_pool resources and safe removal
authorJesper Dangaard Brouer <brouer@redhat.com>
Tue, 18 Jun 2019 13:05:47 +0000 (15:05 +0200)
committerDavid S. Miller <davem@davemloft.net>
Wed, 19 Jun 2019 15:23:13 +0000 (11:23 -0400)
This patch is needed before we can allow drivers to use page_pool for
DMA-mappings. Today with page_pool and XDP return API, it is possible to
remove the page_pool object (from rhashtable), while there are still
in-flight packet-pages. This is safely handled via RCU and failed lookups in
__xdp_return() fallback to call put_page(), when page_pool object is gone.
In case the page is still DMA mapped, this will result in the page not getting
correctly DMA unmapped.

To solve this, the page_pool is extended with tracking in-flight pages. And
XDP disconnect system queries page_pool and waits, via workqueue, for all
in-flight pages to be returned.

To avoid killing performance when tracking in-flight pages, the implementation
uses two (unsigned) counters, that are placed on different cache-lines, and
can be used to deduce in-flight packets. This is done by mapping the
unsigned "sequence" counters onto signed Two's complement arithmetic
operations. This is e.g. used by kernel's time_after macros, described in
kernel commit 1ba3aab3033b and 5a581b367b5, and also explained in RFC1982.

The trick is these two incrementing counters only need to be read and
compared, when checking if it's safe to free the page_pool structure, which
will only happen when the driver has disconnected the RX/alloc side. Thus,
this happens on a non-fast-path.

It is chosen that page_pool tracking is also enabled for the non-DMA
use-case, as this can be used for statistics later.

After this patch, using page_pool requires more strict resource "release",
e.g. via page_pool_release_page() that was introduced in this patchset, and
previous patches implement/fix this more strict requirement.

Drivers no longer call page_pool_destroy(). Drivers already call
xdp_rxq_info_unreg(), which calls xdp_rxq_info_unreg_mem_model(), which will
attempt to disconnect the mem id, and if attempt fails schedule the
disconnect for later via delayed workqueue.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
include/net/page_pool.h
net/core/page_pool.c
net/core/xdp.c

index 46b6a47bd1e3a9cc8a97fa5826a7ae778d73c3a6..5e40db8f92e615266fdeff923190472ec244189f 100644 (file)
@@ -643,9 +643,6 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
        }
 
        xdp_rxq_info_unreg(&rq->xdp_rxq);
-       if (rq->page_pool)
-               page_pool_destroy(rq->page_pool);
-
        mlx5_wq_destroy(&rq->wq_ctrl);
 }
 
index 754d980700dff1d963198a0aaa86dc20c48e2663..f09b3f1994e603a6ef871fd35299a21e9f20160c 100644 (file)
  * page_pool_alloc_pages() call.  Drivers should likely use
  * page_pool_dev_alloc_pages() replacing dev_alloc_pages().
  *
- * If page_pool handles DMA mapping (use page->private), then API user
- * is responsible for invoking page_pool_put_page() once.  In-case of
- * elevated refcnt, the DMA state is released, assuming other users of
- * the page will eventually call put_page().
+ * API keeps track of in-flight pages, in order to let the API user know
+ * when it is safe to deallocate the page_pool object.  Thus, API users
+ * must make sure to call page_pool_release_page() when a page is
+ * "leaving" the page_pool.  Or call page_pool_put_page() where
+ * appropriate, for maintaining correct accounting.
  *
- * If no DMA mapping is done, then it can act as shim-layer that
- * fall-through to alloc_page.  As no state is kept on the page, the
- * regular put_page() call is sufficient.
+ * API user must only call page_pool_put_page() once on a page, as it
+ * will either recycle the page, or in case of elevated refcnt, it
+ * will release the DMA mapping and in-flight state accounting.  We
+ * hope to lift this requirement in the future.
  */
 #ifndef _NET_PAGE_POOL_H
 #define _NET_PAGE_POOL_H
@@ -66,9 +68,10 @@ struct page_pool_params {
 };
 
 struct page_pool {
-       struct rcu_head rcu;
        struct page_pool_params p;
 
+        u32 pages_state_hold_cnt;
+
        /*
         * Data structure for allocation side
         *
@@ -96,6 +99,8 @@ struct page_pool {
         * TODO: Implement bulk return pages into this structure.
         */
        struct ptr_ring ring;
+
+       atomic_t pages_state_release_cnt;
 };
 
 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
@@ -109,8 +114,6 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
 
 struct page_pool *page_pool_create(const struct page_pool_params *params);
 
-void page_pool_destroy(struct page_pool *pool);
-
 void __page_pool_free(struct page_pool *pool);
 static inline void page_pool_free(struct page_pool *pool)
 {
@@ -143,6 +146,24 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
        __page_pool_put_page(pool, page, true);
 }
 
+/* API user MUST have disconnected alloc-side (not allowed to call
+ * page_pool_alloc_pages()) before calling this.  The free-side can
+ * still run concurrently, to handle in-flight packet-pages.
+ *
+ * A request to shutdown can fail (with false) if there are still
+ * in-flight packet-pages.
+ */
+bool __page_pool_request_shutdown(struct page_pool *pool);
+static inline bool page_pool_request_shutdown(struct page_pool *pool)
+{
+       /* When page_pool isn't compiled-in, net/core/xdp.c doesn't
+        * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
+        */
+#ifdef CONFIG_PAGE_POOL
+       return __page_pool_request_shutdown(pool);
+#endif
+}
+
 /* Disconnects a page (from a page_pool).  API users can have a need
  * to disconnect a page (from a page_pool), to allow it to be used as
  * a regular page (that will eventually be returned to the normal
index 41391b5dc14c9a028c4bbebabf73a7893a6ca956..8679e24fd66574b2cc3ce9f3e27a21fa85c7b2d7 100644 (file)
@@ -43,6 +43,8 @@ static int page_pool_init(struct page_pool *pool,
        if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
                return -ENOMEM;
 
+       atomic_set(&pool->pages_state_release_cnt, 0);
+
        return 0;
 }
 
@@ -151,6 +153,9 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
        page->dma_addr = dma;
 
 skip_dma_map:
+       /* Track how many pages are held 'in-flight' */
+       pool->pages_state_hold_cnt++;
+
        /* When page just alloc'ed is should/must have refcnt 1. */
        return page;
 }
@@ -173,6 +178,33 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
 }
 EXPORT_SYMBOL(page_pool_alloc_pages);
 
+/* Calculate distance between two u32 values, valid if distance is below 2^(31)
+ *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
+ */
+#define _distance(a, b)        (s32)((a) - (b))
+
+static s32 page_pool_inflight(struct page_pool *pool)
+{
+       u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+       u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+       s32 distance;
+
+       distance = _distance(hold_cnt, release_cnt);
+
+       /* TODO: Add tracepoint here */
+       return distance;
+}
+
+static bool __page_pool_safe_to_destroy(struct page_pool *pool)
+{
+       s32 inflight = page_pool_inflight(pool);
+
+       /* The distance should not be able to become negative */
+       WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
+
+       return (inflight == 0);
+}
+
 /* Cleanup page_pool state from page */
 static void __page_pool_clean_page(struct page_pool *pool,
                                   struct page *page)
@@ -180,7 +212,7 @@ static void __page_pool_clean_page(struct page_pool *pool,
        dma_addr_t dma;
 
        if (!(pool->p.flags & PP_FLAG_DMA_MAP))
-               return;
+               goto skip_dma_unmap;
 
        dma = page->dma_addr;
        /* DMA unmap */
@@ -188,11 +220,16 @@ static void __page_pool_clean_page(struct page_pool *pool,
                             PAGE_SIZE << pool->p.order, pool->p.dma_dir,
                             DMA_ATTR_SKIP_CPU_SYNC);
        page->dma_addr = 0;
+skip_dma_unmap:
+       atomic_inc(&pool->pages_state_release_cnt);
 }
 
 /* unmap the page and clean our state */
 void page_pool_unmap_page(struct page_pool *pool, struct page *page)
 {
+       /* When page is unmapped, this implies page will not be
+        * returned to page_pool.
+        */
        __page_pool_clean_page(pool, page);
 }
 EXPORT_SYMBOL(page_pool_unmap_page);
@@ -201,6 +238,7 @@ EXPORT_SYMBOL(page_pool_unmap_page);
 static void __page_pool_return_page(struct page_pool *pool, struct page *page)
 {
        __page_pool_clean_page(pool, page);
+
        put_page(page);
        /* An optimization would be to call __free_pages(page, pool->p.order)
         * knowing page is not part of page-cache (thus avoiding a
@@ -296,24 +334,17 @@ void __page_pool_free(struct page_pool *pool)
 {
        WARN(pool->alloc.count, "API usage violation");
        WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
+       WARN(!__page_pool_safe_to_destroy(pool), "still in-flight pages");
 
        ptr_ring_cleanup(&pool->ring, NULL);
        kfree(pool);
 }
 EXPORT_SYMBOL(__page_pool_free);
 
-static void __page_pool_destroy_rcu(struct rcu_head *rcu)
-{
-       struct page_pool *pool;
-
-       pool = container_of(rcu, struct page_pool, rcu);
-
-       __page_pool_empty_ring(pool);
-       __page_pool_free(pool);
-}
-
-/* Cleanup and release resources */
-void page_pool_destroy(struct page_pool *pool)
+/* Request to shutdown: release pages cached by page_pool, and check
+ * for in-flight pages
+ */
+bool __page_pool_request_shutdown(struct page_pool *pool)
 {
        struct page *page;
 
@@ -331,7 +362,6 @@ void page_pool_destroy(struct page_pool *pool)
         */
        __page_pool_empty_ring(pool);
 
-       /* An xdp_mem_allocator can still ref page_pool pointer */
-       call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+       return __page_pool_safe_to_destroy(pool);
 }
-EXPORT_SYMBOL(page_pool_destroy);
+EXPORT_SYMBOL(__page_pool_request_shutdown);
index 0fcc32340c4e902a298148685c8e994ed21b95d0..aae665ccee3f1b29e84f94d22f8484b251fd2a01 100644 (file)
@@ -38,6 +38,7 @@ struct xdp_mem_allocator {
        };
        struct rhash_head node;
        struct rcu_head rcu;
+       struct delayed_work defer_wq;
 };
 
 static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
@@ -79,13 +80,13 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
 
        xa = container_of(rcu, struct xdp_mem_allocator, rcu);
 
+       /* Allocator have indicated safe to remove before this is called */
+       if (xa->mem.type == MEM_TYPE_PAGE_POOL)
+               page_pool_free(xa->page_pool);
+
        /* Allow this ID to be reused */
        ida_simple_remove(&mem_id_pool, xa->mem.id);
 
-       /* Notice, driver is expected to free the *allocator,
-        * e.g. page_pool, and MUST also use RCU free.
-        */
-
        /* Poison memory */
        xa->mem.id = 0xFFFF;
        xa->mem.type = 0xF0F0;
@@ -94,6 +95,46 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
        kfree(xa);
 }
 
+bool __mem_id_disconnect(int id)
+{
+       struct xdp_mem_allocator *xa;
+       bool safe_to_remove = true;
+
+       mutex_lock(&mem_id_lock);
+
+       xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
+       if (!xa) {
+               mutex_unlock(&mem_id_lock);
+               WARN(1, "Request remove non-existing id(%d), driver bug?", id);
+               return true;
+       }
+
+       /* Detects in-flight packet-pages for page_pool */
+       if (xa->mem.type == MEM_TYPE_PAGE_POOL)
+               safe_to_remove = page_pool_request_shutdown(xa->page_pool);
+
+       if (safe_to_remove &&
+           !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+               call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+
+       mutex_unlock(&mem_id_lock);
+       return safe_to_remove;
+}
+
+#define DEFER_TIME (msecs_to_jiffies(1000))
+
+static void mem_id_disconnect_defer_retry(struct work_struct *wq)
+{
+       struct delayed_work *dwq = to_delayed_work(wq);
+       struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq);
+
+       if (__mem_id_disconnect(xa->mem.id))
+               return;
+
+       /* Still not ready to be disconnected, retry later */
+       schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
+}
+
 void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
 {
        struct xdp_mem_allocator *xa;
@@ -112,16 +153,28 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
        if (id == 0)
                return;
 
+       if (__mem_id_disconnect(id))
+               return;
+
+       /* Could not disconnect, defer new disconnect attempt to later */
        mutex_lock(&mem_id_lock);
 
        xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
-       if (xa && !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
-               call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+       if (!xa) {
+               mutex_unlock(&mem_id_lock);
+               return;
+       }
 
+       INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry);
        mutex_unlock(&mem_id_lock);
+       schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
 
+/* This unregister operation will also cleanup and destroy the
+ * allocator. The page_pool_free() operation is first called when it's
+ * safe to remove, possibly deferred to a workqueue.
+ */
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
 {
        /* Simplify driver cleanup code paths, allow unreg "unused" */