mlx5: use page_pool for xdp_return_frame call
authorJesper Dangaard Brouer <brouer@redhat.com>
Tue, 17 Apr 2018 14:46:27 +0000 (16:46 +0200)
committerDavid S. Miller <davem@davemloft.net>
Tue, 17 Apr 2018 14:50:29 +0000 (10:50 -0400)
This patch shows how it is possible to have both the driver local page
cache, which uses elevated refcnt for "catching"/avoiding SKB
put_page returns the page through the page allocator.  And at the
same time, have pages getting returned to the page_pool from
ndp_xdp_xmit DMA completion.

The performance improvement for XDP_REDIRECT in this patch is really
good.  Especially considering that (currently) the xdp_return_frame
API and page_pool_put_page() does per frame operations of both
rhashtable ID-lookup and locked return into (page_pool) ptr_ring.
(It is the plan to remove these per frame operation in a followup
patchset).

The benchmark performed was RX on mlx5 and XDP_REDIRECT out ixgbe,
with xdp_redirect_map (using devmap) . And the target/maximum
capability of ixgbe is 13Mpps (on this HW setup).

Before this patch for mlx5, XDP redirected frames were returned via
the page allocator.  The single flow performance was 6Mpps, and if I
started two flows the collective performance drop to 4Mpps, because we
hit the page allocator lock (further negative scaling occurs).

Two test scenarios need to be covered, for xdp_return_frame API, which
is DMA-TX completion running on same-CPU or cross-CPU free/return.
Results were same-CPU=10Mpps, and cross-CPU=12Mpps.  This is very
close to our 13Mpps max target.

The reason max target isn't reached in cross-CPU test, is likely due
to RX-ring DMA unmap/map overhead (which doesn't occur in ixgbe to
ixgbe testing).  It is also planned to remove this unnecessary DMA
unmap in a later patchset

V2: Adjustments requested by Tariq
 - Changed page_pool_create return codes not return NULL, only
   ERR_PTR, as this simplifies err handling in drivers.
 - Save a branch in mlx5e_page_release
 - Correct page_pool size calc for MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ

V5: Updated patch desc

V8: Adjust for b0cedc844c00 ("net/mlx5e: Remove rq_headroom field from params")
V9:
 - Adjust for 121e89275471 ("net/mlx5e: Refactor RQ XDP_TX indication")
 - Adjust for 73281b78a37a ("net/mlx5e: Derive Striding RQ size from MTU")
 - Correct handling if page_pool_create fail for MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ

V10: Req from Tariq
 - Change pool_size calc for MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c

index 1a05d1072c5ef851bd5b0b48d7e0852faffbe8c6..3317a4da87cbab3a82fab214e808c9f2c6ad563a 100644 (file)
@@ -53,6 +53,8 @@
 #include "mlx5_core.h"
 #include "en_stats.h"
 
+struct page_pool;
+
 #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
 
 #define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
@@ -534,6 +536,7 @@ struct mlx5e_rq {
        unsigned int           hw_mtu;
        struct mlx5e_xdpsq     xdpsq;
        DECLARE_BITMAP(flags, 8);
+       struct page_pool      *page_pool;
 
        /* control */
        struct mlx5_wq_ctrl    wq_ctrl;
index 2dca0933dfd3d5065b3441e2fd51d63bf7ce668c..f100374199784a2f0892dd0a4e4ff31aa8ec9d73 100644 (file)
@@ -35,6 +35,7 @@
 #include <linux/mlx5/fs.h>
 #include <net/vxlan.h>
 #include <linux/bpf.h>
+#include <net/page_pool.h>
 #include "eswitch.h"
 #include "en.h"
 #include "en_tc.h"
@@ -389,10 +390,11 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
                          struct mlx5e_rq_param *rqp,
                          struct mlx5e_rq *rq)
 {
+       struct page_pool_params pp_params = { 0 };
        struct mlx5_core_dev *mdev = c->mdev;
        void *rqc = rqp->rqc;
        void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
-       u32 byte_count;
+       u32 byte_count, pool_size;
        int npages;
        int wq_sz;
        int err;
@@ -432,9 +434,12 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
        rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
        rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params);
+       pool_size = 1 << params->log_rq_mtu_frames;
 
        switch (rq->wq_type) {
        case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+
+               pool_size = MLX5_MPWRQ_PAGES_PER_WQE << mlx5e_mpwqe_get_log_rq_size(params);
                rq->post_wqes = mlx5e_post_rx_mpwqes;
                rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe;
 
@@ -512,13 +517,31 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
                rq->mkey_be = c->mkey_be;
        }
 
-       /* This must only be activate for order-0 pages */
-       if (rq->xdp_prog) {
-               err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
-                                                MEM_TYPE_PAGE_ORDER0, NULL);
-               if (err)
-                       goto err_rq_wq_destroy;
+       /* Create a page_pool and register it with rxq */
+       pp_params.order     = rq->buff.page_order;
+       pp_params.flags     = 0; /* No-internal DMA mapping in page_pool */
+       pp_params.pool_size = pool_size;
+       pp_params.nid       = cpu_to_node(c->cpu);
+       pp_params.dev       = c->pdev;
+       pp_params.dma_dir   = rq->buff.map_dir;
+
+       /* page_pool can be used even when there is no rq->xdp_prog,
+        * given page_pool does not handle DMA mapping there is no
+        * required state to clear. And page_pool gracefully handle
+        * elevated refcnt.
+        */
+       rq->page_pool = page_pool_create(&pp_params);
+       if (IS_ERR(rq->page_pool)) {
+               if (rq->wq_type != MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
+                       kfree(rq->wqe.frag_info);
+               err = PTR_ERR(rq->page_pool);
+               rq->page_pool = NULL;
+               goto err_rq_wq_destroy;
        }
+       err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
+                                        MEM_TYPE_PAGE_POOL, rq->page_pool);
+       if (err)
+               goto err_rq_wq_destroy;
 
        for (i = 0; i < wq_sz; i++) {
                struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
@@ -556,6 +579,8 @@ err_rq_wq_destroy:
        if (rq->xdp_prog)
                bpf_prog_put(rq->xdp_prog);
        xdp_rxq_info_unreg(&rq->xdp_rxq);
+       if (rq->page_pool)
+               page_pool_destroy(rq->page_pool);
        mlx5_wq_destroy(&rq->wq_ctrl);
 
        return err;
@@ -569,6 +594,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
                bpf_prog_put(rq->xdp_prog);
 
        xdp_rxq_info_unreg(&rq->xdp_rxq);
+       if (rq->page_pool)
+               page_pool_destroy(rq->page_pool);
 
        switch (rq->wq_type) {
        case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
index 0e24be05907febe358c192c175fda45d69a5e0cf..f42436d7f2d94d534628004da2356a8dd1190887 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/bpf_trace.h>
 #include <net/busy_poll.h>
 #include <net/ip6_checksum.h>
+#include <net/page_pool.h>
 #include "en.h"
 #include "en_tc.h"
 #include "eswitch.h"
@@ -221,7 +222,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
        if (mlx5e_rx_cache_get(rq, dma_info))
                return 0;
 
-       dma_info->page = dev_alloc_pages(rq->buff.page_order);
+       dma_info->page = page_pool_dev_alloc_pages(rq->page_pool);
        if (unlikely(!dma_info->page))
                return -ENOMEM;
 
@@ -246,11 +247,16 @@ static void mlx5e_page_dma_unmap(struct mlx5e_rq *rq,
 void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
                        bool recycle)
 {
-       if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info))
-               return;
+       if (likely(recycle)) {
+               if (mlx5e_rx_cache_put(rq, dma_info))
+                       return;
 
-       mlx5e_page_dma_unmap(rq, dma_info);
-       put_page(dma_info->page);
+               mlx5e_page_dma_unmap(rq, dma_info);
+               page_pool_recycle_direct(rq->page_pool, dma_info->page);
+       } else {
+               mlx5e_page_dma_unmap(rq, dma_info);
+               put_page(dma_info->page);
+       }
 }
 
 static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,