liquidio: optimize DMA in NUMA systems
author    VSR Burru <veerasenareddy.burru@cavium.com>
          Fri, 10 Mar 2017 01:03:24 +0000 (17:03 -0800)
committer David S. Miller <davem@davemloft.net>
          Mon, 13 Mar 2017 06:17:00 +0000 (23:17 -0700)
Optimize DMA in NUMA systems by allocating memory from the NUMA node that
the NIC is plugged into; DMA will no longer cross NUMA nodes.  If NIC IRQs are
pinned to a local CPU, that CPU's access to the DMA'd data is also
optimized.
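
As a rough illustration (not code from this patch; the function and variable
names below are made up), the pattern being adopted is to take the NUMA node
from the PCI device with dev_to_node() instead of deriving it from a queue
index with cpu_to_node():

    #include <linux/pci.h>
    #include <linux/slab.h>

    /*
     * Illustration only.  Per-queue state is allocated on the node the
     * NIC is attached to; kzalloc_node() with GFP_KERNEL still falls back
     * to other nodes if the local one has no free memory.
     */
    static void *alloc_queue_state_near_nic(struct pci_dev *pdev, size_t size)
    {
            int numa_node = dev_to_node(&pdev->dev);

            return kzalloc_node(size, GFP_KERNEL, numa_node);
    }

For the DMA rings themselves, lio_dma_alloc() fills in a DMA handle; with
base_addr_dma declared as dma_addr_t rather than u64, the handle can be
passed without a (dma_addr_t *) cast, and (assuming the helper wraps
dma_alloc_coherent(), which typically allocates node-local memory for the
device) the old set_dev_node()/retry sequence is no longer needed.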

Signed-off-by: VSR Burru <veerasenareddy.burru@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: Raghu Vatsavayi <raghu.vatsavayi@cavium.com>
Signed-off-by: Satanand Burla <satananda.burla@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/cavium/liquidio/lio_main.c
drivers/net/ethernet/cavium/liquidio/octeon_device.c
drivers/net/ethernet/cavium/liquidio/octeon_droq.c
drivers/net/ethernet/cavium/liquidio/octeon_iq.h
drivers/net/ethernet/cavium/liquidio/request_manager.c

index dffed432d58e7ad48cd2616e41d1fcb7bed89be4..acfd848d534488e7213933c79fca3e559c750522 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -782,7 +782,7 @@ static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs)
        }
 
        for (i = 0; i < num_iqs; i++) {
-               int numa_node = cpu_to_node(i % num_online_cpus());
+               int numa_node = dev_to_node(&oct->pci_dev->dev);
 
                spin_lock_init(&lio->glist_lock[i]);
 
index 9675ffbf25e6bd9bf34d346204f1b0fbcbcfa185..e21b477d0159f1e17570259e2db3bcb155378728 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.c
@@ -793,7 +793,7 @@ int octeon_setup_instr_queues(struct octeon_device *oct)
        u32 num_descs = 0;
        u32 iq_no = 0;
        union oct_txpciq txpciq;
-       int numa_node = cpu_to_node(iq_no % num_online_cpus());
+       int numa_node = dev_to_node(&oct->pci_dev->dev);
 
        if (OCTEON_CN6XXX(oct))
                num_descs =
@@ -837,7 +837,7 @@ int octeon_setup_output_queues(struct octeon_device *oct)
        u32 num_descs = 0;
        u32 desc_size = 0;
        u32 oq_no = 0;
-       int numa_node = cpu_to_node(oq_no % num_online_cpus());
+       int numa_node = dev_to_node(&oct->pci_dev->dev);
 
        if (OCTEON_CN6XXX(oct)) {
                num_descs =
index 0be87d119a979ea70117e13b2213987460da2a81..a91835da1accf64665e33924dd6da47e8d4a4f0f 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.c
@@ -234,8 +234,7 @@ int octeon_init_droq(struct octeon_device *oct,
        struct octeon_droq *droq;
        u32 desc_ring_size = 0, c_num_descs = 0, c_buf_size = 0;
        u32 c_pkts_per_intr = 0, c_refill_threshold = 0;
-       int orig_node = dev_to_node(&oct->pci_dev->dev);
-       int numa_node = cpu_to_node(q_no % num_online_cpus());
+       int numa_node = dev_to_node(&oct->pci_dev->dev);
 
        dev_dbg(&oct->pci_dev->dev, "%s[%d]\n", __func__, q_no);
 
@@ -275,13 +274,8 @@ int octeon_init_droq(struct octeon_device *oct,
        droq->buffer_size = c_buf_size;
 
        desc_ring_size = droq->max_count * OCT_DROQ_DESC_SIZE;
-       set_dev_node(&oct->pci_dev->dev, numa_node);
        droq->desc_ring = lio_dma_alloc(oct, desc_ring_size,
                                        (dma_addr_t *)&droq->desc_ring_dma);
-       set_dev_node(&oct->pci_dev->dev, orig_node);
-       if (!droq->desc_ring)
-               droq->desc_ring = lio_dma_alloc(oct, desc_ring_size,
-                                       (dma_addr_t *)&droq->desc_ring_dma);
 
        if (!droq->desc_ring) {
                dev_err(&oct->pci_dev->dev,
@@ -983,7 +977,7 @@ int octeon_create_droq(struct octeon_device *oct,
                       u32 desc_size, void *app_ctx)
 {
        struct octeon_droq *droq;
-       int numa_node = cpu_to_node(q_no % num_online_cpus());
+       int numa_node = dev_to_node(&oct->pci_dev->dev);
 
        if (oct->droq[q_no]) {
                dev_dbg(&oct->pci_dev->dev, "Droq already in use. Cannot create droq %d again\n",
index 4608a5af35a3204b54378dc03eef94c976370ac1..5063a12613e53646b9930f10090eae13c32301e3 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_iq.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_iq.h
@@ -152,7 +152,7 @@ struct octeon_instr_queue {
        struct oct_iq_stats stats;
 
        /** DMA mapped base address of the input descriptor ring. */
-       u64 base_addr_dma;
+       dma_addr_t base_addr_dma;
 
        /** Application context */
        void *app_ctx;
index 707bc15adec61351c1384b8454c85a87a2c4b437..261f448f9de23d059d0e5a5f1552bef5069e4c65 100644
--- a/drivers/net/ethernet/cavium/liquidio/request_manager.c
+++ b/drivers/net/ethernet/cavium/liquidio/request_manager.c
@@ -62,8 +62,7 @@ int octeon_init_instr_queue(struct octeon_device *oct,
        u32 iq_no = (u32)txpciq.s.q_no;
        u32 q_size;
        struct cavium_wq *db_wq;
-       int orig_node = dev_to_node(&oct->pci_dev->dev);
-       int numa_node = cpu_to_node(iq_no % num_online_cpus());
+       int numa_node = dev_to_node(&oct->pci_dev->dev);
 
        if (OCTEON_CN6XXX(oct))
                conf = &(CFG_GET_IQ_CFG(CHIP_CONF(oct, cn6xxx)));
@@ -91,13 +90,7 @@ int octeon_init_instr_queue(struct octeon_device *oct,
 
        iq->oct_dev = oct;
 
-       set_dev_node(&oct->pci_dev->dev, numa_node);
-       iq->base_addr = lio_dma_alloc(oct, q_size,
-                                     (dma_addr_t *)&iq->base_addr_dma);
-       set_dev_node(&oct->pci_dev->dev, orig_node);
-       if (!iq->base_addr)
-               iq->base_addr = lio_dma_alloc(oct, q_size,
-                                             (dma_addr_t *)&iq->base_addr_dma);
+       iq->base_addr = lio_dma_alloc(oct, q_size, &iq->base_addr_dma);
        if (!iq->base_addr) {
                dev_err(&oct->pci_dev->dev, "Cannot allocate memory for instr queue %d\n",
                        iq_no);
@@ -211,7 +204,7 @@ int octeon_setup_iq(struct octeon_device *oct,
                    void *app_ctx)
 {
        u32 iq_no = (u32)txpciq.s.q_no;
-       int numa_node = cpu_to_node(iq_no % num_online_cpus());
+       int numa_node = dev_to_node(&oct->pci_dev->dev);
 
        if (oct->instr_queue[iq_no]) {
                dev_dbg(&oct->pci_dev->dev, "IQ is in use. Cannot create the IQ: %d again\n",