blk-mq: ensure that hardware queues are always run on the mapped CPUs
author Jens Axboe <axboe@fb.com>
Wed, 9 Apr 2014 16:18:23 +0000 (10:18 -0600)
committer Jens Axboe <axboe@fb.com>
Wed, 9 Apr 2014 16:18:23 +0000 (10:18 -0600)
Instead of providing soft mappings with no guarantees on hardware
queues always being run on the right CPU, switch to a hard mapping
guarantee that ensures that we always run the hardware queue on
the mapped CPU (or on one of them, if there is more than one).

Signed-off-by: Jens Axboe <axboe@fb.com>
block/blk-mq.c
include/linux/blk-mq.h

index 9c8f1f4ada7fe059b537251215bb2df1d2ae9ce3..5455ed19de1c5aa82c46d2f4ab434f2cf24e746a 100644 (file)
@@ -209,11 +209,14 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
                        break;
                }
 
-               blk_mq_put_ctx(ctx);
-               if (!(gfp & __GFP_WAIT))
+               if (gfp & __GFP_WAIT) {
+                       __blk_mq_run_hw_queue(hctx);
+                       blk_mq_put_ctx(ctx);
+               } else {
+                       blk_mq_put_ctx(ctx);
                        break;
+               }
 
-               __blk_mq_run_hw_queue(hctx);
                blk_mq_wait_for_tags(hctx->tags);
        } while (1);
 
@@ -514,6 +517,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
        LIST_HEAD(rq_list);
        int bit, queued;
 
+       WARN_ON(!preempt_count());
+
        if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
                return;
 
@@ -606,10 +611,22 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
        if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
                return;
 
-       if (!async)
+       if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
                __blk_mq_run_hw_queue(hctx);
-       else
+       else if (hctx->queue->nr_hw_queues == 1)
                kblockd_schedule_delayed_work(&hctx->delayed_work, 0);
+       else {
+               unsigned int cpu;
+
+               /*
+                * It'd be great if the workqueue API had a way to pass
+                * in a mask and had some smarts for more clever placement
+                * than the first CPU. Or we could round-robin here. For now,
+                * just queue on the first CPU.
+                */
+               cpu = cpumask_first(hctx->cpumask);
+               kblockd_schedule_delayed_work_on(cpu, &hctx->delayed_work, 0);
+       }
 }
 
 void blk_mq_run_queues(struct request_queue *q, bool async)
@@ -623,7 +640,9 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
                    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
                        continue;
 
+               preempt_disable();
                blk_mq_run_hw_queue(hctx, async);
+               preempt_enable();
        }
 }
 EXPORT_SYMBOL(blk_mq_run_queues);
@@ -648,7 +667,10 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queues);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
        clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+
+       preempt_disable();
        __blk_mq_run_hw_queue(hctx);
+       preempt_enable();
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queue);
 
@@ -662,7 +684,9 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q)
                        continue;
 
                clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+               preempt_disable();
                blk_mq_run_hw_queue(hctx, true);
+               preempt_enable();
        }
 }
 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
@@ -672,7 +696,10 @@ static void blk_mq_work_fn(struct work_struct *work)
        struct blk_mq_hw_ctx *hctx;
 
        hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
+
+       preempt_disable();
        __blk_mq_run_hw_queue(hctx);
+       preempt_enable();
 }
 
 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
@@ -716,10 +743,10 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
                spin_unlock(&ctx->lock);
        }
 
-       blk_mq_put_ctx(current_ctx);
-
        if (run_queue)
                blk_mq_run_hw_queue(hctx, async);
+
+       blk_mq_put_ctx(current_ctx);
 }
 
 static void blk_mq_insert_requests(struct request_queue *q,
@@ -755,9 +782,8 @@ static void blk_mq_insert_requests(struct request_queue *q,
        }
        spin_unlock(&ctx->lock);
 
-       blk_mq_put_ctx(current_ctx);
-
        blk_mq_run_hw_queue(hctx, from_schedule);
+       blk_mq_put_ctx(current_ctx);
 }
 
 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -876,7 +902,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
        if (unlikely(is_flush_fua)) {
                blk_mq_bio_to_request(rq, bio);
-               blk_mq_put_ctx(ctx);
                blk_insert_flush(rq);
                goto run_queue;
        }
@@ -914,7 +939,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
        }
 
        spin_unlock(&ctx->lock);
-       blk_mq_put_ctx(ctx);
 
        /*
         * For a SYNC request, send it to the hardware immediately. For an
@@ -923,6 +947,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
         */
 run_queue:
        blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
+       blk_mq_put_ctx(ctx);
 }
 
 /*
@@ -990,9 +1015,9 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
        blk_mq_hctx_mark_pending(hctx, ctx);
 
        spin_unlock(&ctx->lock);
-       blk_mq_put_ctx(ctx);
 
        blk_mq_run_hw_queue(hctx, true);
+       blk_mq_put_ctx(ctx);
 }
 
 static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
@@ -1255,12 +1280,13 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
                __ctx->queue = q;
 
                /* If the cpu isn't online, the cpu is mapped to first hctx */
-               hctx = q->mq_ops->map_queue(q, i);
-               hctx->nr_ctx++;
-
                if (!cpu_online(i))
                        continue;
 
+               hctx = q->mq_ops->map_queue(q, i);
+               cpumask_set_cpu(i, hctx->cpumask);
+               hctx->nr_ctx++;
+
                /*
                 * Set local node, IFF we have more than one hw queue. If
                 * not, we remain on the home node of the device
@@ -1277,6 +1303,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
        struct blk_mq_ctx *ctx;
 
        queue_for_each_hw_ctx(q, hctx, i) {
+               cpumask_clear(hctx->cpumask);
                hctx->nr_ctx = 0;
        }
 
@@ -1285,7 +1312,11 @@ static void blk_mq_map_swqueue(struct request_queue *q)
         */
        queue_for_each_ctx(q, ctx, i) {
                /* If the cpu isn't online, the cpu is mapped to first hctx */
+               if (!cpu_online(i))
+                       continue;
+
                hctx = q->mq_ops->map_queue(q, i);
+               cpumask_set_cpu(i, hctx->cpumask);
                ctx->index_hw = hctx->nr_ctx;
                hctx->ctxs[hctx->nr_ctx++] = ctx;
        }
@@ -1329,6 +1360,9 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
                if (!hctxs[i])
                        goto err_hctxs;
 
+               if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
+                       goto err_hctxs;
+
                hctxs[i]->numa_node = NUMA_NO_NODE;
                hctxs[i]->queue_num = i;
        }
@@ -1392,6 +1426,7 @@ err_hctxs:
        for (i = 0; i < reg->nr_hw_queues; i++) {
                if (!hctxs[i])
                        break;
+               free_cpumask_var(hctxs[i]->cpumask);
                reg->ops->free_hctx(hctxs[i], i);
        }
        kfree(hctxs);
@@ -1413,6 +1448,7 @@ void blk_mq_free_queue(struct request_queue *q)
                blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
                if (q->mq_ops->exit_hctx)
                        q->mq_ops->exit_hctx(hctx, i);
+               free_cpumask_var(hctx->cpumask);
                q->mq_ops->free_hctx(hctx, i);
        }
 
index 0120451545d8d2b85cc5eb10475359442f9d7460..b6ee48740458232d9e0ce56edffd0195e833ab4d 100644 (file)
@@ -19,6 +19,7 @@ struct blk_mq_hw_ctx {
 
        unsigned long           state;          /* BLK_MQ_S_* flags */
        struct delayed_work     delayed_work;
+       cpumask_var_t           cpumask;
 
        unsigned long           flags;          /* BLK_MQ_F_* flags */