rcu: Parallelize expedited grace-period initialization
author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Fri, 2 Feb 2018 06:05:38 +0000 (22:05 -0800)
committer: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tue, 15 May 2018 17:25:44 +0000 (10:25 -0700)
The latency of RCU expedited grace periods grows with increasing numbers
of CPUs, eventually failing to be all that expedited.  Much of the growth
in latency is in the initialization phase, so this commit uses workqueues
to carry out this initialization concurrently on a rcu_node-by-rcu_node
basis.

This change makes use of a new rcu_par_gp_wq because flushing a work
item from another work item running from the same workqueue can result
in deadlock.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Nicholas Piggin <npiggin@gmail.com>
kernel/rcu/rcu.h
kernel/rcu/tree.c
kernel/rcu/tree.h
kernel/rcu/tree_exp.h

index 7a693e31184a6448a58c5a7c52a878b72291aefd..976019d6fa068f1e5aa9a10fd039c8e07e4d85fb 100644 (file)
@@ -486,6 +486,7 @@ void rcu_force_quiescent_state(void);
 void rcu_bh_force_quiescent_state(void);
 void rcu_sched_force_quiescent_state(void);
 extern struct workqueue_struct *rcu_gp_wq;
+extern struct workqueue_struct *rcu_par_gp_wq;
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
 #ifdef CONFIG_RCU_NOCB_CPU
index 2a734692a5811089df5552c0a3b65aac76d27439..23781fc9083046981f4442dc37c5244977e9a4cf 100644 (file)
@@ -4168,6 +4168,7 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
 }
 
 struct workqueue_struct *rcu_gp_wq;
+struct workqueue_struct *rcu_par_gp_wq;
 
 void __init rcu_init(void)
 {
@@ -4199,6 +4200,8 @@ void __init rcu_init(void)
        /* Create workqueue for expedited GPs and for Tree SRCU. */
        rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
        WARN_ON(!rcu_gp_wq);
+       rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
+       WARN_ON(!rcu_par_gp_wq);
 }
 
 #include "tree_exp.h"
index f491ab4f2e8ec486919312bc624f34adedfcba10..98d33902b65c9f26fe65308ae62539fd7fcd402c 100644 (file)
@@ -58,6 +58,14 @@ struct rcu_dynticks {
 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
 };
 
+/* Communicate arguments to a workqueue handler. */
+struct rcu_exp_work {
+       smp_call_func_t rew_func;
+       struct rcu_state *rew_rsp;
+       unsigned long rew_s;
+       struct work_struct rew_work;
+};
+
 /* RCU's kthread states for tracing. */
 #define RCU_KTHREAD_STOPPED  0
 #define RCU_KTHREAD_RUNNING  1
@@ -157,6 +165,8 @@ struct rcu_node {
        spinlock_t exp_lock ____cacheline_internodealigned_in_smp;
        unsigned long exp_seq_rq;
        wait_queue_head_t exp_wq[4];
+       struct rcu_exp_work rew;
+       bool exp_need_flush;    /* Need to flush workitem? */
 } ____cacheline_internodealigned_in_smp;
 
 /*
index f72eefab854304976c6360a5d9645e5d69bfd130..73e1d3dca5b1993f90c55e1e1d63f1618afff6a5 100644 (file)
@@ -362,93 +362,129 @@ static void sync_sched_exp_online_cleanup(int cpu)
 }
 
 /*
- * Select the nodes that the upcoming expedited grace period needs
- * to wait for.
+ * Select the CPUs within the specified rcu_node that the upcoming
+ * expedited grace period needs to wait for.
  */
-static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
-                                    smp_call_func_t func)
+static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
 {
        int cpu;
        unsigned long flags;
+       smp_call_func_t func;
        unsigned long mask_ofl_test;
        unsigned long mask_ofl_ipi;
        int ret;
-       struct rcu_node *rnp;
+       struct rcu_exp_work *rewp =
+               container_of(wp, struct rcu_exp_work, rew_work);
+       struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
+       struct rcu_state *rsp = rewp->rew_rsp;
 
-       trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
-       sync_exp_reset_tree(rsp);
-       trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
-       rcu_for_each_leaf_node(rsp, rnp) {
-               raw_spin_lock_irqsave_rcu_node(rnp, flags);
-
-               /* Each pass checks a CPU for identity, offline, and idle. */
-               mask_ofl_test = 0;
-               for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
-                       unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
-                       struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-                       struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
-                       int snap;
+       func = rewp->rew_func;
+       raw_spin_lock_irqsave_rcu_node(rnp, flags);
 
-                       if (raw_smp_processor_id() == cpu ||
-                           !(rnp->qsmaskinitnext & mask)) {
+       /* Each pass checks a CPU for identity, offline, and idle. */
+       mask_ofl_test = 0;
+       for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+               unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+               struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+               struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
+               int snap;
+
+               if (raw_smp_processor_id() == cpu ||
+                   !(rnp->qsmaskinitnext & mask)) {
+                       mask_ofl_test |= mask;
+               } else {
+                       snap = rcu_dynticks_snap(rdtp);
+                       if (rcu_dynticks_in_eqs(snap))
                                mask_ofl_test |= mask;
-                       } else {
-                               snap = rcu_dynticks_snap(rdtp);
-                               if (rcu_dynticks_in_eqs(snap))
-                                       mask_ofl_test |= mask;
-                               else
-                                       rdp->exp_dynticks_snap = snap;
-                       }
+                       else
+                               rdp->exp_dynticks_snap = snap;
                }
-               mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
-
-               /*
-                * Need to wait for any blocked tasks as well.  Note that
-                * additional blocking tasks will also block the expedited
-                * GP until such time as the ->expmask bits are cleared.
-                */
-               if (rcu_preempt_has_tasks(rnp))
-                       rnp->exp_tasks = rnp->blkd_tasks.next;
-               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+       }
+       mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
 
-               /* IPI the remaining CPUs for expedited quiescent state. */
-               for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
-                       unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
-                       struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+       /*
+        * Need to wait for any blocked tasks as well.  Note that
+        * additional blocking tasks will also block the expedited GP
+        * until such time as the ->expmask bits are cleared.
+        */
+       if (rcu_preempt_has_tasks(rnp))
+               rnp->exp_tasks = rnp->blkd_tasks.next;
+       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+       /* IPI the remaining CPUs for expedited quiescent state. */
+       for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+               unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+               struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 
-                       if (!(mask_ofl_ipi & mask))
-                               continue;
+               if (!(mask_ofl_ipi & mask))
+                       continue;
 retry_ipi:
-                       if (rcu_dynticks_in_eqs_since(rdp->dynticks,
-                                                     rdp->exp_dynticks_snap)) {
-                               mask_ofl_test |= mask;
-                               continue;
-                       }
-                       ret = smp_call_function_single(cpu, func, rsp, 0);
-                       if (!ret) {
-                               mask_ofl_ipi &= ~mask;
-                               continue;
-                       }
-                       /* Failed, raced with CPU hotplug operation. */
-                       raw_spin_lock_irqsave_rcu_node(rnp, flags);
-                       if ((rnp->qsmaskinitnext & mask) &&
-                           (rnp->expmask & mask)) {
-                               /* Online, so delay for a bit and try again. */
-                               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-                               trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
-                               schedule_timeout_uninterruptible(1);
-                               goto retry_ipi;
-                       }
-                       /* CPU really is offline, so we can ignore it. */
-                       if (!(rnp->expmask & mask))
-                               mask_ofl_ipi &= ~mask;
+               if (rcu_dynticks_in_eqs_since(rdp->dynticks,
+                                             rdp->exp_dynticks_snap)) {
+                       mask_ofl_test |= mask;
+                       continue;
+               }
+               ret = smp_call_function_single(cpu, func, rsp, 0);
+               if (!ret) {
+                       mask_ofl_ipi &= ~mask;
+                       continue;
+               }
+               /* Failed, raced with CPU hotplug operation. */
+               raw_spin_lock_irqsave_rcu_node(rnp, flags);
+               if ((rnp->qsmaskinitnext & mask) &&
+                   (rnp->expmask & mask)) {
+                       /* Online, so delay for a bit and try again. */
                        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+                       trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
+                       schedule_timeout_uninterruptible(1);
+                       goto retry_ipi;
+               }
+               /* CPU really is offline, so we can ignore it. */
+               if (!(rnp->expmask & mask))
+                       mask_ofl_ipi &= ~mask;
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+       }
+       /* Report quiescent states for those that went offline. */
+       mask_ofl_test |= mask_ofl_ipi;
+       if (mask_ofl_test)
+               rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+}
+
+/*
+ * Select the nodes that the upcoming expedited grace period needs
+ * to wait for.
+ */
+static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
+                                    smp_call_func_t func)
+{
+       struct rcu_node *rnp;
+
+       trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
+       sync_exp_reset_tree(rsp);
+       trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
+
+       /* Schedule work for each leaf rcu_node structure. */
+       rcu_for_each_leaf_node(rsp, rnp) {
+               rnp->exp_need_flush = false;
+               if (!READ_ONCE(rnp->expmask))
+                       continue; /* Avoid early boot non-existent wq. */
+               rnp->rew.rew_func = func;
+               rnp->rew.rew_rsp = rsp;
+               if (!READ_ONCE(rcu_par_gp_wq) ||
+                   rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+                       /* No workqueues yet. */
+                       sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
+                       continue;
                }
-               /* Report quiescent states for those that went offline. */
-               mask_ofl_test |= mask_ofl_ipi;
-               if (mask_ofl_test)
-                       rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+               INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+               queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work);
+               rnp->exp_need_flush = true;
        }
+
+       /* Wait for workqueue jobs (if any) to complete. */
+       rcu_for_each_leaf_node(rsp, rnp)
+               if (rnp->exp_need_flush)
+                       flush_work(&rnp->rew.rew_work);
 }
 
 static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
@@ -560,14 +596,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
        mutex_unlock(&rsp->exp_wake_mutex);
 }
 
-/* Let the workqueue handler know what it is supposed to do. */
-struct rcu_exp_work {
-       smp_call_func_t rew_func;
-       struct rcu_state *rew_rsp;
-       unsigned long rew_s;
-       struct work_struct rew_work;
-};
-
 /*
  * Common code to drive an expedited grace period forward, used by
  * workqueues and mid-boot-time tasks.