sched/fair: Rewrite PELT migration propagation
author: Peter Zijlstra <peterz@infradead.org>
Mon, 8 May 2017 15:30:46 +0000 (17:30 +0200)
committer: Ingo Molnar <mingo@kernel.org>
Fri, 29 Sep 2017 17:35:15 +0000 (19:35 +0200)
When an entity migrates in (or out) of a runqueue, we need to add (or
remove) its contribution from the entire PELT hierarchy, because even
non-runnable entities are included in the load average sums.

In order to do this we have some propagation logic that updates the
PELT tree, however the way it 'propagates' the runnable (or load)
change is (more or less):

                     tg->weight * grq->avg.load_avg
  ge->avg.load_avg = ------------------------------
                               tg->load_avg

But that is the expression for ge->weight, and per the definition of
load_avg:

  ge->avg.load_avg := ge->weight * ge->avg.runnable_avg

That destroys the runnable_avg (by setting it to 1) we wanted to
propagate.

Instead directly propagate runnable_sum.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/sched.h

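diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2f22342c48ff4b22bd95046b031943a5aa60a1bf..2e039a81864c1f7bb70cf95f2661685bcb9e822a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c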
@@ -568,6 +568,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
                        cfs_rq->removed.load_avg);
        SEQ_printf(m, "  .%-30s: %ld\n", "removed.util_avg",
                        cfs_rq->removed.util_avg);
+       SEQ_printf(m, "  .%-30s: %ld\n", "removed.runnable_sum",
+                       cfs_rq->removed.runnable_sum);
 #ifdef CONFIG_FAIR_GROUP_SCHED
        SEQ_printf(m, "  .%-30s: %lu\n", "tg_load_avg_contrib",
                        cfs_rq->tg_load_avg_contrib);
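diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe4a66b104803c6c94a08dc4130db5ea42cf3d5b..086a5d979720670283177da4cb60e7df009631ee 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c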
@@ -3319,11 +3319,77 @@ void set_task_rq_fair(struct sched_entity *se,
        se->avg.last_update_time = n_last_update_time;
 }
 
-/* Take into account change of utilization of a child task group */
+
+/*
+ * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
+ * propagate its contribution. The key to this propagation is the invariant
+ * that for each group:
+ *
+ *   ge->avg == grq->avg                                               (1)
+ *
+ * _IFF_ we look at the pure running and runnable sums. Because they
+ * represent the very same entity, just at different points in the hierarchy.
+ *
+ *
+ * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
+ * simply copies the running sum over.
+ *
+ * However, update_tg_cfs_runnable() is more complex. So we have:
+ *
+ *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg         (2)
+ *
+ * And since, like util, the runnable part should be directly transferable,
+ * the following would _appear_ to be the straightforward approach:
+ *
+ *   grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg      (3)
+ *
+ * And per (1) we have:
+ *
+ *   ge->avg.runnable_avg == grq->avg.runnable_avg
+ *
+ * Which gives:
+ *
+ *                      ge->load.weight * grq->avg.load_avg
+ *   ge->avg.load_avg = -----------------------------------            (4)
+ *                               grq->load.weight
+ *
+ * Except that is wrong!
+ *
+ * Because while for entities historical weight is not important and we
+ * really only care about our future and therefore can consider a pure
+ * runnable sum, runqueues can NOT do this.
+ *
+ * We specifically want runqueues to have a load_avg that includes
+ * historical weights. Those represent the blocked load, the load we expect
+ * to (shortly) return to us. This only works by keeping the weights as
+ * integral part of the sum. We therefore cannot decompose as per (3).
+ *
+ * OK, so what then?
+ *
+ *
+ * Another way to look at things is:
+ *
+ *   grq->avg.load_avg = \Sum se->avg.load_avg
+ *
+ * Therefore, per (2):
+ *
+ *   grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
+ *
+ * And the very thing we're propagating is a change in that sum (someone
+ * joined/left). So we can easily know the runnable change, which would be,
+ * per (2), the already tracked se->avg.load_avg divided by the corresponding
+ * se->load.weight.
+ *
+ * Basically (4) but in differential form:
+ *
+ *   d(runnable_avg) += se->avg.load_avg / se->load.weight
+ *                                                                (5)
+ *   ge->avg.load_avg += ge->load.weight * d(runnable_avg)
+ */
+
 static inline void
-update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-       struct cfs_rq *gcfs_rq = group_cfs_rq(se);
        long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
 
        /* Nothing to update */
@@ -3339,102 +3405,59 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
        cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
 }
 
-/* Take into account change of load of a child task group */
 static inline void
-update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-       struct cfs_rq *gcfs_rq = group_cfs_rq(se);
-       long delta, load = gcfs_rq->avg.load_avg;
+       long runnable_sum = gcfs_rq->prop_runnable_sum;
+       long load_avg;
+       s64 load_sum;
 
-       /*
-        * If the load of group cfs_rq is null, the load of the
-        * sched_entity will also be null so we can skip the formula
-        */
-       if (load) {
-               long tg_load;
-
-               /* Get tg's load and ensure tg_load > 0 */
-               tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
-
-               /* Ensure tg_load >= load and updated with current load*/
-               tg_load -= gcfs_rq->tg_load_avg_contrib;
-               tg_load += load;
-
-               /*
-                * We need to compute a correction term in the case that the
-                * task group is consuming more CPU than a task of equal
-                * weight. A task with a weight equals to tg->shares will have
-                * a load less or equal to scale_load_down(tg->shares).
-                * Similarly, the sched_entities that represent the task group
-                * at parent level, can't have a load higher than
-                * scale_load_down(tg->shares). And the Sum of sched_entities'
-                * load must be <= scale_load_down(tg->shares).
-                */
-               if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
-                       /* scale gcfs_rq's load into tg's shares*/
-                       load *= scale_load_down(gcfs_rq->tg->shares);
-                       load /= tg_load;
-               }
-       }
+       if (!runnable_sum)
+               return;
 
-       delta = load - se->avg.load_avg;
+       gcfs_rq->prop_runnable_sum = 0;
 
-       /* Nothing to update */
-       if (!delta)
-               return;
+       load_sum = (s64)se_weight(se) * runnable_sum;
+       load_avg = div_s64(load_sum, LOAD_AVG_MAX);
 
-       /* Set new sched_entity's load */
-       se->avg.load_avg = load;
-       se->avg.load_sum = LOAD_AVG_MAX;
+       add_positive(&se->avg.load_sum, runnable_sum);
+       add_positive(&se->avg.load_avg, load_avg);
 
-       /* Update parent cfs_rq load */
-       add_positive(&cfs_rq->avg.load_avg, delta);
-       cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+       add_positive(&cfs_rq->avg.load_avg, load_avg);
+       add_positive(&cfs_rq->avg.load_sum, load_sum);
 
-       /*
-        * If the sched_entity is already enqueued, we also have to update the
-        * runnable load avg.
-        */
        if (se->on_rq) {
-               /* Update parent cfs_rq runnable_load_avg */
-               add_positive(&cfs_rq->runnable_load_avg, delta);
-               cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+               add_positive(&cfs_rq->runnable_load_avg, load_avg);
+               add_positive(&cfs_rq->runnable_load_sum, load_sum);
        }
 }
 
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
 {
-       cfs_rq->propagate_avg = 1;
-}
-
-static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
-{
-       struct cfs_rq *cfs_rq = group_cfs_rq(se);
-
-       if (!cfs_rq->propagate_avg)
-               return 0;
-
-       cfs_rq->propagate_avg = 0;
-       return 1;
+       cfs_rq->propagate = 1;
+       cfs_rq->prop_runnable_sum += runnable_sum;
 }
 
 /* Update task and its cfs_rq load average */
 static inline int propagate_entity_load_avg(struct sched_entity *se)
 {
-       struct cfs_rq *cfs_rq;
+       struct cfs_rq *cfs_rq, *gcfs_rq;
 
        if (entity_is_task(se))
                return 0;
 
-       if (!test_and_clear_tg_cfs_propagate(se))
+       gcfs_rq = group_cfs_rq(se);
+       if (!gcfs_rq->propagate)
                return 0;
 
+       gcfs_rq->propagate = 0;
+
        cfs_rq = cfs_rq_of(se);
 
-       set_tg_cfs_propagate(cfs_rq);
+       add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
 
-       update_tg_cfs_util(cfs_rq, se);
-       update_tg_cfs_load(cfs_rq, se);
+       update_tg_cfs_util(cfs_rq, se, gcfs_rq);
+       update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
 
        return 1;
 }
@@ -3458,7 +3481,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
         * If there is a pending propagation, we have to update the load and
         * the utilization of the sched_entity:
         */
-       if (gcfs_rq->propagate_avg)
+       if (gcfs_rq->propagate)
                return false;
 
        /*
@@ -3478,7 +3501,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
        return 0;
 }
 
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
@@ -3501,7 +3524,7 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
-       unsigned long removed_load = 0, removed_util = 0;
+       unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
        struct sched_avg *sa = &cfs_rq->avg;
        int decayed = 0;
 
@@ -3511,6 +3534,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
                raw_spin_lock(&cfs_rq->removed.lock);
                swap(cfs_rq->removed.util_avg, removed_util);
                swap(cfs_rq->removed.load_avg, removed_load);
+               swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
                cfs_rq->removed.nr = 0;
                raw_spin_unlock(&cfs_rq->removed.lock);
 
@@ -3526,7 +3550,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
                sub_positive(&sa->util_avg, r);
                sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
 
-               set_tg_cfs_propagate(cfs_rq);
+               add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
 
                decayed = 1;
        }
@@ -3558,7 +3582,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
        enqueue_load_avg(cfs_rq, se);
        cfs_rq->avg.util_avg += se->avg.util_avg;
        cfs_rq->avg.util_sum += se->avg.util_sum;
-       set_tg_cfs_propagate(cfs_rq);
+
+       add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
 
        cfs_rq_util_change(cfs_rq);
 }
@@ -3576,7 +3601,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
        dequeue_load_avg(cfs_rq, se);
        sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
        sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
-       set_tg_cfs_propagate(cfs_rq);
+
+       add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
 
        cfs_rq_util_change(cfs_rq);
 }
@@ -3678,6 +3704,7 @@ void remove_entity_load_avg(struct sched_entity *se)
        ++cfs_rq->removed.nr;
        cfs_rq->removed.util_avg        += se->avg.util_avg;
        cfs_rq->removed.load_avg        += se->avg.load_avg;
+       cfs_rq->removed.runnable_sum    += se->avg.load_sum; /* == runnable_sum */
        raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
 }
 
@@ -9466,9 +9493,6 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
        cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
 #ifdef CONFIG_SMP
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       cfs_rq->propagate_avg = 0;
-#endif
        raw_spin_lock_init(&cfs_rq->removed.lock);
 #endif
 }
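diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2fd350a12bb7dbb361ae559b642816846ff1ca25..5bcb86eb026b171e5dcfb2d3bb5e29d2c6a9776e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h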
@@ -447,19 +447,20 @@ struct cfs_rq {
        unsigned long runnable_load_avg;
 #ifndef CONFIG_64BIT
        u64 load_last_update_time_copy;
-#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       unsigned long tg_load_avg_contrib;
-       unsigned long propagate_avg;
 #endif
        struct {
                raw_spinlock_t  lock ____cacheline_aligned;
                int             nr;
                unsigned long   load_avg;
                unsigned long   util_avg;
+               unsigned long   runnable_sum;
        } removed;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+       unsigned long tg_load_avg_contrib;
+       long propagate;
+       long prop_runnable_sum;
+
        /*
         *   h_load = weight * f(tg)
         *