sched/fair: Rewrite cfs_rq->removed_*avg
author Peter Zijlstra <peterz@infradead.org>
Mon, 8 May 2017 14:51:41 +0000 (16:51 +0200)
committer Ingo Molnar <mingo@kernel.org>
Fri, 29 Sep 2017 17:35:14 +0000 (19:35 +0200)
Since on wakeup migration we don't hold the rq->lock for the old CPU,
we cannot update its state. Instead we add the removed 'load' to an
atomic variable and have the next update on that CPU collect and
process it.
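
For illustration only (not the kernel code): below is a minimal userspace C
analogue of this scheme, with made-up names (cfs_rq_like, remove_entity,
update_load_avg) and C11 atomics standing in for atomic_long_t. The remote
side accumulates into per-field atomics; the owning CPU drains them on its
next update.

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative stand-in for the per-CPU runqueue state. */
struct cfs_rq_like {
	long load_avg, util_avg;	/* owner-only fields */
	atomic_long removed_load_avg;
	atomic_long removed_util_avg;
};

/* Remote CPU: no lock held, so just accumulate into the atomics. */
static void remove_entity(struct cfs_rq_like *cfs, long load, long util)
{
	atomic_fetch_add(&cfs->removed_load_avg, load);
	atomic_fetch_add(&cfs->removed_util_avg, util);
}

/* Owning CPU: the next update drains whatever was queued meanwhile. */
static void update_load_avg(struct cfs_rq_like *cfs)
{
	long r;

	if (atomic_load(&cfs->removed_load_avg)) {
		r = atomic_exchange(&cfs->removed_load_avg, 0);
		cfs->load_avg -= r;
	}
	if (atomic_load(&cfs->removed_util_avg)) {
		r = atomic_exchange(&cfs->removed_util_avg, 0);
		cfs->util_avg -= r;
	}
	/*
	 * The two exchanges are not atomic as a pair: a concurrent
	 * remove_entity() can land between them, so load and util can be
	 * observed out of sync -- the problem named below.
	 */
}

int main(void)
{
	struct cfs_rq_like cfs = { .load_avg = 1024, .util_avg = 512 };

	remove_entity(&cfs, 300, 100);	/* e.g. a task migrated away */
	update_load_avg(&cfs);
	printf("load_avg=%ld util_avg=%ld\n", cfs.load_avg, cfs.util_avg);
	return 0;
}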

Currently we have two atomic variables, which already have the issue
that they can be read out of sync. Also, two atomic ops on a single
cacheline are already more expensive than an uncontended lock.

Since we want to add more such fields, convert the thing over to an
explicit cacheline with a lock in it.
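
As a sketch of where the patch below ends up (again a userspace analogue with
the same made-up names, not kernel code): a pthread mutex stands in for the
raw spinlock and an aligned(64) attribute stands in for ____cacheline_aligned.
All removed state now sits behind one lock, so it is read and cleared as a
single consistent snapshot, and adding further fields later needs no extra
atomics.

#include <pthread.h>
#include <stdio.h>

/* All removed state on one cacheline, guarded by a single lock. */
struct removed_stats {
	pthread_mutex_t	lock;		/* raw_spinlock_t in the kernel */
	int		nr;
	long		load_avg;
	long		util_avg;
} __attribute__((aligned(64)));		/* stand-in for ____cacheline_aligned */

struct cfs_rq_like {
	long load_avg, util_avg;	/* owner-only fields */
	struct removed_stats removed;
};

/* Remote CPU: queue the departing entity's contribution under the lock. */
static void remove_entity(struct cfs_rq_like *cfs, long load, long util)
{
	pthread_mutex_lock(&cfs->removed.lock);
	cfs->removed.nr++;
	cfs->removed.load_avg += load;
	cfs->removed.util_avg += util;
	pthread_mutex_unlock(&cfs->removed.lock);
}

/* Owning CPU: drain everything in one critical section, as one snapshot. */
static int update_load_avg(struct cfs_rq_like *cfs)
{
	long removed_load = 0, removed_util = 0;
	int decayed = 0;

	if (cfs->removed.nr) {		/* cheap unlocked peek */
		pthread_mutex_lock(&cfs->removed.lock);
		/* The kernel version uses swap() into locals; this is equivalent. */
		removed_load = cfs->removed.load_avg;
		removed_util = cfs->removed.util_avg;
		cfs->removed.load_avg = 0;
		cfs->removed.util_avg = 0;
		cfs->removed.nr = 0;
		pthread_mutex_unlock(&cfs->removed.lock);

		cfs->load_avg -= removed_load;
		cfs->util_avg -= removed_util;
		decayed = 1;
	}
	return decayed;
}

int main(void)
{
	struct cfs_rq_like cfs = {
		.load_avg = 1024, .util_avg = 512,
		.removed = { .lock = PTHREAD_MUTEX_INITIALIZER },
	};

	remove_entity(&cfs, 300, 100);
	if (update_load_avg(&cfs))
		printf("load_avg=%ld util_avg=%ld\n", cfs.load_avg, cfs.util_avg);
	return 0;
}

The single 'nr' check keeps the common no-removal path to one unlocked read,
matching the cheap atomic_long_read() tests it replaces.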

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/sched.h

index 2f93e4a2d9f623915d0023f9b3a7d8b7d7b95cf7..2f22342c48ff4b22bd95046b031943a5aa60a1bf 100644
@@ -564,10 +564,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
                        cfs_rq->runnable_load_avg);
        SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
                        cfs_rq->avg.util_avg);
-       SEQ_printf(m, "  .%-30s: %ld\n", "removed_load_avg",
-                       atomic_long_read(&cfs_rq->removed_load_avg));
-       SEQ_printf(m, "  .%-30s: %ld\n", "removed_util_avg",
-                       atomic_long_read(&cfs_rq->removed_util_avg));
+       SEQ_printf(m, "  .%-30s: %ld\n", "removed.load_avg",
+                       cfs_rq->removed.load_avg);
+       SEQ_printf(m, "  .%-30s: %ld\n", "removed.util_avg",
+                       cfs_rq->removed.util_avg);
 #ifdef CONFIG_FAIR_GROUP_SCHED
        SEQ_printf(m, "  .%-30s: %lu\n", "tg_load_avg_contrib",
                        cfs_rq->tg_load_avg_contrib);
index d8c02b31498dd797752deaf4c48f3d770beab60a..fe4a66b104803c6c94a08dc4130db5ea42cf3d5b 100644
@@ -3501,36 +3501,47 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
+       unsigned long removed_load = 0, removed_util = 0;
        struct sched_avg *sa = &cfs_rq->avg;
-       int decayed, removed_load = 0, removed_util = 0;
+       int decayed = 0;
 
-       if (atomic_long_read(&cfs_rq->removed_load_avg)) {
-               s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+       if (cfs_rq->removed.nr) {
+               unsigned long r;
+
+               raw_spin_lock(&cfs_rq->removed.lock);
+               swap(cfs_rq->removed.util_avg, removed_util);
+               swap(cfs_rq->removed.load_avg, removed_load);
+               cfs_rq->removed.nr = 0;
+               raw_spin_unlock(&cfs_rq->removed.lock);
+
+               /*
+                * The LOAD_AVG_MAX for _sum is a slight over-estimate,
+                * which is safe due to sub_positive() clipping at 0.
+                */
+               r = removed_load;
                sub_positive(&sa->load_avg, r);
                sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
-               removed_load = 1;
-               set_tg_cfs_propagate(cfs_rq);
-       }
 
-       if (atomic_long_read(&cfs_rq->removed_util_avg)) {
-               long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
+               r = removed_util;
                sub_positive(&sa->util_avg, r);
                sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
-               removed_util = 1;
+
                set_tg_cfs_propagate(cfs_rq);
+
+               decayed = 1;
        }
 
-       decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
+       decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
 
 #ifndef CONFIG_64BIT
        smp_wmb();
        cfs_rq->load_last_update_time_copy = sa->last_update_time;
 #endif
 
-       if (decayed || removed_util)
+       if (decayed)
                cfs_rq_util_change(cfs_rq);
 
-       return decayed || removed_load;
+       return decayed;
 }
 
 /**
@@ -3645,6 +3656,7 @@ void sync_entity_load_avg(struct sched_entity *se)
 void remove_entity_load_avg(struct sched_entity *se)
 {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       unsigned long flags;
 
        /*
         * tasks cannot exit without having gone through wake_up_new_task() ->
@@ -3654,11 +3666,19 @@ void remove_entity_load_avg(struct sched_entity *se)
         * Similarly for groups, they will have passed through
         * post_init_entity_util_avg() before unregister_sched_fair_group()
         * calls this.
+        *
+        * XXX in case entity_is_task(se) && task_of(se)->on_rq == MIGRATING
+        * we could actually get the right time, since we're called with
+        * rq->lock held, see detach_task().
         */
 
        sync_entity_load_avg(se);
-       atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
-       atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+
+       raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
+       ++cfs_rq->removed.nr;
+       cfs_rq->removed.util_avg        += se->avg.util_avg;
+       cfs_rq->removed.load_avg        += se->avg.load_avg;
+       raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
 }
 
 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
@@ -9449,8 +9469,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_FAIR_GROUP_SCHED
        cfs_rq->propagate_avg = 0;
 #endif
-       atomic_long_set(&cfs_rq->removed_load_avg, 0);
-       atomic_long_set(&cfs_rq->removed_util_avg, 0);
+       raw_spin_lock_init(&cfs_rq->removed.lock);
 #endif
 }
 
index a5d97460ee4e40e2bb07bfb9a740f47610b2eda3..2fd350a12bb7dbb361ae559b642816846ff1ca25 100644
@@ -445,14 +445,19 @@ struct cfs_rq {
        struct sched_avg avg;
        u64 runnable_load_sum;
        unsigned long runnable_load_avg;
+#ifndef CONFIG_64BIT
+       u64 load_last_update_time_copy;
+#endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
        unsigned long tg_load_avg_contrib;
        unsigned long propagate_avg;
 #endif
-       atomic_long_t removed_load_avg, removed_util_avg;
-#ifndef CONFIG_64BIT
-       u64 load_last_update_time_copy;
-#endif
+       struct {
+               raw_spinlock_t  lock ____cacheline_aligned;
+               int             nr;
+               unsigned long   load_avg;
+               unsigned long   util_avg;
+       } removed;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
        /*