From 144d8487bc6e9b741895709cb46d4e19b748a725 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 May 2017 17:57:24 +0200 Subject: [PATCH] sched/fair: Implement synchonous PELT detach on load-balance migrate Vincent wondered why his self migrating task had a roughly 50% dip in load_avg when landing on the new CPU. This is because we uncondionally take the asynchronous detatch_entity route, which can lead to the attach on the new CPU still seeing the old CPU's contribution to tg->load_avg, effectively halving the new CPU's shares. While in general this is something we have to live with, there is the special case of runnable migration where we can do better. Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 10d2000fca2d..92dbcc0fea46 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3746,10 +3746,6 @@ void remove_entity_load_avg(struct sched_entity *se) * Similarly for groups, they will have passed through * post_init_entity_util_avg() before unregister_sched_fair_group() * calls this. - * - * XXX in case entity_is_task(se) && task_of(se)->on_rq == MIGRATING - * we could actually get the right time, since we're called with - * rq->lock held, see detach_task(). */ sync_entity_load_avg(se); @@ -6292,6 +6288,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f return new_cpu; } +static void detach_entity_cfs_rq(struct sched_entity *se); + /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -6325,14 +6323,25 @@ static void migrate_task_rq_fair(struct task_struct *p) se->vruntime -= min_vruntime; } - /* - * We are supposed to update the task to "current" time, then its up to date - * and ready to go to new CPU/cfs_rq. But we have difficulty in getting - * what current time is, so simply throw away the out-of-date time. This - * will result in the wakee task is less decayed, but giving the wakee more - * load sounds not bad. - */ - remove_entity_load_avg(&p->se); + if (p->on_rq == TASK_ON_RQ_MIGRATING) { + /* + * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' + * rq->lock and can modify state directly. + */ + lockdep_assert_held(&task_rq(p)->lock); + detach_entity_cfs_rq(&p->se); + + } else { + /* + * We are supposed to update the task to "current" time, then + * its up to date and ready to go to new CPU/cfs_rq. But we + * have difficulty in getting what current time is, so simply + * throw away the out-of-date time. This will result in the + * wakee task is less decayed, but giving the wakee more load + * sounds not bad. + */ + remove_entity_load_avg(&p->se); + } /* Tell new CPU we are migrated */ p->se.avg.last_update_time = 0; -- 2.30.2