
Commit 630948e

Peter Zijlstra authored and pundiramit committed
BACKPORT: sched/fair: Fix PELT integrity for new tasks
Vincent and Yuyang found another few scenarios in which entity
tracking goes wobbly.

The scenarios are basically due to the fact that new tasks are not
immediately attached and thereby differ from the normal situation -- a
task is always attached to a cfs_rq load average (such that it includes
its blocked contribution) and is explicitly detached/attached on
migration to another cfs_rq.

Scenario 1: switch to fair class

  p->sched_class = fair_class;

  if (queued)
    enqueue_task(p);
      ...
        enqueue_entity()
          enqueue_entity_load_avg()
            migrated = !sa->last_update_time (true)
            if (migrated)
              attach_entity_load_avg()

  check_class_changed()
    switched_from() (!fair)
    switched_to()   (fair)
      switched_to_fair()
        attach_entity_load_avg()

If @p is a new task that hasn't been fair before, it will have
!last_update_time and, per the above, end up in
attach_entity_load_avg() _twice_.

Scenario 2: change between cgroups

  sched_move_group(p)
    if (queued)
      dequeue_task()
    task_move_group_fair()
      detach_task_cfs_rq()
        detach_entity_load_avg()
      set_task_rq()
      attach_task_cfs_rq()
        attach_entity_load_avg()
    if (queued)
      enqueue_task();
        ...
          enqueue_entity()
            enqueue_entity_load_avg()
              migrated = !sa->last_update_time (true)
              if (migrated)
                attach_entity_load_avg()

As with scenario 1, if @p is a new task, it will have
!last_update_time and we'll end up in attach_entity_load_avg() _twice_.

Furthermore, notice how we do a detach_entity_load_avg() on something
that wasn't attached to begin with.

As stated above, the problem is that the new task isn't yet attached to
the load tracking and thereby violates the invariant assumption.

This patch remedies this by ensuring a new task is indeed properly
attached to the load tracking on creation, through
post_init_entity_util_avg().

Of course, this isn't entirely as straightforward as one might think,
since the task is hashed before we call wake_up_new_task() and thus can
be poked at. We avoid this by adding TASK_NEW and teaching
cpu_cgroup_can_attach() to refuse such tasks.

.:: BACKPORT

Complicated by the fact that many of the lines changed by the original
of this commit were then changed by:

  df217913e72e sched/fair: Factorize attach/detach entity <Vincent Guittot>

and then

  d31b1a66cbe0 sched/fair: Factorize PELT update <Vincent Guittot>

which have both already been backported here.

Reported-by: Yuyang Du <yuyang.du@intel.com>
Reported-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(cherry picked from commit 7dc603c9028ea5d4354e0e317e8481df99b06d7e)
Change-Id: Ibc59eb52310a62709d49a744bd5a24e8b97c4ae8
Signed-off-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Chris Redpath <chris.redpath@arm.com>
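To make the ordering this patch enforces concrete, here is a sketch of the
new-task lifecycle after the change, in the same call-chain notation as the
scenarios above (an illustrative summary of the hunks below, not verbatim
kernel code):

  sched_fork()
    p->state = TASK_NEW                    /* hashed soon after, but cannot run,
                                              and cgroup moves refuse it */
    init_entity_runnable_average(&p->se)

  wake_up_new_task()
    raw_spin_lock_irqsave(&p->pi_lock)
    p->state = TASK_RUNNING                /* from here cpu_cgroup_can_attach()
                                              accepts the task */
    ...
    post_init_entity_util_avg(&p->se)
      attach_entity_cfs_rq(se)             /* attached exactly once;
                                              sa->last_update_time is now set */

Any later switch to the fair class or cgroup move therefore operates on an
already-attached entity, so the double attach (and the detach of a
never-attached entity) from the scenarios above can no longer occur.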
1 parent 73f39e4 commit 630948e

3 files changed

Lines changed: 36 additions & 10 deletions


include/linux/sched.h

Lines changed: 3 additions & 2 deletions
@@ -222,9 +222,10 @@ extern void proc_sched_set_task(struct task_struct *p);
 #define TASK_WAKING             256
 #define TASK_PARKED             512
 #define TASK_NOLOAD             1024
-#define TASK_STATE_MAX          2048
+#define TASK_NEW                2048
+#define TASK_STATE_MAX          4096
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
 
 extern char ___assert_task_state[1 - 2*!!(
                sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
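TASK_STATE_MAX and the character string have to move together because of the
build-time assert in the context lines above. A quick consistency check
(illustrative arithmetic, not part of the patch):

  sizeof(TASK_STATE_TO_CHAR_STR) - 1  =  sizeof("RSDTtXZxKWPNn") - 1  =  13
  ilog2(TASK_STATE_MAX) + 1           =  ilog2(4096) + 1              =  13

With the two sides equal, ___assert_task_state keeps a size of 1 and the file
still compiles; adding TASK_NEW and its 'n' character without doubling
TASK_STATE_MAX (or vice versa) would size the array negatively and break the
build.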

kernel/sched/core.c

Lines changed: 23 additions & 3 deletions
@@ -2259,11 +2259,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 
 	__sched_fork(clone_flags, p);
 	/*
-	 * We mark the process as running here. This guarantees that
+	 * We mark the process as NEW here. This guarantees that
 	 * nobody will actually run it, and a signal or other external
 	 * event cannot wake it up and insert it on the runqueue either.
 	 */
-	p->state = TASK_RUNNING;
+	p->state = TASK_NEW;
 
 	/*
 	 * Make sure we do not leak PI boosting priority to the child.
@@ -2300,6 +2300,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 		p->sched_class = &fair_sched_class;
 	}
 
+	init_entity_runnable_average(&p->se);
+
 	/*
 	 * The child is not yet in the pid-hash so no cgroup attach races,
 	 * and the cgroup is pinned to this child due to cgroup_fork()
@@ -2446,6 +2448,7 @@ void wake_up_new_task(struct task_struct *p)
 	struct rq *rq;
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	p->state = TASK_RUNNING;
 
 	walt_init_new_task_load(p);
 
@@ -8690,6 +8693,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
 	struct cgroup_subsys_state *css;
+	int ret = 0;
 
 	cgroup_taskset_for_each(task, css, tset) {
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -8700,8 +8704,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
 		if (task->sched_class != &fair_sched_class)
 			return -EINVAL;
 #endif
+		/*
+		 * Serialize against wake_up_new_task() such that if its
+		 * running, we're sure to observe its full state.
+		 */
+		raw_spin_lock_irq(&task->pi_lock);
+		/*
+		 * Avoid calling sched_move_task() before wake_up_new_task()
+		 * has happened. This would lead to problems with PELT, due to
+		 * move wanting to detach+attach while we're not attached yet.
+		 */
+		if (task->state == TASK_NEW)
+			ret = -EINVAL;
+		raw_spin_unlock_irq(&task->pi_lock);
+
+		if (ret)
+			break;
 	}
-	return 0;
+	return ret;
 }
 
 static void cpu_cgroup_attach(struct cgroup_taskset *tset)
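The pi_lock section added to cpu_cgroup_can_attach() closes the window between
fork and wake_up_new_task() in which the child is already hashed and can be
poked at. A sketch of the ordering this enforces, assuming a concurrent cgroup
migration of the new task (illustrative, not verbatim kernel code):

  CPU0: fork                              CPU1: cgroup move of p
  ----------                              ----------------------
  sched_fork()
    p->state = TASK_NEW
  /* p becomes visible in the pid hash */
                                          cpu_cgroup_can_attach()
                                            raw_spin_lock_irq(&p->pi_lock)
                                            task->state == TASK_NEW
                                              -> ret = -EINVAL, move refused
                                            raw_spin_unlock_irq(&p->pi_lock)
  wake_up_new_task()
    raw_spin_lock_irqsave(&p->pi_lock)
    p->state = TASK_RUNNING
    ...

Since wake_up_new_task() flips the state to TASK_RUNNING under the same lock,
can_attach() either observes TASK_NEW and refuses, or observes a task whose
entity post_init_entity_util_avg() has already attached, making
sched_move_task()'s detach+attach safe again.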

kernel/sched/fair.c

Lines changed: 10 additions & 5 deletions
@@ -766,7 +766,9 @@ void init_entity_runnable_average(struct sched_entity *se)
 }
 
 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
 static void attach_entity_cfs_rq(struct sched_entity *se);
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
 
 /*
  * With new tasks being created, their initial util_avgs are extrapolated
@@ -837,7 +839,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
 	attach_entity_cfs_rq(se);
 }
 
-#else
+#else /* !CONFIG_SMP */
 void init_entity_runnable_average(struct sched_entity *se)
 {
 }
@@ -3312,11 +3314,14 @@ void remove_entity_load_avg(struct sched_entity *se)
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 	/*
-	 * Newly created task or never used group entity should not be removed
-	 * from its (source) cfs_rq
+	 * tasks cannot exit without having gone through wake_up_new_task() ->
+	 * post_init_entity_util_avg() which will have added things to the
+	 * cfs_rq, so we can remove unconditionally.
+	 *
+	 * Similarly for groups, they will have passed through
+	 * post_init_entity_util_avg() before unregister_sched_fair_group()
+	 * calls this.
 	 */
-	if (se->avg.last_update_time == 0)
-		return;
 
 	sync_entity_load_avg(se);
 	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
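The dropped early return relied on last_update_time == 0 meaning "never
attached". With the fork-path changes above that situation can no longer reach
this function; stated as an invariant in comment form (an illustrative sketch,
not part of the patch):

  /*
   * On entry to remove_entity_load_avg():
   *   - a task entity has been through wake_up_new_task() ->
   *     post_init_entity_util_avg() -> attach_entity_cfs_rq(), and
   *   - a group entity has been through post_init_entity_util_avg()
   *     before unregister_sched_fair_group(),
   * so the entity already contributes to its cfs_rq and both
   * sync_entity_load_avg() and the removed_load_avg update are safe to
   * run unconditionally.
   */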
