Skip to content

Commit 2b022c3

Browse files
Brendan Jackmanpundiramit
authored andcommitted
FROMLIST: sched/fair: Use wake_q length as a hint for wake_wide
(from https://patchwork.kernel.org/patch/9895261/) This patch adds a parameter to select_task_rq, sibling_count_hint allowing the caller, where it has this information, to inform the sched_class the number of tasks that are being woken up as part of the same event. The wake_q mechanism is one case where this information is available. select_task_rq_fair can then use the information to detect that it needs to widen the search space for task placement in order to avoid overloading the last-level cache domain's CPUs. * * * The reason I am investigating this change is the following use case on ARM big.LITTLE (asymmetrical CPU capacity): 1 task per CPU, which all repeatedly do X amount of work then pthread_barrier_wait (i.e. sleep until the last task finishes its X and hits the barrier). On big.LITTLE, the tasks which get a "big" CPU finish faster, and then those CPUs pull over the tasks that are still running: v CPU v ->time-> ------------- 0 (big) 11111 /333 ------------- 1 (big) 22222 /444| ------------- 2 (LITTLE) 333333/ ------------- 3 (LITTLE) 444444/ ------------- Now when task 4 hits the barrier (at |) and wakes the others up, there are 4 tasks with prev_cpu=<big> and 0 tasks with prev_cpu=<little>. want_affine therefore means that we'll only look in CPUs 0 and 1 (sd_llc), so tasks will be unnecessarily coscheduled on the bigs until the next load balance, something like this: v CPU v ->time-> ------------------------ 0 (big) 11111 /333 31313\33333 ------------------------ 1 (big) 22222 /444|424\4444444 ------------------------ 2 (LITTLE) 333333/ \222222 ------------------------ 3 (LITTLE) 444444/ \1111 ------------------------ ^^^ underutilization So, I'm trying to get want_affine = 0 for these tasks. I don't _think_ any incarnation of the wakee_flips mechanism can help us here because which task is waker and which tasks are wakees generally changes with each iteration. However pthread_barrier_wait (or more accurately FUTEX_WAKE) has the nice property that we know exactly how many tasks are being woken, so we can cheat. It might be a disadvantage that we "widen" _every_ task that's woken in an event, while select_idle_sibling would work fine for the first sd_llc_size - 1 tasks. IIUC, if wake_affine() behaves correctly this trick wouldn't be necessary on SMP systems, so it might be best guarded by the presence of SD_ASYM_CPUCAPACITY? * * * Final note.. In order to observe "perfect" behaviour for this use case, I also had to disable the TTWU_QUEUE sched feature. Suppose during the wakeup above we are working through the work queue and have placed tasks 3 and 2, and are about to place task 1: v CPU v ->time-> -------------- 0 (big) 11111 /333 3 -------------- 1 (big) 22222 /444|4 -------------- 2 (LITTLE) 333333/ 2 -------------- 3 (LITTLE) 444444/ <- Task 1 should go here -------------- If TTWU_QUEUE is enabled, we will not yet have enqueued task 2 (having instead sent a reschedule IPI) or attached its load to CPU 2. So we are likely to also place task 1 on cpu 2. Disabling TTWU_QUEUE means that we enqueue task 2 before placing task 1, solving this issue. TTWU_QUEUE is there to minimise rq lock contention, and I guess that this contention is less of an issue on big.LITTLE systems since they have relatively few CPUs, which suggests the trade-off makes sense here. Change-Id: I2080302839a263e0841a89efea8589ea53bbda9c Signed-off-by: Brendan Jackman <brendan.jackman@arm.com> Signed-off-by: Chris Redpath <chris.redpath@arm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Josef Bacik <josef@toxicpanda.com> Cc: Joel Fernandes <joelaf@google.com> Cc: Mike Galbraith <efault@gmx.de> Cc: Matt Fleming <matt@codeblueprint.co.uk>
1 parent ae0177a commit 2b022c3

8 files changed

Lines changed: 49 additions & 25 deletions

File tree

include/linux/sched.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -993,12 +993,13 @@ struct wake_q_node {
993993
struct wake_q_head {
994994
struct wake_q_node *first;
995995
struct wake_q_node **lastp;
996+
int count;
996997
};
997998

998999
#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
9991000

10001001
#define WAKE_Q(name) \
1001-
struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
1002+
struct wake_q_head name = { WAKE_Q_TAIL, &name.first, 0 }
10021003

10031004
extern void wake_q_add(struct wake_q_head *head,
10041005
struct task_struct *task);

kernel/sched/core.c

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
546546
if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
547547
return;
548548

549+
head->count++;
550+
549551
get_task_struct(task);
550552

551553
/*
@@ -555,6 +557,10 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
555557
head->lastp = &node->next;
556558
}
557559

560+
static int
561+
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
562+
int sibling_count_hint);
563+
558564
void wake_up_q(struct wake_q_head *head)
559565
{
560566
struct wake_q_node *node = head->first;
@@ -569,10 +575,10 @@ void wake_up_q(struct wake_q_head *head)
569575
task->wake_q.next = NULL;
570576

571577
/*
572-
* wake_up_process() implies a wmb() to pair with the queueing
578+
* try_to_wake_up() implies a wmb() to pair with the queueing
573579
* in wake_q_add() so as not to miss wakeups.
574580
*/
575-
wake_up_process(task);
581+
try_to_wake_up(task, TASK_NORMAL, 0, head->count);
576582
put_task_struct(task);
577583
}
578584
}
@@ -1642,12 +1648,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
16421648
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
16431649
*/
16441650
static inline
1645-
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1651+
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
1652+
int sibling_count_hint)
16461653
{
16471654
lockdep_assert_held(&p->pi_lock);
16481655

16491656
if (p->nr_cpus_allowed > 1)
1650-
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1657+
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
1658+
sibling_count_hint);
16511659

16521660
/*
16531661
* In order not to call set_task_cpu() on a blocking task we need
@@ -1932,6 +1940,8 @@ static void ttwu_queue(struct task_struct *p, int cpu)
19321940
* @p: the thread to be awakened
19331941
* @state: the mask of task states that can be woken
19341942
* @wake_flags: wake modifier flags (WF_*)
1943+
* @sibling_count_hint: A hint at the number of threads that are being woken up
1944+
* in this event.
19351945
*
19361946
* Put it on the run-queue if it's not already there. The "current"
19371947
* thread is always on the run-queue (except when the actual
@@ -1943,7 +1953,8 @@ static void ttwu_queue(struct task_struct *p, int cpu)
19431953
* or @state didn't match @p's state.
19441954
*/
19451955
static int
1946-
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1956+
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
1957+
int sibling_count_hint)
19471958
{
19481959
unsigned long flags;
19491960
int cpu, success = 0;
@@ -2044,8 +2055,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
20442055
if (p->sched_class->task_waking)
20452056
p->sched_class->task_waking(p);
20462057

2047-
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
2048-
2058+
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
2059+
sibling_count_hint);
20492060
if (task_cpu(p) != cpu) {
20502061
wake_flags |= WF_MIGRATED;
20512062
set_task_cpu(p, cpu);
@@ -2127,13 +2138,13 @@ static void try_to_wake_up_local(struct task_struct *p)
21272138
*/
21282139
int wake_up_process(struct task_struct *p)
21292140
{
2130-
return try_to_wake_up(p, TASK_NORMAL, 0);
2141+
return try_to_wake_up(p, TASK_NORMAL, 0, 1);
21312142
}
21322143
EXPORT_SYMBOL(wake_up_process);
21332144

21342145
int wake_up_state(struct task_struct *p, unsigned int state)
21352146
{
2136-
return try_to_wake_up(p, state, 0);
2147+
return try_to_wake_up(p, state, 0, 1);
21372148
}
21382149

21392150
/*
@@ -2467,7 +2478,7 @@ void wake_up_new_task(struct task_struct *p)
24672478
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
24682479
* as we're not fully set-up yet.
24692480
*/
2470-
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2481+
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
24712482
#endif
24722483
rq = __task_rq_lock(p);
24732484
update_rq_clock(rq);
@@ -2905,7 +2916,7 @@ void sched_exec(void)
29052916
int dest_cpu;
29062917

29072918
raw_spin_lock_irqsave(&p->pi_lock, flags);
2908-
dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2919+
dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
29092920
if (dest_cpu == smp_processor_id())
29102921
goto unlock;
29112922

@@ -3560,7 +3571,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
35603571
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
35613572
void *key)
35623573
{
3563-
return try_to_wake_up(curr->private, mode, wake_flags);
3574+
return try_to_wake_up(curr->private, mode, wake_flags, 1);
35643575
}
35653576
EXPORT_SYMBOL(default_wake_function);
35663577

kernel/sched/deadline.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1070,7 +1070,8 @@ static void yield_task_dl(struct rq *rq)
10701070
static int find_later_rq(struct task_struct *task);
10711071

10721072
static int
1073-
select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
1073+
select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags,
1074+
int sibling_count_hint)
10741075
{
10751076
struct task_struct *curr;
10761077
struct rq *rq;

kernel/sched/fair.c

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5773,15 +5773,18 @@ energy_diff(struct energy_env *eenv)
57735773
* being client/server, worker/dispatcher, interrupt source or whatever is
57745774
* irrelevant, spread criteria is apparent partner count exceeds socket size.
57755775
*/
5776-
static int wake_wide(struct task_struct *p)
5776+
static int wake_wide(struct task_struct *p, int sibling_count_hint)
57775777
{
57785778
unsigned int master = current->wakee_flips;
57795779
unsigned int slave = p->wakee_flips;
5780-
int factor = this_cpu_read(sd_llc_size);
5780+
int llc_size = this_cpu_read(sd_llc_size);
5781+
5782+
if (sibling_count_hint >= llc_size)
5783+
return 1;
57815784

57825785
if (master < slave)
57835786
swap(master, slave);
5784-
if (slave < factor || master < slave * factor)
5787+
if (slave < llc_size || master < slave * llc_size)
57855788
return 0;
57865789
return 1;
57875790
}
@@ -6754,17 +6757,21 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync
67546757
* preempt must be disabled.
67556758
*/
67566759
static int
6757-
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
6760+
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
6761+
int sibling_count_hint)
67586762
{
67596763
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
67606764
int cpu = smp_processor_id();
67616765
int new_cpu = prev_cpu;
67626766
int want_affine = 0;
67636767
int sync = wake_flags & WF_SYNC;
67646768

6765-
if (sd_flag & SD_BALANCE_WAKE)
6766-
want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
6767-
&& cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
6769+
if (sd_flag & SD_BALANCE_WAKE) {
6770+
record_wakee(p);
6771+
want_affine = !wake_wide(p, sibling_count_hint) &&
6772+
!wake_cap(p, cpu, prev_cpu) &&
6773+
cpumask_test_cpu(cpu, &p->cpus_allowed);
6774+
}
67686775

67696776
if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
67706777
return select_energy_cpu_brute(p, prev_cpu, sync);

kernel/sched/idle_task.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99

1010
#ifdef CONFIG_SMP
1111
static int
12-
select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
12+
select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags,
13+
int sibling_count_hint)
1314
{
1415
return task_cpu(p); /* IDLE tasks as never migrated */
1516
}

kernel/sched/rt.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1372,7 +1372,8 @@ static void yield_task_rt(struct rq *rq)
13721372
static int find_lowest_rq(struct task_struct *task);
13731373

13741374
static int
1375-
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1375+
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
1376+
int sibling_count_hint)
13761377
{
13771378
struct task_struct *curr;
13781379
struct rq *rq;

kernel/sched/sched.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1261,7 +1261,8 @@ struct sched_class {
12611261
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
12621262

12631263
#ifdef CONFIG_SMP
1264-
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1264+
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags,
1265+
int subling_count_hint);
12651266
void (*migrate_task_rq)(struct task_struct *p);
12661267

12671268
void (*task_waking) (struct task_struct *task);

kernel/sched/stop_task.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212

1313
#ifdef CONFIG_SMP
1414
static int
15-
select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
15+
select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags,
16+
int sibling_count_hint)
1617
{
1718
return task_cpu(p); /* stop tasks as never migrate */
1819
}

0 commit comments

Comments
 (0)