forked from Minki/linux
10e2f1acd0
select_idle_siblings() is a known pain point for a number of workloads; it either does too much or not enough and sometimes just does plain wrong. This rewrite attempts to address a number of issues (but sadly not all). The current code does an unconditional sched_domain iteration; with the intent of finding an idle core (on SMT hardware). The problems which this patch tries to address are: - it's pointless to look for idle cores if the machine is real busy; at which point you're just wasting cycles. - its behaviour is inconsistent between SMT and !SMT hardware in that !SMT hardware ends up doing a scan for any idle CPU in the LLC domain, while SMT hardware does a scan for idle cores and if that fails, falls back to a scan for idle threads on the 'target' core. The new code replaces the sched_domain scan with 3 explicit scans: 1) search for an idle core in the LLC 2) search for an idle CPU in the LLC 3) search for an idle thread in the 'target' core where 1 and 3 are conditional on SMT support and 1 and 2 have runtime heuristics to skip the step. Step 1) is conditional on sd_llc_shared->has_idle_cores; when a CPU goes idle and sd_llc_shared->has_idle_cores is false, we scan all SMT siblings of the CPU going idle. Similarly, we clear sd_llc_shared->has_idle_cores when we fail to find an idle core. Step 2) tracks the average cost of the scan and compares this to the average idle time guesstimate for the CPU doing the wakeup. There is a significant fudge factor involved to deal with the variability of the averages. Esp. hackbench was sensitive to this. Step 3) is unconditional; we assume (also per step 1) that scanning all SMT siblings in a core is 'cheap'. With this, SMT systems gain step 2, which cures a few benchmarks -- notably one from Facebook. One 'feature' of the sched_domain iteration, which we preserve in the new code, is that it would start scanning from the 'target' CPU, instead of scanning the cpumask in CPU id order.
This avoids multiple CPUs in the LLC scanning for idle to gang up and find the same CPU quite as much. The downside is that tasks can end up hopping across the LLC for no apparent reason. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
110 lines
2.3 KiB
C
110 lines
2.3 KiB
C
#include "sched.h"
|
|
|
|
/*
 * idle-task scheduling class.
 *
 * (NOTE: these are not related to SCHED_IDLE tasks which are
 *  handled in sched/fair.c)
 */
|
|
|
|
#ifdef CONFIG_SMP
/*
 * Runqueue selection hook for the idle class.
 *
 * The answer is always the CPU that @p is currently assigned to, as
 * reported by task_cpu(); the @cpu, @sd_flag and @flags arguments are
 * not consulted.
 */
static int
select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
	return task_cpu(p); /* IDLE tasks are never migrated */
}
#endif /* CONFIG_SMP */
|
|
|
|
/*
 * Idle tasks are unconditionally rescheduled:
 *
 * neither @p nor @flags is inspected, every call just invokes
 * resched_curr() on @rq.
 */
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
	resched_curr(rq);
}
|
|
|
|
static struct task_struct *
|
|
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
|
|
{
|
|
put_prev_task(rq, prev);
|
|
update_idle_core(rq);
|
|
schedstat_inc(rq->sched_goidle);
|
|
return rq->idle;
|
|
}
|
|
|
|
/*
|
|
* It is not legal to sleep in the idle task - print a warning
|
|
* message if some code attempts to do it:
|
|
*/
|
|
static void
|
|
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
|
|
{
|
|
raw_spin_unlock_irq(&rq->lock);
|
|
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
|
|
dump_stack();
|
|
raw_spin_lock_irq(&rq->lock);
|
|
}
|
|
|
|
/*
 * put_prev_task hook for the idle class: the only action taken is
 * rq_last_tick_reset() on @rq; @prev is not used.
 */
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
	rq_last_tick_reset(rq);
}
|
|
|
|
/*
 * task_tick hook for the idle class: a no-op, none of the arguments
 * are used.
 */
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
|
|
|
|
/*
 * set_curr_task hook for the idle class: a no-op, @rq is not used.
 */
static void set_curr_task_idle(struct rq *rq)
{
}
|
|
|
|
/*
 * switched_to hook for the idle class: hits BUG() unconditionally,
 * without looking at @rq or @p.
 */
static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
	BUG();
}
|
|
|
|
/*
 * prio_changed hook for the idle class: hits BUG() unconditionally,
 * without looking at @rq, @p or @oldprio.
 */
static void
prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
{
	BUG();
}
|
|
|
|
/*
 * get_rr_interval hook for the idle class.
 *
 * Returns 0 regardless of @rq and @task, neither of which is used.
 */
static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
{
	return 0;
}
|
|
|
|
/*
 * update_curr hook for the idle class: a no-op, @rq is not used.
 */
static void update_curr_idle(struct rq *rq)
{
}
|
|
|
|
/*
|
|
* Simple, special scheduling class for the per-CPU idle tasks:
|
|
*/
|
|
const struct sched_class idle_sched_class = {
|
|
/* .next is NULL */
|
|
/* no enqueue/yield_task for idle tasks */
|
|
|
|
/* dequeue is not valid, we print a debug message there: */
|
|
.dequeue_task = dequeue_task_idle,
|
|
|
|
.check_preempt_curr = check_preempt_curr_idle,
|
|
|
|
.pick_next_task = pick_next_task_idle,
|
|
.put_prev_task = put_prev_task_idle,
|
|
|
|
#ifdef CONFIG_SMP
|
|
.select_task_rq = select_task_rq_idle,
|
|
.set_cpus_allowed = set_cpus_allowed_common,
|
|
#endif
|
|
|
|
.set_curr_task = set_curr_task_idle,
|
|
.task_tick = task_tick_idle,
|
|
|
|
.get_rr_interval = get_rr_interval_idle,
|
|
|
|
.prio_changed = prio_changed_idle,
|
|
.switched_to = switched_to_idle,
|
|
.update_curr = update_curr_idle,
|
|
};
|