sched: group-scheduler core
Add interface to control cpu bandwidth allocation to task-groups.

(not yet configurable, due to missing CONFIG_CONTAINERS)

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
parent 119fe5e068
commit 29f59db3a7
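As a usage illustration (not part of this patch): the mount point, group name, pid and the "tasks" file below are assumptions based on the container patch set this depends on; only the "shares" file is defined here, by sched_populate(). Once CONFIG_CONTAINERS exists, a group could be created and tuned roughly like this:

/*
 * Hypothetical user-space sketch. Assumes the container filesystem is
 * mounted as: mount -t container -o cpu none /dev/cpuctl
 */
#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0)
                return -1;
        if (write(fd, val, strlen(val)) != (ssize_t) strlen(val)) {
                close(fd);
                return -1;
        }
        return close(fd);
}

int main(void)
{
        /* creating a group directory triggers sched_create_group() */
        mkdir("/dev/cpuctl/batch", 0755);
        /* moving a task goes through sched_can_attach()/sched_move_task() */
        write_str("/dev/cpuctl/batch/tasks", "1234");
        /* cpu_shares_write(); the default share is NICE_0_LOAD (1024) */
        write_str("/dev/cpuctl/batch/shares", "2048");
        return 0;
}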
init/Kconfig

@@ -281,6 +281,15 @@ config CPUSETS
 
          Say N if unsure.
 
+config FAIR_GROUP_SCHED
+        bool "Fair group scheduler"
+        depends on EXPERIMENTAL && CONTAINERS
+        help
+          This option enables you to group tasks and control CPU resource
+          allocation to such groups.
+
+          Say N if unsure.
+
 config SYSFS_DEPRECATED
         bool "Create deprecated sysfs files"
         default y
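For reference, a .config sketch of what exercising this option would take; hedged, since CONFIG_CONTAINERS is not yet mergeable per the commit message:

CONFIG_EXPERIMENTAL=y
CONFIG_CONTAINERS=y
CONFIG_FAIR_GROUP_SCHED=y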
kernel/sched.c | 346
@@ -171,6 +171,58 @@ struct rt_prio_array {
         struct list_head queue[MAX_RT_PRIO];
 };
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+#include <linux/container.h>
+
+struct cfs_rq;
+
+/* task group related information */
+struct task_grp {
+        struct container_subsys_state css;
+        /* schedulable entities of this group on each cpu */
+        struct sched_entity **se;
+        /* runqueue "owned" by this group on each cpu */
+        struct cfs_rq **cfs_rq;
+        unsigned long shares;
+};
+
+/* Default task group's sched entity on each cpu */
+static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
+/* Default task group's cfs_rq on each cpu */
+static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+
+static struct sched_entity *init_sched_entity_p[CONFIG_NR_CPUS];
+static struct cfs_rq *init_cfs_rq_p[CONFIG_NR_CPUS];
+
+/* Default task group.
+ * Every task in the system belongs to this group at bootup.
+ */
+static struct task_grp init_task_grp = {
+        .se     = init_sched_entity_p,
+        .cfs_rq = init_cfs_rq_p,
+};
+
+/* return group to which a task belongs */
+static inline struct task_grp *task_grp(struct task_struct *p)
+{
+        return container_of(task_subsys_state(p, cpu_subsys_id),
+                            struct task_grp, css);
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+        p->se.cfs_rq = task_grp(p)->cfs_rq[task_cpu(p)];
+        p->se.parent = task_grp(p)->se[task_cpu(p)];
+}
+
+#else
+
+static inline void set_task_cfs_rq(struct task_struct *p) { }
+
+#endif  /* CONFIG_FAIR_GROUP_SCHED */
+
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
         struct load_weight load;

@@ -197,6 +249,7 @@ struct cfs_rq {
          * list is used during load balance.
          */
         struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
+        struct task_grp *tg;    /* group that "owns" this runqueue */
 #endif
 };
 
@@ -419,18 +472,6 @@ unsigned long long cpu_clock(int cpu)
         return now;
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* Change a task's ->cfs_rq if it moves across CPUs */
-static inline void set_task_cfs_rq(struct task_struct *p)
-{
-        p->se.cfs_rq = &task_rq(p)->cfs;
-}
-#else
-static inline void set_task_cfs_rq(struct task_struct *p)
-{
-}
-#endif
-
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next) do { } while (0)
 #endif
@@ -970,8 +1011,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
 #ifdef CONFIG_SMP
         task_thread_info(p)->cpu = cpu;
-        set_task_cfs_rq(p);
 #endif
+        set_task_cfs_rq(p);
 }
 
 #ifdef CONFIG_SMP
@@ -3885,8 +3926,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
         oldprio = p->prio;
         on_rq = p->se.on_rq;
-        if (on_rq)
+        if (on_rq) {
                 dequeue_task(rq, p, 0);
+                if (task_running(rq, p))
+                        p->sched_class->put_prev_task(rq, p);
+        }
 
         if (rt_prio(prio))
                 p->sched_class = &rt_sched_class;
@@ -3905,6 +3949,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         if (task_running(rq, p)) {
                 if (p->prio > oldprio)
                         resched_task(rq->curr);
+                p->sched_class->set_curr_task(rq);
         } else {
                 check_preempt_curr(rq, p);
         }
@@ -4190,8 +4235,11 @@ recheck:
         }
         update_rq_clock(rq);
         on_rq = p->se.on_rq;
-        if (on_rq)
+        if (on_rq) {
                 deactivate_task(rq, p, 0);
+                if (task_running(rq, p))
+                        p->sched_class->put_prev_task(rq, p);
+        }
         oldprio = p->prio;
         __setscheduler(rq, p, policy, param->sched_priority);
         if (on_rq) {
@@ -4204,6 +4252,7 @@ recheck:
                 if (task_running(rq, p)) {
                         if (p->prio > oldprio)
                                 resched_task(rq->curr);
+                        p->sched_class->set_curr_task(rq);
                 } else {
                         check_preempt_curr(rq, p);
                 }
@@ -6444,7 +6493,25 @@ void __init sched_init(void)
                 init_cfs_rq(&rq->cfs, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-                list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+                {
+                        struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
+                        struct sched_entity *se =
+                                        &per_cpu(init_sched_entity, i);
+
+                        init_cfs_rq_p[i] = cfs_rq;
+                        init_cfs_rq(cfs_rq, rq);
+                        cfs_rq->tg = &init_task_grp;
+                        list_add(&cfs_rq->leaf_cfs_rq_list,
+                                        &rq->leaf_cfs_rq_list);
+
+                        init_sched_entity_p[i] = se;
+                        se->cfs_rq = &rq->cfs;
+                        se->my_q = cfs_rq;
+                        se->load.weight = NICE_0_LOAD;
+                        se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
+                        se->parent = NULL;
+                }
+                init_task_grp.shares = NICE_0_LOAD;
 #endif
 
                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -6632,3 +6699,250 @@ void set_curr_task(int cpu, struct task_struct *p)
 }
 
 #endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* return corresponding task_grp object of a container */
+static inline struct task_grp *container_tg(struct container *cont)
+{
+        return container_of(container_subsys_state(cont, cpu_subsys_id),
+                            struct task_grp, css);
+}
+
+/* allocate runqueue etc for a new task group */
+static struct container_subsys_state *
+sched_create_group(struct container_subsys *ss, struct container *cont)
+{
+        struct task_grp *tg;
+        struct cfs_rq *cfs_rq;
+        struct sched_entity *se;
+        int i;
+
+        if (!cont->parent) {
+                /* This is early initialization for the top container */
+                init_task_grp.css.container = cont;
+                return &init_task_grp.css;
+        }
+
+        /* we support only 1-level deep hierarchical scheduler atm */
+        if (cont->parent->parent)
+                return ERR_PTR(-EINVAL);
+
+        tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+        if (!tg)
+                return ERR_PTR(-ENOMEM);
+
+        tg->cfs_rq = kzalloc(sizeof(cfs_rq) * num_possible_cpus(), GFP_KERNEL);
+        if (!tg->cfs_rq)
+                goto err;
+        tg->se = kzalloc(sizeof(se) * num_possible_cpus(), GFP_KERNEL);
+        if (!tg->se)
+                goto err;
+
+        for_each_possible_cpu(i) {
+                struct rq *rq = cpu_rq(i);
+
+                cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL,
+                                      cpu_to_node(i));
+                if (!cfs_rq)
+                        goto err;
+
+                se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL,
+                                  cpu_to_node(i));
+                if (!se)
+                        goto err;
+
+                memset(cfs_rq, 0, sizeof(struct cfs_rq));
+                memset(se, 0, sizeof(struct sched_entity));
+
+                tg->cfs_rq[i] = cfs_rq;
+                init_cfs_rq(cfs_rq, rq);
+                cfs_rq->tg = tg;
+                list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+
+                tg->se[i] = se;
+                se->cfs_rq = &rq->cfs;
+                se->my_q = cfs_rq;
+                se->load.weight = NICE_0_LOAD;
+                se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
+                se->parent = NULL;
+        }
+
+        tg->shares = NICE_0_LOAD;
+
+        /* Bind the container to the task_grp object we just created */
+        tg->css.container = cont;
+
+        return &tg->css;
+
+err:
+        for_each_possible_cpu(i) {
+                if (tg->cfs_rq && tg->cfs_rq[i])
+                        kfree(tg->cfs_rq[i]);
+                if (tg->se && tg->se[i])
+                        kfree(tg->se[i]);
+        }
+        if (tg->cfs_rq)
+                kfree(tg->cfs_rq);
+        if (tg->se)
+                kfree(tg->se);
+        if (tg)
+                kfree(tg);
+
+        return ERR_PTR(-ENOMEM);
+}
+
+/* destroy runqueue etc associated with a task group */
+static void sched_destroy_group(struct container_subsys *ss,
+                                struct container *cont)
+{
+        struct task_grp *tg = container_tg(cont);
+        struct cfs_rq *cfs_rq;
+        struct sched_entity *se;
+        int i;
+
+        for_each_possible_cpu(i) {
+                cfs_rq = tg->cfs_rq[i];
+                list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+        }
+
+        /* wait for possible concurrent references to cfs_rqs to complete */
+        synchronize_sched();
+
+        /* now it should be safe to free those cfs_rqs */
+        for_each_possible_cpu(i) {
+                cfs_rq = tg->cfs_rq[i];
+                kfree(cfs_rq);
+
+                se = tg->se[i];
+                kfree(se);
+        }
+
+        kfree(tg->cfs_rq);
+        kfree(tg->se);
+        kfree(tg);
+}
+
+static int sched_can_attach(struct container_subsys *ss,
+                            struct container *cont, struct task_struct *tsk)
+{
+        /* We don't support RT-tasks being in separate groups */
+        if (tsk->sched_class != &fair_sched_class)
+                return -EINVAL;
+
+        return 0;
+}
+
+/* change task's runqueue when it moves between groups */
+static void sched_move_task(struct container_subsys *ss, struct container *cont,
+                        struct container *old_cont, struct task_struct *tsk)
+{
+        int on_rq, running;
+        unsigned long flags;
+        struct rq *rq;
+
+        rq = task_rq_lock(tsk, &flags);
+
+        if (tsk->sched_class != &fair_sched_class)
+                goto done;
+
+        update_rq_clock(rq);
+
+        running = task_running(rq, tsk);
+        on_rq = tsk->se.on_rq;
+
+        if (on_rq) {
+                dequeue_task(rq, tsk, 0);
+                if (unlikely(running))
+                        tsk->sched_class->put_prev_task(rq, tsk);
+        }
+
+        set_task_cfs_rq(tsk);
+
+        if (on_rq) {
+                enqueue_task(rq, tsk, 0);
+                if (unlikely(running))
+                        tsk->sched_class->set_curr_task(rq);
+        }
+
+done:
+        task_rq_unlock(rq, &flags);
+}
+
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+        struct cfs_rq *cfs_rq = se->cfs_rq;
+        struct rq *rq = cfs_rq->rq;
+        int on_rq;
+
+        spin_lock_irq(&rq->lock);
+
+        on_rq = se->on_rq;
+        if (on_rq)
+                dequeue_entity(cfs_rq, se, 0);
+
+        se->load.weight = shares;
+        se->load.inv_weight = div64_64((1ULL<<32), shares);
+
+        if (on_rq)
+                enqueue_entity(cfs_rq, se, 0);
+
+        spin_unlock_irq(&rq->lock);
+}
+
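/*
 * Editor's note, not part of the patch: set_se_shares() above stores
 * both the raw weight and inv_weight = 2^32 / weight. A minimal sketch
 * of the fixed-point trick the precomputed inverse enables -- dividing
 * by a group's weight becomes a multiply plus a shift, avoiding a
 * per-use division (assumes x fits in 32 bits so the 64-bit product
 * cannot overflow):
 */
#include <stdint.h>

static inline uint64_t div_by_weight(uint64_t x, uint32_t inv_weight)
{
        return (x * inv_weight) >> 32;  /* approximately x / weight */
}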
+static ssize_t cpu_shares_write(struct container *cont, struct cftype *cftype,
+                                struct file *file, const char __user *userbuf,
+                                size_t nbytes, loff_t *ppos)
+{
+        int i;
+        unsigned long shareval;
+        struct task_grp *tg = container_tg(cont);
+        char buffer[2 * sizeof(unsigned long) + 1];
+
+        if (nbytes > 2 * sizeof(unsigned long)) /* safety check */
+                return -E2BIG;
+
+        if (copy_from_user(buffer, userbuf, nbytes))
+                return -EFAULT;
+
+        buffer[nbytes] = 0;     /* nul-terminate */
+        shareval = simple_strtoul(buffer, NULL, 10);
+
+        tg->shares = shareval;
+        for_each_possible_cpu(i)
+                set_se_shares(tg->se[i], shareval);
+
+        return nbytes;
+}
+
+static u64 cpu_shares_read_uint(struct container *cont, struct cftype *cft)
+{
+        struct task_grp *tg = container_tg(cont);
+
+        return (u64) tg->shares;
+}
+
+struct cftype cpuctl_share = {
+        .name = "shares",
+        .read_uint = cpu_shares_read_uint,
+        .write = cpu_shares_write,
+};
+
+static int sched_populate(struct container_subsys *ss, struct container *cont)
+{
+        return container_add_file(cont, ss, &cpuctl_share);
+}
+
+struct container_subsys cpu_subsys = {
+        .name = "cpu",
+        .create = sched_create_group,
+        .destroy = sched_destroy_group,
+        .can_attach = sched_can_attach,
+        .attach = sched_move_task,
+        .populate = sched_populate,
+        .subsys_id = cpu_subsys_id,
+        .early_init = 1,
+};
+
+#endif  /* CONFIG_FAIR_GROUP_SCHED */
kernel/sched_fair.c

@@ -610,8 +610,7 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
  */
 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 {
-        /* A later patch will take group into account */
-        return &cpu_rq(this_cpu)->cfs;
+        return cfs_rq->tg->cfs_rq[this_cpu];
 }
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
kernel/sched_idletask.c

@@ -50,6 +50,10 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr)
 {
 }
 
+static void set_curr_task_idle(struct rq *rq)
+{
+}
+
 /*
  * Simple, special scheduling class for the per-CPU idle tasks:
  */

@@ -66,6 +70,7 @@ static struct sched_class idle_sched_class __read_mostly = {
 
         .load_balance           = load_balance_idle,
 
+        .set_curr_task          = set_curr_task_idle,
         .task_tick              = task_tick_idle,
         /* no .task_new for idle tasks */
 };
kernel/sched_rt.c

@@ -218,6 +218,10 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
         }
 }
 
+static void set_curr_task_rt(struct rq *rq)
+{
+}
+
 static struct sched_class rt_sched_class __read_mostly = {
         .enqueue_task           = enqueue_task_rt,
         .dequeue_task           = dequeue_task_rt,

@@ -230,5 +234,6 @@ static struct sched_class rt_sched_class __read_mostly = {
 
         .load_balance           = load_balance_rt,
 
+        .set_curr_task          = set_curr_task_rt,
         .task_tick              = task_tick_rt,
 };