sched: group-scheduler core

Add an interface to control CPU bandwidth allocation to task groups.

(This is not yet configurable, because CONFIG_CONTAINERS is still missing.)

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
This commit is contained in:
Srivatsa Vaddagiri 2007-10-15 17:00:07 +02:00 committed by Ingo Molnar
parent 119fe5e068
commit 29f59db3a7
5 changed files with 350 additions and 18 deletions

View File

@ -281,6 +281,15 @@ config CPUSETS
Say N if unsure.
config FAIR_GROUP_SCHED
bool "Fair group scheduler"
depends on EXPERIMENTAL && CONTAINERS
help
This option enables you to group tasks and control CPU resource
allocation to such groups.
Say N if unsure.
config SYSFS_DEPRECATED
bool "Create deprecated sysfs files"
default y

View File

@ -171,6 +171,58 @@ struct rt_prio_array {
struct list_head queue[MAX_RT_PRIO];
};
#ifdef CONFIG_FAIR_GROUP_SCHED
#include <linux/container.h>
struct cfs_rq;
/*
 * Task group related information.
 *
 * One instance exists per task group. The 'se' and 'cfs_rq' members are
 * arrays of pointers indexed by cpu number.
 */
struct task_grp {
/* container subsystem state; container_of() on it yields the task_grp */
struct container_subsys_state css;
/* schedulable entities of this group on each cpu */
struct sched_entity **se;
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
/* group weight; applied as the load weight of each per-cpu entity */
unsigned long shares;
};
/* Default task group's sched entity on each cpu */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */
static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
static struct sched_entity *init_sched_entity_p[CONFIG_NR_CPUS];
static struct cfs_rq *init_cfs_rq_p[CONFIG_NR_CPUS];
/* Default task group.
* Every task in system belong to this group at bootup.
*/
static struct task_grp init_task_grp = {
.se = init_sched_entity_p,
.cfs_rq = init_cfs_rq_p,
};
/* Map a task to the task group it is currently attached to */
static inline struct task_grp *task_grp(struct task_struct *p)
{
	struct container_subsys_state *state;

	state = task_subsys_state(p, cpu_subsys_id);

	return container_of(state, struct task_grp, css);
}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_cfs_rq(struct task_struct *p)
{
p->se.cfs_rq = task_grp(p)->cfs_rq[task_cpu(p)];
p->se.parent = task_grp(p)->se[task_cpu(p)];
}
#else
static inline void set_task_cfs_rq(struct task_struct *p) { }
#endif /* CONFIG_FAIR_GROUP_SCHED */
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
@ -197,6 +249,7 @@ struct cfs_rq {
* list is used during load balance.
*/
struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
struct task_grp *tg; /* group that "owns" this runqueue */
#endif
};
@ -419,18 +472,6 @@ unsigned long long cpu_clock(int cpu)
return now;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Change a task's ->cfs_rq if it moves across CPUs */
static inline void set_task_cfs_rq(struct task_struct *p)
{
p->se.cfs_rq = &task_rq(p)->cfs;
}
#else
static inline void set_task_cfs_rq(struct task_struct *p)
{
}
#endif
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
@ -970,8 +1011,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_SMP
task_thread_info(p)->cpu = cpu;
set_task_cfs_rq(p);
#endif
set_task_cfs_rq(p);
}
#ifdef CONFIG_SMP
@ -3885,8 +3926,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
oldprio = p->prio;
on_rq = p->se.on_rq;
if (on_rq)
if (on_rq) {
dequeue_task(rq, p, 0);
if (task_running(rq, p))
p->sched_class->put_prev_task(rq, p);
}
if (rt_prio(prio))
p->sched_class = &rt_sched_class;
@ -3905,6 +3949,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (task_running(rq, p)) {
if (p->prio > oldprio)
resched_task(rq->curr);
p->sched_class->set_curr_task(rq);
} else {
check_preempt_curr(rq, p);
}
@ -4190,8 +4235,11 @@ recheck:
}
update_rq_clock(rq);
on_rq = p->se.on_rq;
if (on_rq)
if (on_rq) {
deactivate_task(rq, p, 0);
if (task_running(rq, p))
p->sched_class->put_prev_task(rq, p);
}
oldprio = p->prio;
__setscheduler(rq, p, policy, param->sched_priority);
if (on_rq) {
@ -4204,6 +4252,7 @@ recheck:
if (task_running(rq, p)) {
if (p->prio > oldprio)
resched_task(rq->curr);
p->sched_class->set_curr_task(rq);
} else {
check_preempt_curr(rq, p);
}
@ -6444,7 +6493,25 @@ void __init sched_init(void)
init_cfs_rq(&rq->cfs, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
{
struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
struct sched_entity *se =
&per_cpu(init_sched_entity, i);
init_cfs_rq_p[i] = cfs_rq;
init_cfs_rq(cfs_rq, rq);
cfs_rq->tg = &init_task_grp;
list_add(&cfs_rq->leaf_cfs_rq_list,
&rq->leaf_cfs_rq_list);
init_sched_entity_p[i] = se;
se->cfs_rq = &rq->cfs;
se->my_q = cfs_rq;
se->load.weight = NICE_0_LOAD;
se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
se->parent = NULL;
}
init_task_grp.shares = NICE_0_LOAD;
#endif
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@ -6632,3 +6699,250 @@ void set_curr_task(int cpu, struct task_struct *p)
}
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Look up the task_grp object that backs a container */
static inline struct task_grp *container_tg(struct container *cont)
{
	struct container_subsys_state *state;

	state = container_subsys_state(cont, cpu_subsys_id);

	return container_of(state, struct task_grp, css);
}
/* allocate runqueue etc for a new task group */
static struct container_subsys_state *
sched_create_group(struct container_subsys *ss, struct container *cont)
{
struct task_grp *tg;
struct cfs_rq *cfs_rq;
struct sched_entity *se;
int i;
if (!cont->parent) {
/* This is early initialization for the top container */
init_task_grp.css.container = cont;
return &init_task_grp.css;
}
/* we support only 1-level deep hierarchical scheduler atm */
if (cont->parent->parent)
return ERR_PTR(-EINVAL);
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
return ERR_PTR(-ENOMEM);
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * num_possible_cpus(), GFP_KERNEL);
if (!tg->cfs_rq)
goto err;
tg->se = kzalloc(sizeof(se) * num_possible_cpus(), GFP_KERNEL);
if (!tg->se)
goto err;
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL,
cpu_to_node(i));
if (!cfs_rq)
goto err;
se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL,
cpu_to_node(i));
if (!se)
goto err;
memset(cfs_rq, 0, sizeof(struct cfs_rq));
memset(se, 0, sizeof(struct sched_entity));
tg->cfs_rq[i] = cfs_rq;
init_cfs_rq(cfs_rq, rq);
cfs_rq->tg = tg;
list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
tg->se[i] = se;
se->cfs_rq = &rq->cfs;
se->my_q = cfs_rq;
se->load.weight = NICE_0_LOAD;
se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
se->parent = NULL;
}
tg->shares = NICE_0_LOAD;
/* Bind the container to task_grp object we just created */
tg->css.container = cont;
return &tg->css;
err:
for_each_possible_cpu(i) {
if (tg->cfs_rq && tg->cfs_rq[i])
kfree(tg->cfs_rq[i]);
if (tg->se && tg->se[i])
kfree(tg->se[i]);
}
if (tg->cfs_rq)
kfree(tg->cfs_rq);
if (tg->se)
kfree(tg->se);
if (tg)
kfree(tg);
return ERR_PTR(-ENOMEM);
}
/* destroy runqueue etc associated with a task group */
static void sched_destroy_group(struct container_subsys *ss,
struct container *cont)
{
struct task_grp *tg = container_tg(cont);
struct cfs_rq *cfs_rq;
struct sched_entity *se;
int i;
for_each_possible_cpu(i) {
cfs_rq = tg->cfs_rq[i];
list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
}
/* wait for possible concurrent references to cfs_rqs complete */
synchronize_sched();
/* now it should be safe to free those cfs_rqs */
for_each_possible_cpu(i) {
cfs_rq = tg->cfs_rq[i];
kfree(cfs_rq);
se = tg->se[i];
kfree(se);
}
kfree(tg->cfs_rq);
kfree(tg->se);
kfree(tg);
}
/*
 * Decide whether a task may be attached to a container: only tasks of the
 * fair scheduling class are accepted, everything else gets -EINVAL.
 */
static int sched_can_attach(struct container_subsys *ss,
			     struct container *cont, struct task_struct *tsk)
{
	/* We don't support RT-tasks being in separate groups */
	return (tsk->sched_class == &fair_sched_class) ? 0 : -EINVAL;
}
/*
 * Switch a task over to the runqueue of its new group.
 *
 * Tasks outside the fair class are left alone. A queued task is taken off
 * its old cfs_rq, re-pointed at the new group's cfs_rq and queued again; if
 * it is also the running task, the class is told to put it and to pick it
 * up as current again around the switch.
 */
static void sched_move_task(struct container_subsys *ss, struct container *cont,
			struct container *old_cont, struct task_struct *tsk)
{
	unsigned long flags;
	struct rq *rq = task_rq_lock(tsk, &flags);

	if (tsk->sched_class == &fair_sched_class) {
		int was_running, was_queued;

		update_rq_clock(rq);

		was_running = task_running(rq, tsk);
		was_queued = tsk->se.on_rq;

		if (was_queued) {
			dequeue_task(rq, tsk, 0);
			if (unlikely(was_running))
				tsk->sched_class->put_prev_task(rq, tsk);
		}

		set_task_cfs_rq(tsk);

		if (was_queued) {
			enqueue_task(rq, tsk, 0);
			if (unlikely(was_running))
				tsk->sched_class->set_curr_task(rq);
		}
	}

	task_rq_unlock(rq, &flags);
}
/*
 * Give a group's scheduling entity a new load weight.
 *
 * The change is made under the lock of the runqueue the entity is queued
 * on. If the entity is currently queued it is dequeued before the weight
 * changes and enqueued again afterwards.
 */
static void set_se_shares(struct sched_entity *se, unsigned long shares)
{
	struct cfs_rq *owner = se->cfs_rq;
	struct rq *rq = owner->rq;
	int queued;

	spin_lock_irq(&rq->lock);

	queued = se->on_rq;
	if (queued)
		dequeue_entity(owner, se, 0);

	se->load.weight = shares;
	se->load.inv_weight = div64_64((1ULL<<32), shares);

	if (queued)
		enqueue_entity(owner, se, 0);

	spin_unlock_irq(&rq->lock);
}
/*
 * Write handler for the "shares" control file.
 *
 * Parses a decimal number from the user buffer, stores it as the group's
 * share value and applies it to the group's scheduling entity on every
 * possible cpu.
 *
 * Returns the number of bytes consumed on success, -E2BIG if the input
 * does not fit the local buffer, -EFAULT if the user buffer cannot be
 * read, or -EINVAL if the value parses to zero.
 */
static ssize_t cpu_shares_write(struct container *cont, struct cftype *cftype,
				struct file *file, const char __user *userbuf,
				size_t nbytes, loff_t *ppos)
{
	int i;
	unsigned long shareval;
	struct task_grp *tg = container_tg(cont);
	char buffer[2*sizeof(unsigned long) + 1];

	if (nbytes > 2*sizeof(unsigned long))	/* safety check */
		return -E2BIG;

	if (copy_from_user(buffer, userbuf, nbytes))
		return -EFAULT;

	buffer[nbytes] = 0;	/* nul-terminate */
	shareval = simple_strtoul(buffer, NULL, 10);

	/*
	 * A zero weight must never reach set_se_shares(): it derives the
	 * inverse weight by dividing by the share value. Non-numeric input
	 * also parses to zero and is rejected here.
	 */
	if (!shareval)
		return -EINVAL;

	tg->shares = shareval;
	for_each_possible_cpu(i)
		set_se_shares(tg->se[i], shareval);

	return nbytes;
}
static u64 cpu_shares_read_uint(struct container *cont, struct cftype *cft)
{
struct task_grp *tg = container_tg(cont);
return (u64) tg->shares;
}
/*
 * Control file "shares": reading reports a group's share value, writing
 * sets it and applies it to the group's per-cpu scheduling entities.
 */
struct cftype cpuctl_share = {
.name = "shares",
.read_uint = cpu_shares_read_uint,
.write = cpu_shares_write,
};
/* Add the control files of the cpu subsystem (just "shares") to a container */
static int sched_populate(struct container_subsys *ss, struct container *cont)
{
	int ret;

	ret = container_add_file(cont, ss, &cpuctl_share);

	return ret;
}
/*
 * The "cpu" container subsystem: wires the group-scheduler callbacks
 * (group creation/destruction, attach checks, task moves and control file
 * population) into the container framework.
 */
struct container_subsys cpu_subsys = {
.name = "cpu",
.create = sched_create_group,
.destroy = sched_destroy_group,
.can_attach = sched_can_attach,
.attach = sched_move_task,
.populate = sched_populate,
.subsys_id = cpu_subsys_id,
/* NOTE(review): presumably requests setup of the top container during early boot - confirm */
.early_init = 1,
};
#endif /* CONFIG_FAIR_GROUP_SCHED */

View File

@ -610,8 +610,7 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
*/
static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
/* A later patch will take group into account */
return &cpu_rq(this_cpu)->cfs;
return cfs_rq->tg->cfs_rq[this_cpu];
}
/* Iterate thr' all leaf cfs_rq's on a runqueue */

View File

@ -50,6 +50,10 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr)
{
}
/*
 * set_curr_task hook of the idle class. Intentionally empty; it exists so
 * that callers can invoke ->set_curr_task() on any scheduling class.
 */
static void set_curr_task_idle(struct rq *rq)
{
}
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
@ -66,6 +70,7 @@ static struct sched_class idle_sched_class __read_mostly = {
.load_balance = load_balance_idle,
.set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
/* no .task_new for idle tasks */
};

View File

@ -218,6 +218,10 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
}
}
/*
 * set_curr_task hook of the RT class. Intentionally empty; it exists so
 * that callers can invoke ->set_curr_task() on any scheduling class.
 */
static void set_curr_task_rt(struct rq *rq)
{
}
static struct sched_class rt_sched_class __read_mostly = {
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
@ -230,5 +234,6 @@ static struct sched_class rt_sched_class __read_mostly = {
.load_balance = load_balance_rt,
.set_curr_task = set_curr_task_rt,
.task_tick = task_tick_rt,
};