Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (30 commits)
  sched: Change wait_for_completion_*_timeout() to return a signed long
  sched, autogroup: Fix reference leak
  sched, autogroup: Fix potential access to freed memory
  sched: Remove redundant CONFIG_CGROUP_SCHED ifdef
  sched: Fix interactivity bug by charging unaccounted run-time on entity re-weight
  sched: Move periodic share updates to entity_tick()
  printk: Use this_cpu_{read|write} api on printk_pending
  sched: Make pushable_tasks CONFIG_SMP dependant
  sched: Add 'autogroup' scheduling feature: automated per session task groups
  sched: Fix unregister_fair_sched_group()
  sched: Remove unused argument dest_cpu to migrate_task()
  mutexes, sched: Introduce arch_mutex_cpu_relax()
  sched: Add some clock info to sched_debug
  cpu: Remove incorrect BUG_ON
  cpu: Remove unused variable
  sched: Fix UP build breakage
  sched: Make task dump print all 15 chars of proc comm
  sched: Update tg->shares after cpu.shares write
  sched: Allow update_cfs_load() to update global load
  sched: Implement demand based update_cfs_load()
  ...
This commit is contained in:
Linus Torvalds 2011-01-06 10:23:33 -08:00
commit 65b2074f84
29 changed files with 945 additions and 612 deletions

View File

@ -1614,6 +1614,8 @@ and is between 256 and 4096 characters. It is defined in the file
noapic [SMP,APIC] Tells the kernel to not make use of any
IOAPICs that may be present in the system.
noautogroup Disable scheduler automatic task group creation.
nobats [PPC] Do not use BATs for mapping kernel lowmem
on "Classic" PPC cores.

View File

@ -175,4 +175,7 @@ config HAVE_PERF_EVENTS_NMI
config HAVE_ARCH_JUMP_LABEL
bool
config HAVE_ARCH_MUTEX_CPU_RELAX
bool
source "kernel/gcov/Kconfig"

View File

@ -99,6 +99,7 @@ config S390
select HAVE_KERNEL_LZMA
select HAVE_KERNEL_LZO
select HAVE_GET_USER_PAGES_FAST
select HAVE_ARCH_MUTEX_CPU_RELAX
select ARCH_INLINE_SPIN_TRYLOCK
select ARCH_INLINE_SPIN_TRYLOCK_BH
select ARCH_INLINE_SPIN_LOCK

View File

@ -7,3 +7,5 @@
*/
#include <asm-generic/mutex-dec.h>
#define arch_mutex_cpu_relax() barrier()

View File

@ -1407,6 +1407,82 @@ static const struct file_operations proc_pid_sched_operations = {
#endif
#ifdef CONFIG_SCHED_AUTOGROUP
/*
* Print out autogroup related information:
*/
static int sched_autogroup_show(struct seq_file *m, void *v)
{
struct inode *inode = m->private;
struct task_struct *p;
p = get_proc_task(inode);
if (!p)
return -ESRCH;
proc_sched_autogroup_show_task(p, m);
put_task_struct(p);
return 0;
}
static ssize_t
sched_autogroup_write(struct file *file, const char __user *buf,
size_t count, loff_t *offset)
{
struct inode *inode = file->f_path.dentry->d_inode;
struct task_struct *p;
char buffer[PROC_NUMBUF];
long nice;
int err;
memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
err = strict_strtol(strstrip(buffer), 0, &nice);
if (err)
return -EINVAL;
p = get_proc_task(inode);
if (!p)
return -ESRCH;
err = nice;
err = proc_sched_autogroup_set_nice(p, &err);
if (err)
count = err;
put_task_struct(p);
return count;
}
static int sched_autogroup_open(struct inode *inode, struct file *filp)
{
int ret;
ret = single_open(filp, sched_autogroup_show, NULL);
if (!ret) {
struct seq_file *m = filp->private_data;
m->private = inode;
}
return ret;
}
static const struct file_operations proc_pid_sched_autogroup_operations = {
.open = sched_autogroup_open,
.read = seq_read,
.write = sched_autogroup_write,
.llseek = seq_lseek,
.release = single_release,
};
#endif /* CONFIG_SCHED_AUTOGROUP */
static ssize_t comm_write(struct file *file, const char __user *buf,
size_t count, loff_t *offset)
{
@ -2732,6 +2808,9 @@ static const struct pid_entry tgid_base_stuff[] = {
INF("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
#ifdef CONFIG_SCHED_AUTOGROUP
REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK

View File

@ -81,10 +81,10 @@ extern int wait_for_completion_interruptible(struct completion *x);
extern int wait_for_completion_killable(struct completion *x);
extern unsigned long wait_for_completion_timeout(struct completion *x,
unsigned long timeout);
extern unsigned long wait_for_completion_interruptible_timeout(
struct completion *x, unsigned long timeout);
extern unsigned long wait_for_completion_killable_timeout(
struct completion *x, unsigned long timeout);
extern long wait_for_completion_interruptible_timeout(
struct completion *x, unsigned long timeout);
extern long wait_for_completion_killable_timeout(
struct completion *x, unsigned long timeout);
extern bool try_wait_for_completion(struct completion *x);
extern bool completion_done(struct completion *x);

View File

@ -12,6 +12,13 @@
#include <linux/securebits.h>
#include <net/net_namespace.h>
#ifdef CONFIG_SMP
# define INIT_PUSHABLE_TASKS(tsk) \
.pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO),
#else
# define INIT_PUSHABLE_TASKS(tsk)
#endif
extern struct files_struct init_files;
extern struct fs_struct init_fs;
@ -144,7 +151,7 @@ extern struct cred init_cred;
.nr_cpus_allowed = NR_CPUS, \
}, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
.pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \
INIT_PUSHABLE_TASKS(tsk) \
.ptraced = LIST_HEAD_INIT(tsk.ptraced), \
.ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \
.real_parent = &tsk, \

View File

@ -160,4 +160,8 @@ extern int mutex_trylock(struct mutex *lock);
extern void mutex_unlock(struct mutex *lock);
extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX
#define arch_mutex_cpu_relax() cpu_relax()
#endif
#endif

View File

@ -513,6 +513,8 @@ struct thread_group_cputimer {
spinlock_t lock;
};
struct autogroup;
/*
* NOTE! "signal_struct" does not have it's own
* locking, because a shared signal_struct always
@ -580,6 +582,9 @@ struct signal_struct {
struct tty_struct *tty; /* NULL if no tty */
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup *autogroup;
#endif
/*
* Cumulative resource counters for dead threads in the group,
* and for reaped dead child processes forked by this group.
@ -1242,7 +1247,9 @@ struct task_struct {
#endif
struct list_head tasks;
#ifdef CONFIG_SMP
struct plist_node pushable_tasks;
#endif
struct mm_struct *mm, *active_mm;
#if defined(SPLIT_RSS_COUNTING)
@ -1883,14 +1890,11 @@ extern void sched_clock_idle_sleep_event(void);
extern void sched_clock_idle_wakeup_event(u64 delta_ns);
#ifdef CONFIG_HOTPLUG_CPU
extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
extern void idle_task_exit(void);
#else
static inline void idle_task_exit(void) {}
#endif
extern void sched_idle_next(void);
#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
extern void wake_up_idle_cpu(int cpu);
#else
@ -1900,8 +1904,6 @@ static inline void wake_up_idle_cpu(int cpu) { }
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_shares_ratelimit;
extern unsigned int sysctl_sched_shares_thresh;
extern unsigned int sysctl_sched_child_runs_first;
enum sched_tunable_scaling {
@ -1917,6 +1919,7 @@ extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_time_avg;
extern unsigned int sysctl_timer_migration;
extern unsigned int sysctl_sched_shares_window;
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
@ -1942,6 +1945,24 @@ int sched_rt_handler(struct ctl_table *table, int write,
extern unsigned int sysctl_sched_compat_yield;
#ifdef CONFIG_SCHED_AUTOGROUP
extern unsigned int sysctl_sched_autogroup_enabled;
extern void sched_autogroup_create_attach(struct task_struct *p);
extern void sched_autogroup_detach(struct task_struct *p);
extern void sched_autogroup_fork(struct signal_struct *sig);
extern void sched_autogroup_exit(struct signal_struct *sig);
#ifdef CONFIG_PROC_FS
extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice);
#endif
#else
static inline void sched_autogroup_create_attach(struct task_struct *p) { }
static inline void sched_autogroup_detach(struct task_struct *p) { }
static inline void sched_autogroup_fork(struct signal_struct *sig) { }
static inline void sched_autogroup_exit(struct signal_struct *sig) { }
#endif
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
extern void rt_mutex_setprio(struct task_struct *p, int prio);
@ -1960,9 +1981,10 @@ extern int task_nice(const struct task_struct *p);
extern int can_nice(const struct task_struct *p, const int nice);
extern int task_curr(const struct task_struct *p);
extern int idle_cpu(int cpu);
extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
extern int sched_setscheduler(struct task_struct *, int,
const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int,
struct sched_param *);
const struct sched_param *);
extern struct task_struct *idle_task(int cpu);
extern struct task_struct *curr_task(int cpu);
extern void set_curr_task(int cpu, struct task_struct *p);

View File

@ -794,6 +794,19 @@ config NET_NS
endif # NAMESPACES
config SCHED_AUTOGROUP
bool "Automatic process group scheduling"
select EVENTFD
select CGROUPS
select CGROUP_SCHED
select FAIR_GROUP_SCHED
help
This option optimizes the scheduler for common desktop workloads by
automatically creating and populating task groups. This separation
of workloads isolates aggressive CPU burners (like build jobs) from
desktop applications. Task group autogeneration is currently based
upon task session.
config MM_OWNER
bool

View File

@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
}
struct take_cpu_down_param {
struct task_struct *caller;
unsigned long mod;
void *hcpu;
};
@ -198,7 +197,6 @@ struct take_cpu_down_param {
static int __ref take_cpu_down(void *_param)
{
struct take_cpu_down_param *param = _param;
unsigned int cpu = (unsigned long)param->hcpu;
int err;
/* Ensure this CPU doesn't handle any more interrupts. */
@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param)
cpu_notify(CPU_DYING | param->mod, param->hcpu);
if (task_cpu(param->caller) == cpu)
move_task_off_dead_cpu(cpu, param->caller);
/* Force idle task to run as soon as we yield: it should
immediately notice cpu is offline and die quickly. */
sched_idle_next();
return 0;
}
@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
struct take_cpu_down_param tcd_param = {
.caller = current,
.mod = mod,
.hcpu = hcpu,
};
@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
}
BUG_ON(cpu_online(cpu));
/* Wait for it to sleep (leaving idle task). */
/*
* The migration_call() CPU_DYING callback will have removed all
* runnable tasks from the cpu, there's only the idle task left now
* that the migration thread is done doing the stop_machine thing.
*
* Wait for the stop thread to go away.
*/
while (!idle_cpu(cpu))
yield();
cpu_relax();
/* This actually kills the CPU. */
__cpu_die(cpu);

View File

@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig)
static inline void put_signal_struct(struct signal_struct *sig)
{
if (atomic_dec_and_test(&sig->sigcnt))
if (atomic_dec_and_test(&sig->sigcnt)) {
sched_autogroup_exit(sig);
free_signal_struct(sig);
}
}
void __put_task_struct(struct task_struct *tsk)
@ -905,6 +907,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
posix_cpu_timers_init_group(sig);
tty_audit_fork(sig);
sched_autogroup_fork(sig);
sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
@ -1315,7 +1318,7 @@ bad_fork_cleanup_mm:
}
bad_fork_cleanup_signal:
if (!(clone_flags & CLONE_THREAD))
free_signal_struct(p->signal);
put_signal_struct(p->signal);
bad_fork_cleanup_sighand:
__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:

View File

@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
*/
static int irq_thread(void *data)
{
struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
static struct sched_param param = {
.sched_priority = MAX_USER_RT_PRIO/2,
};
struct irqaction *action = data;
struct irq_desc *desc = irq_to_desc(action->irq);
int wake, oneshot = desc->status & IRQ_ONESHOT;

View File

@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
wait_for_completion(&create.done);
if (!IS_ERR(create.result)) {
struct sched_param param = { .sched_priority = 0 };
static struct sched_param param = { .sched_priority = 0 };
va_list args;
va_start(args, namefmt);

View File

@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* memory barriers as we'll eventually observe the right
* values at the cost of a few extra spins.
*/
cpu_relax();
arch_mutex_cpu_relax();
}
#endif
spin_lock_mutex(&lock->wait_lock, flags);

View File

@ -1074,17 +1074,17 @@ static DEFINE_PER_CPU(int, printk_pending);
void printk_tick(void)
{
if (__get_cpu_var(printk_pending)) {
__get_cpu_var(printk_pending) = 0;
if (__this_cpu_read(printk_pending)) {
__this_cpu_write(printk_pending, 0);
wake_up_interruptible(&log_wait);
}
}
int printk_needs_cpu(int cpu)
{
if (unlikely(cpu_is_offline(cpu)))
if (cpu_is_offline(cpu))
printk_tick();
return per_cpu(printk_pending, cpu);
return __this_cpu_read(printk_pending);
}
void wake_up_klogd(void)

View File

@ -75,9 +75,11 @@
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
#include "sched_cpupri.h"
#include "workqueue_sched.h"
#include "sched_autogroup.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@ -253,6 +255,8 @@ struct task_group {
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
unsigned long shares;
atomic_t load_weight;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@ -268,24 +272,19 @@ struct task_group {
struct task_group *parent;
struct list_head siblings;
struct list_head children;
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup *autogroup;
#endif
};
#define root_task_group init_task_group
/* task_group_lock serializes add/remove of task groups and also changes to
* a task group's cpu shares.
*/
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
static int root_task_group_empty(void)
{
return list_empty(&root_task_group.children);
}
#endif
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
/*
@ -342,6 +341,7 @@ struct cfs_rq {
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
* list is used during load balance.
*/
int on_list;
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
@ -360,14 +360,17 @@ struct cfs_rq {
unsigned long h_load;
/*
* this cpu's part of tg->shares
* Maintaining per-cpu shares distribution for group scheduling
*
* load_stamp is the last time we updated the load average
* load_last is the last time we updated the load average and saw load
* load_unacc_exec_time is currently unaccounted execution time
*/
unsigned long shares;
u64 load_avg;
u64 load_period;
u64 load_stamp, load_last, load_unacc_exec_time;
/*
* load.weight at the time we set shares
*/
unsigned long rq_weight;
unsigned long load_contribution;
#endif
#endif
};
@ -605,11 +608,14 @@ static inline int cpu_of(struct rq *rq)
*/
static inline struct task_group *task_group(struct task_struct *p)
{
struct task_group *tg;
struct cgroup_subsys_state *css;
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
lockdep_is_held(&task_rq(p)->lock));
return container_of(css, struct task_group, css);
tg = container_of(css, struct task_group, css);
return autogroup_task_group(p, tg);
}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@ -792,20 +798,6 @@ late_initcall(sched_init_debug);
*/
const_debug unsigned int sysctl_sched_nr_migrate = 32;
/*
* ratelimit for updating the group shares.
* default: 0.25ms
*/
unsigned int sysctl_sched_shares_ratelimit = 250000;
unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
/*
* Inject some fuzzyness into changing the per-cpu group shares
* this avoids remote rq-locks at the expense of fairness.
* default: 4
*/
unsigned int sysctl_sched_shares_thresh = 4;
/*
* period over which we average the RT time consumption, measured
* in ms.
@ -1355,6 +1347,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
lw->inv_weight = 0;
}
static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
lw->weight = w;
lw->inv_weight = 0;
}
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs the contribution that
@ -1543,101 +1541,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
#ifdef CONFIG_FAIR_GROUP_SCHED
static __read_mostly unsigned long __percpu *update_shares_data;
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
/*
* Calculate and set the cpu's group shares.
*/
static void update_group_shares_cpu(struct task_group *tg, int cpu,
unsigned long sd_shares,
unsigned long sd_rq_weight,
unsigned long *usd_rq_weight)
{
unsigned long shares, rq_weight;
int boost = 0;
rq_weight = usd_rq_weight[cpu];
if (!rq_weight) {
boost = 1;
rq_weight = NICE_0_LOAD;
}
/*
* \Sum_j shares_j * rq_weight_i
* shares_i = -----------------------------
* \Sum_j rq_weight_j
*/
shares = (sd_shares * rq_weight) / sd_rq_weight;
shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
if (abs(shares - tg->se[cpu]->load.weight) >
sysctl_sched_shares_thresh) {
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
raw_spin_lock_irqsave(&rq->lock, flags);
tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
__set_se_shares(tg->se[cpu], shares);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
}
/*
* Re-compute the task group their per cpu shares over the given domain.
* This needs to be done in a bottom-up fashion because the rq weight of a
* parent group depends on the shares of its child groups.
*/
static int tg_shares_up(struct task_group *tg, void *data)
{
unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
unsigned long *usd_rq_weight;
struct sched_domain *sd = data;
unsigned long flags;
int i;
if (!tg->se[0])
return 0;
local_irq_save(flags);
usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
for_each_cpu(i, sched_domain_span(sd)) {
weight = tg->cfs_rq[i]->load.weight;
usd_rq_weight[i] = weight;
rq_weight += weight;
/*
* If there are currently no tasks on the cpu pretend there
* is one of average load so that when a new task gets to
* run here it will not get delayed by group starvation.
*/
if (!weight)
weight = NICE_0_LOAD;
sum_weight += weight;
shares += tg->cfs_rq[i]->shares;
}
if (!rq_weight)
rq_weight = sum_weight;
if ((!shares && rq_weight) || shares > tg->shares)
shares = tg->shares;
if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
shares = tg->shares;
for_each_cpu(i, sched_domain_span(sd))
update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
local_irq_restore(flags);
return 0;
}
/*
* Compute the cpu's hierarchical load factor for each task group.
* This needs to be done in a top-down fashion because the load of a child
@ -1652,7 +1555,7 @@ static int tg_load_down(struct task_group *tg, void *data)
load = cpu_rq(cpu)->load.weight;
} else {
load = tg->parent->cfs_rq[cpu]->h_load;
load *= tg->cfs_rq[cpu]->shares;
load *= tg->se[cpu]->load.weight;
load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
}
@ -1661,34 +1564,11 @@ static int tg_load_down(struct task_group *tg, void *data)
return 0;
}
static void update_shares(struct sched_domain *sd)
{
s64 elapsed;
u64 now;
if (root_task_group_empty())
return;
now = local_clock();
elapsed = now - sd->last_update;
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
sd->last_update = now;
walk_tg_tree(tg_nop, tg_shares_up, sd);
}
}
static void update_h_load(long cpu)
{
walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
#else
static inline void update_shares(struct sched_domain *sd)
{
}
#endif
#ifdef CONFIG_PREEMPT
@ -1810,15 +1690,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
{
#ifdef CONFIG_SMP
cfs_rq->shares = shares;
#endif
}
#endif
static void calc_load_account_idle(struct rq *this_rq);
static void update_sysctl(void);
static int get_update_sysctl_factor(void);
@ -2063,6 +1934,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"
#include "sched_autogroup.c"
#include "sched_stoptask.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
@ -2255,10 +2127,8 @@ static int migration_cpu_stop(void *data);
* The task's runqueue lock must be held.
* Returns true if you have to wait for migration thread.
*/
static bool migrate_task(struct task_struct *p, int dest_cpu)
static bool migrate_task(struct task_struct *p, struct rq *rq)
{
struct rq *rq = task_rq(p);
/*
* If the task is not on a runqueue (and not running), then
* the next wake-up will properly place the task.
@ -2438,18 +2308,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
return dest_cpu;
/* No more Mr. Nice Guy. */
if (unlikely(dest_cpu >= nr_cpu_ids)) {
dest_cpu = cpuset_cpus_allowed_fallback(p);
/*
* Don't tell them about moving exiting tasks or
* kernel threads (both mm NULL), since they never
* leave kernel.
*/
if (p->mm && printk_ratelimit()) {
printk(KERN_INFO "process %d (%s) no "
"longer affine to cpu%d\n",
task_pid_nr(p), p->comm, cpu);
}
dest_cpu = cpuset_cpus_allowed_fallback(p);
/*
* Don't tell them about moving exiting tasks or
* kernel threads (both mm NULL), since they never
* leave kernel.
*/
if (p->mm && printk_ratelimit()) {
printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
task_pid_nr(p), p->comm, cpu);
}
return dest_cpu;
@ -2785,7 +2652,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
#endif
put_cpu();
}
@ -3549,7 +3418,7 @@ void sched_exec(void)
* select_task_rq() can race against ->cpus_allowed
*/
if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
struct migration_arg arg = { p, dest_cpu };
task_rq_unlock(rq, &flags);
@ -4214,7 +4083,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
if (task_thread_info(rq->curr) != owner || need_resched())
return 0;
cpu_relax();
arch_mutex_cpu_relax();
}
return 1;
@ -4526,7 +4395,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
*/
unsigned long __sched
long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
unsigned long timeout)
{
@ -4559,7 +4428,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
* signaled or for a specified timeout to expire. It can be
* interrupted by a kill signal. The timeout is in jiffies.
*/
unsigned long __sched
long __sched
wait_for_completion_killable_timeout(struct completion *x,
unsigned long timeout)
{
@ -4901,7 +4770,7 @@ static bool check_same_owner(struct task_struct *p)
}
static int __sched_setscheduler(struct task_struct *p, int policy,
struct sched_param *param, bool user)
const struct sched_param *param, bool user)
{
int retval, oldprio, oldpolicy = -1, on_rq, running;
unsigned long flags;
@ -5056,7 +4925,7 @@ recheck:
* NOTE that the task may be already dead.
*/
int sched_setscheduler(struct task_struct *p, int policy,
struct sched_param *param)
const struct sched_param *param)
{
return __sched_setscheduler(p, policy, param, true);
}
@ -5074,7 +4943,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
* but our caller might not have that capability.
*/
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
struct sched_param *param)
const struct sched_param *param)
{
return __sched_setscheduler(p, policy, param, false);
}
@ -5590,7 +5459,7 @@ void sched_show_task(struct task_struct *p)
unsigned state;
state = p->state ? __ffs(p->state) + 1 : 0;
printk(KERN_INFO "%-13.13s %c", p->comm,
printk(KERN_INFO "%-15.15s %c", p->comm,
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
if (state == TASK_RUNNING)
@ -5754,7 +5623,6 @@ static void update_sysctl(void)
SET_SYSCTL(sched_min_granularity);
SET_SYSCTL(sched_latency);
SET_SYSCTL(sched_wakeup_granularity);
SET_SYSCTL(sched_shares_ratelimit);
#undef SET_SYSCTL
}
@ -5830,7 +5698,7 @@ again:
goto out;
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
if (migrate_task(p, dest_cpu)) {
if (migrate_task(p, rq)) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, &flags);
@ -5912,96 +5780,6 @@ static int migration_cpu_stop(void *data)
}
#ifdef CONFIG_HOTPLUG_CPU
/*
* Figure out where task on dead CPU should go, use force if necessary.
*/
void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
struct rq *rq = cpu_rq(dead_cpu);
int needs_cpu, uninitialized_var(dest_cpu);
unsigned long flags;
local_irq_save(flags);
raw_spin_lock(&rq->lock);
needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
if (needs_cpu)
dest_cpu = select_fallback_rq(dead_cpu, p);
raw_spin_unlock(&rq->lock);
/*
* It can only fail if we race with set_cpus_allowed(),
* in the racer should migrate the task anyway.
*/
if (needs_cpu)
__migrate_task(p, dead_cpu, dest_cpu);
local_irq_restore(flags);
}
/*
* While a dead CPU has no uninterruptible tasks queued at this point,
* it might still have a nonzero ->nr_uninterruptible counter, because
* for performance reasons the counter is not stricly tracking tasks to
* their home CPUs. So we just add the counter to another CPU's counter,
* to keep the global sum constant after CPU-down:
*/
static void migrate_nr_uninterruptible(struct rq *rq_src)
{
struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
unsigned long flags;
local_irq_save(flags);
double_rq_lock(rq_src, rq_dest);
rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
rq_src->nr_uninterruptible = 0;
double_rq_unlock(rq_src, rq_dest);
local_irq_restore(flags);
}
/* Run through task list and migrate tasks from the dead cpu. */
static void migrate_live_tasks(int src_cpu)
{
struct task_struct *p, *t;
read_lock(&tasklist_lock);
do_each_thread(t, p) {
if (p == current)
continue;
if (task_cpu(p) == src_cpu)
move_task_off_dead_cpu(src_cpu, p);
} while_each_thread(t, p);
read_unlock(&tasklist_lock);
}
/*
* Schedules idle task to be the next runnable task on current CPU.
* It does so by boosting its priority to highest possible.
* Used by CPU offline code.
*/
void sched_idle_next(void)
{
int this_cpu = smp_processor_id();
struct rq *rq = cpu_rq(this_cpu);
struct task_struct *p = rq->idle;
unsigned long flags;
/* cpu has to be offline */
BUG_ON(cpu_online(this_cpu));
/*
* Strictly not necessary since rest of the CPUs are stopped by now
* and interrupts disabled on the current cpu.
*/
raw_spin_lock_irqsave(&rq->lock, flags);
__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
activate_task(rq, p, 0);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
/*
* Ensures that the idle task is using init_mm right before its cpu goes
@ -6018,47 +5796,19 @@ void idle_task_exit(void)
mmdrop(mm);
}
/* called under rq->lock with disabled interrupts */
static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
/*
* While a dead CPU has no uninterruptible tasks queued at this point,
* it might still have a nonzero ->nr_uninterruptible counter, because
* for performance reasons the counter is not stricly tracking tasks to
* their home CPUs. So we just add the counter to another CPU's counter,
* to keep the global sum constant after CPU-down:
*/
static void migrate_nr_uninterruptible(struct rq *rq_src)
{
struct rq *rq = cpu_rq(dead_cpu);
struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
/* Must be exiting, otherwise would be on tasklist. */
BUG_ON(!p->exit_state);
/* Cannot have done final schedule yet: would have vanished. */
BUG_ON(p->state == TASK_DEAD);
get_task_struct(p);
/*
* Drop lock around migration; if someone else moves it,
* that's OK. No task can be added to this CPU, so iteration is
* fine.
*/
raw_spin_unlock_irq(&rq->lock);
move_task_off_dead_cpu(dead_cpu, p);
raw_spin_lock_irq(&rq->lock);
put_task_struct(p);
}
/* release_task() removes task from tasklist, so we won't find dead tasks. */
static void migrate_dead_tasks(unsigned int dead_cpu)
{
struct rq *rq = cpu_rq(dead_cpu);
struct task_struct *next;
for ( ; ; ) {
if (!rq->nr_running)
break;
next = pick_next_task(rq);
if (!next)
break;
next->sched_class->put_prev_task(rq, next);
migrate_dead(dead_cpu, next);
}
rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
rq_src->nr_uninterruptible = 0;
}
/*
@ -6069,6 +5819,56 @@ static void calc_global_load_remove(struct rq *rq)
atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
rq->calc_load_active = 0;
}
/*
* Migrate all tasks from the rq, sleeping tasks will be migrated by
* try_to_wake_up()->select_task_rq().
*
* Called with rq->lock held even though we'er in stop_machine() and
* there's no concurrency possible, we hold the required locks anyway
* because of lock validation efforts.
*/
static void migrate_tasks(unsigned int dead_cpu)
{
struct rq *rq = cpu_rq(dead_cpu);
struct task_struct *next, *stop = rq->stop;
int dest_cpu;
/*
* Fudge the rq selection such that the below task selection loop
* doesn't get stuck on the currently eligible stop task.
*
* We're currently inside stop_machine() and the rq is either stuck
* in the stop_machine_cpu_stop() loop, or we're executing this code,
* either way we should never end up calling schedule() until we're
* done here.
*/
rq->stop = NULL;
for ( ; ; ) {
/*
* There's this thread running, bail when that's the only
* remaining thread.
*/
if (rq->nr_running == 1)
break;
next = pick_next_task(rq);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
/* Find suitable destination for @next, with force if needed. */
dest_cpu = select_fallback_rq(dead_cpu, next);
raw_spin_unlock(&rq->lock);
__migrate_task(next, dead_cpu, dest_cpu);
raw_spin_lock(&rq->lock);
}
rq->stop = stop;
}
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@ -6278,15 +6078,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
unsigned long flags;
struct rq *rq = cpu_rq(cpu);
switch (action) {
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
rq->calc_load_update = calc_load_update;
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
@ -6298,30 +6096,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
case CPU_DEAD_FROZEN:
migrate_live_tasks(cpu);
/* Idle task back to normal (off runqueue, low prio) */
raw_spin_lock_irq(&rq->lock);
deactivate_task(rq, rq->idle, 0);
__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
rq->idle->sched_class = &idle_sched_class;
migrate_dead_tasks(cpu);
raw_spin_unlock_irq(&rq->lock);
migrate_nr_uninterruptible(rq);
BUG_ON(rq->nr_running != 0);
calc_global_load_remove(rq);
break;
case CPU_DYING:
case CPU_DYING_FROZEN:
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
migrate_tasks(cpu);
BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
migrate_nr_uninterruptible(rq);
calc_global_load_remove(rq);
break;
#endif
}
@ -8052,15 +7839,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
#ifdef CONFIG_FAIR_GROUP_SCHED
static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu, int add,
struct sched_entity *se, int cpu,
struct sched_entity *parent)
{
struct rq *rq = cpu_rq(cpu);
tg->cfs_rq[cpu] = cfs_rq;
init_cfs_rq(cfs_rq, rq);
cfs_rq->tg = tg;
if (add)
list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
tg->se[cpu] = se;
/* se could be NULL for init_task_group */
@ -8073,15 +7858,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
se->cfs_rq = parent->my_q;
se->my_q = cfs_rq;
se->load.weight = tg->shares;
se->load.inv_weight = 0;
update_load_set(&se->load, 0);
se->parent = parent;
}
#endif
#ifdef CONFIG_RT_GROUP_SCHED
static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
struct sched_rt_entity *rt_se, int cpu, int add,
struct sched_rt_entity *rt_se, int cpu,
struct sched_rt_entity *parent)
{
struct rq *rq = cpu_rq(cpu);
@ -8090,8 +7874,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
init_rt_rq(rt_rq, rq);
rt_rq->tg = tg;
rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
if (add)
list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
tg->rt_se[cpu] = rt_se;
if (!rt_se)
@ -8164,13 +7946,9 @@ void __init sched_init(void)
#ifdef CONFIG_CGROUP_SCHED
list_add(&init_task_group.list, &task_groups);
INIT_LIST_HEAD(&init_task_group.children);
autogroup_init(&init_task);
#endif /* CONFIG_CGROUP_SCHED */
#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
__alignof__(unsigned long));
#endif
for_each_possible_cpu(i) {
struct rq *rq;
@ -8184,7 +7962,6 @@ void __init sched_init(void)
#ifdef CONFIG_FAIR_GROUP_SCHED
init_task_group.shares = init_task_group_load;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
#ifdef CONFIG_CGROUP_SCHED
/*
* How much cpu bandwidth does init_task_group get?
*
@ -8204,16 +7981,13 @@ void __init sched_init(void)
* We achieve this by letting init_task_group's tasks sit
* directly in rq->cfs (i.e init_task_group->se[] = NULL).
*/
init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
#endif
init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
#ifdef CONFIG_CGROUP_SCHED
init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
#endif
init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
#endif
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@ -8486,7 +8260,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
if (!se)
goto err_free_rq;
init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
}
return 1;
@ -8497,15 +8271,21 @@ err:
return 0;
}
static inline void register_fair_sched_group(struct task_group *tg, int cpu)
{
list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
&cpu_rq(cpu)->leaf_cfs_rq_list);
}
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
/*
* Only empty task groups can be destroyed; so we can speculatively
* check on_list without danger of it being re-added.
*/
if (!tg->cfs_rq[cpu]->on_list)
return;
raw_spin_lock_irqsave(&rq->lock, flags);
list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
#else /* !CONFG_FAIR_GROUP_SCHED */
static inline void free_fair_sched_group(struct task_group *tg)
@ -8518,10 +8298,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
return 1;
}
static inline void register_fair_sched_group(struct task_group *tg, int cpu)
{
}
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
}
@ -8576,7 +8352,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
if (!rt_se)
goto err_free_rq;
init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
}
return 1;
@ -8586,17 +8362,6 @@ err_free_rq:
err:
return 0;
}
static inline void register_rt_sched_group(struct task_group *tg, int cpu)
{
list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
&cpu_rq(cpu)->leaf_rt_rq_list);
}
static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
{
list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
}
#else /* !CONFIG_RT_GROUP_SCHED */
static inline void free_rt_sched_group(struct task_group *tg)
{
@ -8607,14 +8372,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
return 1;
}
static inline void register_rt_sched_group(struct task_group *tg, int cpu)
{
}
static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
{
}
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
@ -8630,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
unsigned long flags;
int i;
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
@ -8643,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent)
goto err;
spin_lock_irqsave(&task_group_lock, flags);
for_each_possible_cpu(i) {
register_fair_sched_group(tg, i);
register_rt_sched_group(tg, i);
}
list_add_rcu(&tg->list, &task_groups);
WARN_ON(!parent); /* root should already exist */
@ -8676,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg)
unsigned long flags;
int i;
spin_lock_irqsave(&task_group_lock, flags);
for_each_possible_cpu(i) {
/* end participation in shares distribution */
for_each_possible_cpu(i)
unregister_fair_sched_group(tg, i);
unregister_rt_sched_group(tg, i);
}
spin_lock_irqsave(&task_group_lock, flags);
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
@ -8727,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk)
#endif /* CONFIG_CGROUP_SCHED */
#ifdef CONFIG_FAIR_GROUP_SCHED
static void __set_se_shares(struct sched_entity *se, unsigned long shares)
{
struct cfs_rq *cfs_rq = se->cfs_rq;
int on_rq;
on_rq = se->on_rq;
if (on_rq)
dequeue_entity(cfs_rq, se, 0);
se->load.weight = shares;
se->load.inv_weight = 0;
if (on_rq)
enqueue_entity(cfs_rq, se, 0);
}
static void set_se_shares(struct sched_entity *se, unsigned long shares)
{
struct cfs_rq *cfs_rq = se->cfs_rq;
struct rq *rq = cfs_rq->rq;
unsigned long flags;
raw_spin_lock_irqsave(&rq->lock, flags);
__set_se_shares(se, shares);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
static DEFINE_MUTEX(shares_mutex);
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@ -8776,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
if (tg->shares == shares)
goto done;
spin_lock_irqsave(&task_group_lock, flags);
for_each_possible_cpu(i)
unregister_fair_sched_group(tg, i);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
/* wait for any ongoing reference to this group to finish */
synchronize_sched();
/*
* Now we are free to modify the group's share on each cpu
* w/o tripping rebalance_share or load_balance_fair.
*/
tg->shares = shares;
for_each_possible_cpu(i) {
/*
* force a rebalance
*/
cfs_rq_set_shares(tg->cfs_rq[i], 0);
set_se_shares(tg->se[i], shares);
struct rq *rq = cpu_rq(i);
struct sched_entity *se;
se = tg->se[i];
/* Propagate contribution to hierarchy */
raw_spin_lock_irqsave(&rq->lock, flags);
for_each_sched_entity(se)
update_cfs_shares(group_cfs_rq(se), 0);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
/*
* Enable load balance activity on this group, by inserting it back on
* each cpu's rq->leaf_cfs_rq_list.
*/
spin_lock_irqsave(&task_group_lock, flags);
for_each_possible_cpu(i)
register_fair_sched_group(tg, i);
list_add_rcu(&tg->siblings, &tg->parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
done:
mutex_unlock(&shares_mutex);
return 0;

238
kernel/sched_autogroup.c Normal file
View File

@ -0,0 +1,238 @@
#ifdef CONFIG_SCHED_AUTOGROUP
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
#include <linux/utsname.h>
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
static struct autogroup autogroup_default;
static atomic_t autogroup_seq_nr;
static void autogroup_init(struct task_struct *init_task)
{
autogroup_default.tg = &init_task_group;
init_task_group.autogroup = &autogroup_default;
kref_init(&autogroup_default.kref);
init_rwsem(&autogroup_default.lock);
init_task->signal->autogroup = &autogroup_default;
}
static inline void autogroup_free(struct task_group *tg)
{
kfree(tg->autogroup);
}
static inline void autogroup_destroy(struct kref *kref)
{
struct autogroup *ag = container_of(kref, struct autogroup, kref);
sched_destroy_group(ag->tg);
}
static inline void autogroup_kref_put(struct autogroup *ag)
{
kref_put(&ag->kref, autogroup_destroy);
}
static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
{
kref_get(&ag->kref);
return ag;
}
static inline struct autogroup *autogroup_task_get(struct task_struct *p)
{
struct autogroup *ag;
unsigned long flags;
if (!lock_task_sighand(p, &flags))
return autogroup_kref_get(&autogroup_default);
ag = autogroup_kref_get(p->signal->autogroup);
unlock_task_sighand(p, &flags);
return ag;
}
static inline struct autogroup *autogroup_create(void)
{
struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
struct task_group *tg;
if (!ag)
goto out_fail;
tg = sched_create_group(&init_task_group);
if (IS_ERR(tg))
goto out_free;
kref_init(&ag->kref);
init_rwsem(&ag->lock);
ag->id = atomic_inc_return(&autogroup_seq_nr);
ag->tg = tg;
tg->autogroup = ag;
return ag;
out_free:
kfree(ag);
out_fail:
if (printk_ratelimit()) {
printk(KERN_WARNING "autogroup_create: %s failure.\n",
ag ? "sched_create_group()" : "kmalloc()");
}
return autogroup_kref_get(&autogroup_default);
}
static inline bool
task_wants_autogroup(struct task_struct *p, struct task_group *tg)
{
if (tg != &root_task_group)
return false;
if (p->sched_class != &fair_sched_class)
return false;
/*
* We can only assume the task group can't go away on us if
* autogroup_move_group() can see us on ->thread_group list.
*/
if (p->flags & PF_EXITING)
return false;
return true;
}
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
if (enabled && task_wants_autogroup(p, tg))
return p->signal->autogroup->tg;
return tg;
}
static void
autogroup_move_group(struct task_struct *p, struct autogroup *ag)
{
struct autogroup *prev;
struct task_struct *t;
unsigned long flags;
BUG_ON(!lock_task_sighand(p, &flags));
prev = p->signal->autogroup;
if (prev == ag) {
unlock_task_sighand(p, &flags);
return;
}
p->signal->autogroup = autogroup_kref_get(ag);
t = p;
do {
sched_move_task(t);
} while_each_thread(p, t);
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
}
/* Allocates GFP_KERNEL, cannot be called under any spinlock */
void sched_autogroup_create_attach(struct task_struct *p)
{
struct autogroup *ag = autogroup_create();
autogroup_move_group(p, ag);
/* drop extra refrence added by autogroup_create() */
autogroup_kref_put(ag);
}
EXPORT_SYMBOL(sched_autogroup_create_attach);
/* Cannot be called under siglock. Currently has no users */
void sched_autogroup_detach(struct task_struct *p)
{
autogroup_move_group(p, &autogroup_default);
}
EXPORT_SYMBOL(sched_autogroup_detach);
void sched_autogroup_fork(struct signal_struct *sig)
{
sig->autogroup = autogroup_task_get(current);
}
void sched_autogroup_exit(struct signal_struct *sig)
{
autogroup_kref_put(sig->autogroup);
}
static int __init setup_autogroup(char *str)
{
sysctl_sched_autogroup_enabled = 0;
return 1;
}
__setup("noautogroup", setup_autogroup);
#ifdef CONFIG_PROC_FS
int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
{
static unsigned long next = INITIAL_JIFFIES;
struct autogroup *ag;
int err;
if (*nice < -20 || *nice > 19)
return -EINVAL;
err = security_task_setnice(current, *nice);
if (err)
return err;
if (*nice < 0 && !can_nice(current, *nice))
return -EPERM;
/* this is a heavy operation taking global locks.. */
if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
return -EAGAIN;
next = HZ / 10 + jiffies;
ag = autogroup_task_get(p);
down_write(&ag->lock);
err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
if (!err)
ag->nice = *nice;
up_write(&ag->lock);
autogroup_kref_put(ag);
return err;
}
void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
{
struct autogroup *ag = autogroup_task_get(p);
down_read(&ag->lock);
seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
up_read(&ag->lock);
autogroup_kref_put(ag);
}
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SCHED_DEBUG
static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
#endif /* CONFIG_SCHED_DEBUG */
#endif /* CONFIG_SCHED_AUTOGROUP */

32
kernel/sched_autogroup.h Normal file
View File

@ -0,0 +1,32 @@
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup {
struct kref kref;
struct task_group *tg;
struct rw_semaphore lock;
unsigned long id;
int nice;
};
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg);
#else /* !CONFIG_SCHED_AUTOGROUP */
static inline void autogroup_init(struct task_struct *init_task) { }
static inline void autogroup_free(struct task_group *tg) { }
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
return tg;
}
#ifdef CONFIG_SCHED_DEBUG
static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
return 0;
}
#endif
#endif /* CONFIG_SCHED_AUTOGROUP */

View File

@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
}
EXPORT_SYMBOL_GPL(sched_clock);
static __read_mostly int sched_clock_running;
__read_mostly int sched_clock_running;
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
__read_mostly int sched_clock_stable;

View File

@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec)
#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
#ifdef CONFIG_FAIR_GROUP_SCHED
static void print_cfs_group_stats(struct seq_file *m, int cpu,
struct task_group *tg)
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
struct sched_entity *se = tg->se[cpu];
if (!se)
@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
#endif
#ifdef CONFIG_CGROUP_SCHED
{
char path[64];
rcu_read_lock();
cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
rcu_read_unlock();
SEQ_printf(m, " %s", path);
}
#endif
SEQ_printf(m, "\n");
}
@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
read_unlock_irqrestore(&tasklist_lock, flags);
}
#if defined(CONFIG_CGROUP_SCHED) && \
(defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
static void task_group_path(struct task_group *tg, char *buf, int buflen)
{
/* may be NULL if the underlying cgroup isn't fully-created yet */
if (!tg->css.cgroup) {
buf[0] = '\0';
return;
}
cgroup_path(tg->css.cgroup, buf, buflen);
}
#endif
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
struct sched_entity *last;
unsigned long flags;
#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
char path[128];
struct task_group *tg = cfs_rq->tg;
task_group_path(tg, path, sizeof(path));
SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
#else
SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
#endif
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
SPLIT_NS(cfs_rq->exec_clock));
@ -202,32 +169,29 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
spread0 = min_vruntime - rq0_min_vruntime;
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
SPLIT_NS(spread0));
SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
SPLIT_NS(cfs_rq->load_avg));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
SPLIT_NS(cfs_rq->load_period));
SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
cfs_rq->load_contribution);
SEQ_printf(m, " .%-30s: %d\n", "load_tg",
atomic_read(&cfs_rq->tg->load_weight));
#endif
print_cfs_group_stats(m, cpu, cfs_rq->tg);
#endif
}
void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
{
#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
char path[128];
struct task_group *tg = rt_rq->tg;
task_group_path(tg, path, sizeof(path));
SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
#else
SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
#endif
#define P(x) \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@ -243,6 +207,8 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
#undef P
}
extern __read_mostly int sched_clock_running;
static void print_cpu(struct seq_file *m, int cpu)
{
struct rq *rq = cpu_rq(cpu);
@ -314,21 +280,42 @@ static const char *sched_tunable_scaling_names[] = {
static int sched_debug_show(struct seq_file *m, void *v)
{
u64 now = ktime_to_ns(ktime_get());
u64 ktime, sched_clk, cpu_clk;
unsigned long flags;
int cpu;
SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n",
local_irq_save(flags);
ktime = ktime_to_ns(ktime_get());
sched_clk = sched_clock();
cpu_clk = local_clock();
local_irq_restore(flags);
SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now));
#define P(x) \
SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
PN(ktime);
PN(sched_clk);
PN(cpu_clk);
P(jiffies);
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
P(sched_clock_stable);
#endif
#undef PN
#undef P
SEQ_printf(m, "\n");
SEQ_printf(m, "sysctl_sched\n");
#define P(x) \
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
P(jiffies);
PN(sysctl_sched_latency);
PN(sysctl_sched_min_granularity);
PN(sysctl_sched_wakeup_granularity);

View File

@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
/*
* The exponential sliding window over which load is averaged for shares
* distribution.
* (default: 10msec)
*/
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
static const struct sched_class fair_sched_class;
/**************************************************************
@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
return cfs_rq->tg->cfs_rq[this_cpu];
}
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
/*
* Ensure we either appear before our parent (if already
* enqueued) or force our parent to appear after us when it is
* enqueued. The fact that we always enqueue bottom-up
* reduces this to two cases.
*/
if (cfs_rq->tg->parent &&
cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
&rq_of(cfs_rq)->leaf_cfs_rq_list);
} else {
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
&rq_of(cfs_rq)->leaf_cfs_rq_list);
}
cfs_rq->on_list = 1;
}
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (cfs_rq->on_list) {
list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
cfs_rq->on_list = 0;
}
}
/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
return &cpu_rq(this_cpu)->cfs;
}
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
WRT_SYSCTL(sched_min_granularity);
WRT_SYSCTL(sched_latency);
WRT_SYSCTL(sched_wakeup_granularity);
WRT_SYSCTL(sched_shares_ratelimit);
#undef WRT_SYSCTL
return 0;
@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
cfs_rq->load_unacc_exec_time += delta_exec;
#endif
}
static void update_curr(struct cfs_rq *cfs_rq)
@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
list_add(&se->group_node, &cfs_rq->tasks);
}
cfs_rq->nr_running++;
se->on_rq = 1;
}
static void
@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
list_del_init(&se->group_node);
}
cfs_rq->nr_running--;
se->on_rq = 0;
}
#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
int global_update)
{
struct task_group *tg = cfs_rq->tg;
long load_avg;
load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
load_avg -= cfs_rq->load_contribution;
if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
atomic_add(load_avg, &tg->load_weight);
cfs_rq->load_contribution += load_avg;
}
}
static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
{
u64 period = sysctl_sched_shares_window;
u64 now, delta;
unsigned long load = cfs_rq->load.weight;
if (!cfs_rq)
return;
now = rq_of(cfs_rq)->clock;
delta = now - cfs_rq->load_stamp;
/* truncate load history at 4 idle periods */
if (cfs_rq->load_stamp > cfs_rq->load_last &&
now - cfs_rq->load_last > 4 * period) {
cfs_rq->load_period = 0;
cfs_rq->load_avg = 0;
}
cfs_rq->load_stamp = now;
cfs_rq->load_unacc_exec_time = 0;
cfs_rq->load_period += delta;
if (load) {
cfs_rq->load_last = now;
cfs_rq->load_avg += delta * load;
}
/* consider updating load contribution on each fold or truncate */
if (global_update || cfs_rq->load_period > period
|| !cfs_rq->load_period)
update_cfs_rq_load_contribution(cfs_rq, global_update);
while (cfs_rq->load_period > period) {
/*
* Inline assembly required to prevent the compiler
* optimising this loop into a divmod call.
* See __iter_div_u64_rem() for another example of this.
*/
asm("" : "+rm" (cfs_rq->load_period));
cfs_rq->load_period /= 2;
cfs_rq->load_avg /= 2;
}
if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
list_del_leaf_cfs_rq(cfs_rq);
}
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
if (se->on_rq) {
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
account_entity_dequeue(cfs_rq, se);
}
update_load_set(&se->load, weight);
if (se->on_rq)
account_entity_enqueue(cfs_rq, se);
}
static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
{
struct task_group *tg;
struct sched_entity *se;
long load_weight, load, shares;
if (!cfs_rq)
return;
tg = cfs_rq->tg;
se = tg->se[cpu_of(rq_of(cfs_rq))];
if (!se)
return;
load = cfs_rq->load.weight + weight_delta;
load_weight = atomic_read(&tg->load_weight);
load_weight -= cfs_rq->load_contribution;
load_weight += load;
shares = (tg->shares * load);
if (load_weight)
shares /= load_weight;
if (shares < MIN_SHARES)
shares = MIN_SHARES;
if (shares > tg->shares)
shares = tg->shares;
reweight_entity(cfs_rq_of(se), se, shares);
}
static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
{
if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq, 0);
}
}
#else /* CONFIG_FAIR_GROUP_SCHED */
static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
{
}
static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
{
}
static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq, se->load.weight);
account_entity_enqueue(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP) {
@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
check_spread(cfs_rq, se);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
if (cfs_rq->nr_running == 1)
list_add_leaf_cfs_rq(cfs_rq);
}
static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
update_min_vruntime(cfs_rq);
update_cfs_shares(cfs_rq, 0);
/*
* Normalize the entity after updating the min_vruntime because the
@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
*/
update_curr(cfs_rq);
/*
* Update share accounting for long-running entities.
*/
update_entity_shares_tick(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
flags = ENQUEUE_WAKEUP;
}
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq, 0);
}
hrtick_update(rq);
}
@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight)
break;
flags |= DEQUEUE_SLEEP;
}
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq, 0);
}
hrtick_update(rq);
}
@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
* Adding load to a group doesn't make a group heavier, but can cause movement
* of group shares between cpus. Assuming the shares were perfectly aligned one
* can calculate the shift in shares.
*
* The problem is that perfectly aligning the shares is rather expensive, hence
* we try to avoid doing that too often - see update_shares(), which ratelimits
* this change.
*
* We compensate this by not only taking the current delta into account, but
* also considering the delta between when the shares were last adjusted and
* now.
*
* We still saw a performance dip, some tracing learned us that between
* cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
* significantly. Therefore try to bias the error in direction of failing
* the affine wakeup.
*
*/
static long effective_load(struct task_group *tg, int cpu,
long wl, long wg)
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
if (!tg->parent)
return wl;
/*
* By not taking the decrease of shares on the other cpu into
* account our error leans towards reducing the affine wakeups.
*/
if (!wl && sched_feat(ASYM_EFF_LOAD))
return wl;
for_each_sched_entity(se) {
long S, rw, s, a, b;
long more_w;
/*
* Instead of using this increment, also add the difference
* between when the shares were last updated and now.
*/
more_w = se->my_q->load.weight - se->my_q->rq_weight;
wl += more_w;
wg += more_w;
S = se->my_q->tg->shares;
s = se->my_q->shares;
rw = se->my_q->rq_weight;
s = se->load.weight;
rw = se->my_q->load.weight;
a = S*(rw + wl);
b = S*rw + s*wg;
@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
sd = tmp;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
if (sched_feat(LB_SHARES_UPDATE)) {
/*
* Pick the largest domain to update shares over
*/
tmp = sd;
if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
tmp = affine_sd;
if (tmp) {
raw_spin_unlock(&rq->lock);
update_shares(tmp);
raw_spin_lock(&rq->lock);
}
}
#endif
if (affine_sd) {
if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
return select_idle_sibling(p, cpu);
@ -1909,6 +2071,48 @@ out:
}
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* update tg->load_weight by folding this cpu's load_avg
*/
static int update_shares_cpu(struct task_group *tg, int cpu)
{
struct cfs_rq *cfs_rq;
unsigned long flags;
struct rq *rq;
if (!tg->se[cpu])
return 0;
rq = cpu_rq(cpu);
cfs_rq = tg->cfs_rq[cpu];
raw_spin_lock_irqsave(&rq->lock, flags);
update_rq_clock(rq);
update_cfs_load(cfs_rq, 1);
/*
* We need to update shares after updating tg->load_weight in
* order to adjust the weight of groups with long running tasks.
*/
update_cfs_shares(cfs_rq, 0);
raw_spin_unlock_irqrestore(&rq->lock, flags);
return 0;
}
static void update_shares(int cpu)
{
struct cfs_rq *cfs_rq;
struct rq *rq = cpu_rq(cpu);
rcu_read_lock();
for_each_leaf_cfs_rq(rq, cfs_rq)
update_shares_cpu(cfs_rq->tg, cpu);
rcu_read_unlock();
}
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
@ -1956,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
return max_load_move - rem_load_move;
}
#else
static inline void update_shares(int cpu)
{
}
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
@ -3032,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
schedstat_inc(sd, lb_count[idle]);
redo:
update_shares(sd);
group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
cpus, balance);
@ -3174,8 +3381,6 @@ out_one_pinned:
else
ld_moved = 0;
out:
if (ld_moved)
update_shares(sd);
return ld_moved;
}
@ -3199,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
*/
raw_spin_unlock(&this_rq->lock);
update_shares(this_cpu);
for_each_domain(this_cpu, sd) {
unsigned long interval;
int balance = 1;
@ -3569,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
int update_next_balance = 0;
int need_serialize;
update_shares(cpu);
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;

View File

@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
SCHED_FEAT(HRTICK, 0)
SCHED_FEAT(DOUBLE_TICK, 0)
SCHED_FEAT(LB_BIAS, 1)
SCHED_FEAT(LB_SHARES_UPDATE, 1)
SCHED_FEAT(ASYM_EFF_LOAD, 1)
/*
* Spin-wait on mutex acquisition when the mutex owner is running on

View File

@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}
static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
{
list_add_rcu(&rt_rq->leaf_rt_rq_list,
&rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
}
static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
{
list_del_rcu(&rt_rq->leaf_rt_rq_list);
}
#define for_each_leaf_rt_rq(rt_rq, rq) \
list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
return ktime_to_ns(def_rt_bandwidth.rt_period);
}
static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
{
}
static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
{
}
#define for_each_leaf_rt_rq(rt_rq, rq) \
for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
return;
if (!rt_rq->rt_nr_running)
list_add_leaf_rt_rq(rt_rq);
if (head)
list_add(&rt_se->run_list, queue);
else
@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
__clear_bit(rt_se_prio(rt_se), array->bitmap);
dec_rt_tasks(rt_se, rt_rq);
if (!rt_rq->rt_nr_running)
list_del_leaf_rt_rq(rt_rq);
}
/*

View File

@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
cpumask_any(cpu_online_mask));
case CPU_DEAD:
case CPU_DEAD_FROZEN: {
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
static struct sched_param param = {
.sched_priority = MAX_RT_PRIO-1
};
p = per_cpu(ksoftirqd, hotcpu);
per_cpu(ksoftirqd, hotcpu) = NULL;

View File

@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid)
err = session;
out:
write_unlock_irq(&tasklist_lock);
if (err > 0)
if (err > 0) {
proc_sid_connector(group_leader);
sched_autogroup_create_attach(group_leader);
}
return err;
}

View File

@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */
static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
static int min_sched_shares_ratelimit = 100000; /* 100 usec */
static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
#endif
#ifdef CONFIG_COMPACTION
@ -304,15 +302,6 @@ static struct ctl_table kern_table[] = {
.extra1 = &min_wakeup_granularity_ns,
.extra2 = &max_wakeup_granularity_ns,
},
{
.procname = "sched_shares_ratelimit",
.data = &sysctl_sched_shares_ratelimit,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_proc_update_handler,
.extra1 = &min_sched_shares_ratelimit,
.extra2 = &max_sched_shares_ratelimit,
},
{
.procname = "sched_tunable_scaling",
.data = &sysctl_sched_tunable_scaling,
@ -322,14 +311,6 @@ static struct ctl_table kern_table[] = {
.extra1 = &min_sched_tunable_scaling,
.extra2 = &max_sched_tunable_scaling,
},
{
.procname = "sched_shares_thresh",
.data = &sysctl_sched_shares_thresh,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
},
{
.procname = "sched_migration_cost",
.data = &sysctl_sched_migration_cost,
@ -351,6 +332,13 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sched_shares_window",
.data = &sysctl_sched_shares_window,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "timer_migration",
.data = &sysctl_timer_migration,
@ -382,6 +370,17 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
#ifdef CONFIG_SCHED_AUTOGROUP
{
.procname = "sched_autogroup_enabled",
.data = &sysctl_sched_autogroup_enabled,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &zero,
.extra2 = &one,
},
#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",

View File

@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
static int trace_wakeup_test_thread(void *data)
{
/* Make this a RT thread, doesn't need to be too high */
struct sched_param param = { .sched_priority = 5 };
static struct sched_param param = { .sched_priority = 5 };
struct completion *x = data;
sched_setscheduler(current, SCHED_FIFO, &param);

View File

@ -309,7 +309,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
*/
static int watchdog(void *unused)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
sched_setscheduler(current, SCHED_FIFO, &param);