diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index eea2a6f72b31..1ef5e48b3a36 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -301,8 +301,6 @@ static void kvm_register_steal_time(void) if (!has_steal_clock) return; - memset(st, 0, sizeof(*st)); - wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); pr_info("kvm-stealtime: cpu %d, msr %llx\n", cpu, (unsigned long long) slow_virt_to_phys(st)); diff --git a/include/linux/sched.h b/include/linux/sched.h index 253538f29ade..d99218a1e043 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -219,9 +219,10 @@ extern void proc_sched_set_task(struct task_struct *p); #define TASK_WAKING 256 #define TASK_PARKED 512 #define TASK_NOLOAD 1024 -#define TASK_STATE_MAX 2048 +#define TASK_NEW 2048 +#define TASK_STATE_MAX 4096 -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN" +#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; @@ -2139,6 +2140,9 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } +struct task_struct *task_rcu_dereference(struct task_struct **ptask); +struct task_struct *try_get_task_struct(struct task_struct **ptask); + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime); diff --git a/kernel/exit.c b/kernel/exit.c index 9e6e1356e6bb..2fb4d44c51b1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -210,6 +210,82 @@ repeat: goto repeat; } +/* + * Note that if this function returns a valid task_struct pointer (!NULL) + * task->usage must remain >0 for the duration of the RCU critical section. + */ +struct task_struct *task_rcu_dereference(struct task_struct **ptask) +{ + struct sighand_struct *sighand; + struct task_struct *task; + + /* + * We need to verify that release_task() was not called and thus + * delayed_put_task_struct() can't run and drop the last reference + * before rcu_read_unlock(). We check task->sighand != NULL, + * but we can read the already freed and reused memory. + */ +retry: + task = rcu_dereference(*ptask); + if (!task) + return NULL; + + probe_kernel_address(&task->sighand, sighand); + + /* + * Pairs with atomic_dec_and_test() in put_task_struct(). If this task + * was already freed we can not miss the preceding update of this + * pointer. + */ + smp_rmb(); + if (unlikely(task != READ_ONCE(*ptask))) + goto retry; + + /* + * We've re-checked that "task == *ptask", now we have two different + * cases: + * + * 1. This is actually the same task/task_struct. In this case + * sighand != NULL tells us it is still alive. + * + * 2. This is another task which got the same memory for task_struct. + * We can't know this of course, and we can not trust + * sighand != NULL. + * + * In this case we actually return a random value, but this is + * correct. + * + * If we return NULL - we can pretend that we actually noticed that + * *ptask was updated when the previous task has exited. Or pretend + * that probe_slab_address(&sighand) reads NULL. + * + * If we return the new task (because sighand is not NULL for any + * reason) - this is fine too. This (new) task can't go away before + * another gp pass. + * + * And note: We could even eliminate the false positive if re-read + * task->sighand once again to avoid the falsely NULL. But this case + * is very unlikely so we don't care. + */ + if (!sighand) + return NULL; + + return task; +} + +struct task_struct *try_get_task_struct(struct task_struct **ptask) +{ + struct task_struct *task; + + rcu_read_lock(); + task = task_rcu_dereference(ptask); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + return task; +} + /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 51d7105f529a..4e9617a7e7d9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) __sched_fork(clone_flags, p); /* - * We mark the process as running here. This guarantees that + * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_RUNNING; + p->state = TASK_NEW; /* * Make sure we do not leak PI boosting priority to the child. @@ -2383,8 +2383,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; } - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); + init_entity_runnable_average(&p->se); /* * The child is not yet in the pid-hash so no cgroup attach races, @@ -2394,7 +2393,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); - set_task_cpu(p, cpu); + /* + * We're setting the cpu for the first time, we don't migrate, + * so use __set_task_cpu(). + */ + __set_task_cpu(p, cpu); + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); #ifdef CONFIG_SCHED_INFO @@ -2526,16 +2531,18 @@ void wake_up_new_task(struct task_struct *p) struct rq_flags rf; struct rq *rq; - /* Initialize new task's runnable average */ - init_entity_runnable_average(&p->se); raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + p->state = TASK_RUNNING; #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug + * + * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, + * as we're not fully set-up yet. */ - set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); post_init_entity_util_avg(&p->se); @@ -3161,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev) pr_cont("\n"); } #endif + if (panic_on_warn) + panic("scheduling while atomic\n"); + dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -4752,7 +4762,8 @@ out_unlock: * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask * - * Return: 0 on success. An error code otherwise. + * Return: size of CPU mask copied to user_mask_ptr on success. An + * error code otherwise. */ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, unsigned long __user *, user_mask_ptr) @@ -7231,7 +7242,6 @@ static void sched_rq_cpu_starting(unsigned int cpu) struct rq *rq = cpu_rq(cpu); rq->calc_load_update = calc_load_update; - account_reset_rq(rq); update_max_interval(); } @@ -7711,6 +7721,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) INIT_LIST_HEAD(&tg->children); list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); + + online_fair_sched_group(tg); } /* rcu callback to free various structures associated with a task group */ @@ -7739,14 +7751,37 @@ void sched_offline_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -/* change task's runqueue when it moves between groups. - * The caller of this function should have put the task in its new group - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to - * reflect its new group. +static void sched_change_group(struct task_struct *tsk, int type) +{ + struct task_group *tg; + + /* + * All callers are synchronized by task_rq_lock(); we do not use RCU + * which is pointless here. Thus, we pass "true" to task_css_check() + * to prevent lockdep warnings. + */ + tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), + struct task_group, css); + tg = autogroup_task_group(tsk, tg); + tsk->sched_task_group = tg; + +#ifdef CONFIG_FAIR_GROUP_SCHED + if (tsk->sched_class->task_change_group) + tsk->sched_class->task_change_group(tsk, type); + else +#endif + set_task_rq(tsk, task_cpu(tsk)); +} + +/* + * Change task's runqueue when it moves between groups. + * + * The caller of this function should have put the task in its new group by + * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect + * its new group. */ void sched_move_task(struct task_struct *tsk) { - struct task_group *tg; int queued, running; struct rq_flags rf; struct rq *rq; @@ -7761,22 +7796,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) put_prev_task(rq, tsk); - /* - * All callers are synchronized by task_rq_lock(); we do not use RCU - * which is pointless here. Thus, we pass "true" to task_css_check() - * to prevent lockdep warnings. - */ - tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), - struct task_group, css); - tg = autogroup_task_group(tsk, tg); - tsk->sched_task_group = tg; - -#ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk); - else -#endif - set_task_rq(tsk, task_cpu(tsk)); + sched_change_group(tsk, TASK_MOVE_GROUP); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); @@ -8204,15 +8224,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_free_group(tg); } +/* + * This is called before wake_up_new_task(), therefore we really only + * have to set its group bits, all the other stuff does not apply. + */ static void cpu_cgroup_fork(struct task_struct *task) { - sched_move_task(task); + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(task, &rf); + + sched_change_group(task, TASK_SET_GROUP); + + task_rq_unlock(rq, task, &rf); } static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; + int ret = 0; cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED @@ -8223,8 +8255,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (task->sched_class != &fair_sched_class) return -EINVAL; #endif + /* + * Serialize against wake_up_new_task() such that if its + * running, we're sure to observe its full state. + */ + raw_spin_lock_irq(&task->pi_lock); + /* + * Avoid calling sched_move_task() before wake_up_new_task() + * has happened. This would lead to problems with PELT, due to + * move wanting to detach+attach while we're not attached yet. + */ + if (task->state == TASK_NEW) + ret = -EINVAL; + raw_spin_unlock_irq(&task->pi_lock); + + if (ret) + break; } - return 0; + return ret; } static void cpu_cgroup_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 41f85c4d0938..bc0b309c3f19 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -25,15 +25,13 @@ enum cpuacct_stat_index { CPUACCT_STAT_NSTATS, }; -enum cpuacct_usage_index { - CPUACCT_USAGE_USER, /* ... user mode */ - CPUACCT_USAGE_SYSTEM, /* ... kernel mode */ - - CPUACCT_USAGE_NRUSAGE, +static const char * const cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", }; struct cpuacct_usage { - u64 usages[CPUACCT_USAGE_NRUSAGE]; + u64 usages[CPUACCT_STAT_NSTATS]; }; /* track cpu usage of a group of tasks and its child groups */ @@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) } static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; /* - * We allow index == CPUACCT_USAGE_NRUSAGE here to read + * We allow index == CPUACCT_STAT_NSTATS here to read * the sum of suages. */ - BUG_ON(index > CPUACCT_USAGE_NRUSAGE); + BUG_ON(index > CPUACCT_STAT_NSTATS); #ifndef CONFIG_64BIT /* @@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, raw_spin_lock_irq(&cpu_rq(cpu)->lock); #endif - if (index == CPUACCT_USAGE_NRUSAGE) { + if (index == CPUACCT_STAT_NSTATS) { int i = 0; data = 0; - for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) data += cpuusage->usages[i]; } else { data = cpuusage->usages[index]; @@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) raw_spin_lock_irq(&cpu_rq(cpu)->lock); #endif - for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) cpuusage->usages[i] = val; #ifndef CONFIG_64BIT @@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) /* return total cpu usage (in nanoseconds) of a group */ static u64 __cpuusage_read(struct cgroup_subsys_state *css, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct *ca = css_ca(css); u64 totalcpuusage = 0; @@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css, static u64 cpuusage_user_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_USER); + return __cpuusage_read(css, CPUACCT_STAT_USER); } static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); + return __cpuusage_read(css, CPUACCT_STAT_SYSTEM); } static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); + return __cpuusage_read(css, CPUACCT_STAT_NSTATS); } static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, @@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, } static int __cpuacct_percpu_seq_show(struct seq_file *m, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct *ca = css_ca(seq_css(m)); u64 percpu; @@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m, static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER); } static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM); } static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS); } -static const char * const cpuacct_stat_desc[] = { - [CPUACCT_STAT_USER] = "user", - [CPUACCT_STAT_SYSTEM] = "system", -}; +static int cpuacct_all_seq_show(struct seq_file *m, void *V) +{ + struct cpuacct *ca = css_ca(seq_css(m)); + int index; + int cpu; + + seq_puts(m, "cpu"); + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) + seq_printf(m, " %s", cpuacct_stat_desc[index]); + seq_puts(m, "\n"); + + for_each_possible_cpu(cpu) { + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + + seq_printf(m, "%d", cpu); + + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) { +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit + * platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); +#endif + + seq_printf(m, " %llu", cpuusage->usages[index]); + +#ifndef CONFIG_64BIT + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#endif + } + seq_puts(m, "\n"); + } + return 0; +} static int cpuacct_stats_show(struct seq_file *sf, void *v) { struct cpuacct *ca = css_ca(seq_css(sf)); + s64 val[CPUACCT_STAT_NSTATS]; int cpu; - s64 val = 0; + int stat; + memset(val, 0, sizeof(val)); for_each_possible_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; - val = 0; - for_each_possible_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; + val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; + val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; } - val = cputime64_to_clock_t(val); - seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { + seq_printf(sf, "%s %lld\n", + cpuacct_stat_desc[stat], + cputime64_to_clock_t(val[stat])); + } return 0; } @@ -301,6 +329,10 @@ static struct cftype files[] = { .name = "usage_percpu_sys", .seq_show = cpuacct_percpu_sys_seq_show, }, + { + .name = "usage_all", + .seq_show = cpuacct_all_seq_show, + }, { .name = "stat", .seq_show = cpuacct_stats_show, @@ -316,11 +348,11 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; - int index = CPUACCT_USAGE_SYSTEM; + int index = CPUACCT_STAT_SYSTEM; struct pt_regs *regs = task_pt_regs(tsk); if (regs && user_mode(regs)) - index = CPUACCT_USAGE_USER; + index = CPUACCT_STAT_USER; rcu_read_lock(); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 75f98c5498d5..3d60e5d76fdb 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -257,7 +257,7 @@ void account_idle_time(cputime_t cputime) cpustat[CPUTIME_IDLE] += (__force u64) cputime; } -static __always_inline bool steal_account_process_tick(void) +static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies) { #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { @@ -272,14 +272,14 @@ static __always_inline bool steal_account_process_tick(void) * time in jiffies. Lets cast the result to jiffies * granularity and account the rest on the next rounds. */ - steal_jiffies = nsecs_to_jiffies(steal); + steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies); this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); account_steal_time(jiffies_to_cputime(steal_jiffies)); return steal_jiffies; } #endif - return false; + return 0; } /* @@ -346,7 +346,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, u64 cputime = (__force u64) cputime_one_jiffy; u64 *cpustat = kcpustat_this_cpu->cpustat; - if (steal_account_process_tick()) + if (steal_account_process_tick(ULONG_MAX)) return; cputime *= ticks; @@ -477,7 +477,7 @@ void account_process_tick(struct task_struct *p, int user_tick) return; } - if (steal_account_process_tick()) + if (steal_account_process_tick(ULONG_MAX)) return; if (user_tick) @@ -681,12 +681,14 @@ static cputime_t vtime_delta(struct task_struct *tsk) static cputime_t get_vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); - unsigned long delta = now - tsk->vtime_snap; + unsigned long delta_jiffies, steal_jiffies; + delta_jiffies = now - tsk->vtime_snap; + steal_jiffies = steal_account_process_tick(delta_jiffies); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); tsk->vtime_snap = now; - return jiffies_to_cputime(delta); + return jiffies_to_cputime(delta_jiffies - steal_jiffies); } static void __vtime_account_system(struct task_struct *tsk) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0368c393a336..2a0a9995256d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; -#ifdef CONFIG_SCHEDSTATS P(se.nr_migrations); +#ifdef CONFIG_SCHEDSTATS if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c8c5d2d48424..4088eedea763 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -690,6 +690,11 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); + /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: @@ -720,6 +725,8 @@ void post_init_entity_util_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -733,16 +740,42 @@ void post_init_entity_util_avg(struct sched_entity *se) } sa->util_sum = sa->util_avg * LOAD_AVG_MAX; } + + if (entity_is_task(se)) { + struct task_struct *p = task_of(se); + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. + */ + se->avg.last_update_time = now; + return; + } + } + + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); } -#else +#else /* !CONFIG_SMP */ void init_entity_runnable_average(struct sched_entity *se) { } void post_init_entity_util_avg(struct sched_entity *se) { } -#endif +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +{ +} +#endif /* CONFIG_SMP */ /* * Update the current task's runtime statistics. @@ -1303,6 +1336,8 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); + if (p) + get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1370,31 +1405,11 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; - bool assigned = false; rcu_read_lock(); - - raw_spin_lock_irq(&dst_rq->lock); - cur = dst_rq->curr; - /* - * No need to move the exiting task or idle task. - */ - if ((cur->flags & PF_EXITING) || is_idle_task(cur)) + cur = task_rcu_dereference(&dst_rq->curr); + if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) cur = NULL; - else { - /* - * The task_struct must be protected here to protect the - * p->numa_faults access in the task_weight since the - * numa_faults could already be freed in the following path: - * finish_task_switch() - * --> put_task_struct() - * --> __put_task_struct() - * --> task_numa_free() - */ - get_task_struct(cur); - } - - raw_spin_unlock_irq(&dst_rq->lock); /* * Because we have preemption enabled we can get migrated around and @@ -1477,7 +1492,6 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; - put_task_struct(cur); cur = NULL; goto assign; } @@ -1503,16 +1517,9 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: - assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); - /* - * The dst_rq->curr isn't assigned. The protection for task_struct is - * finished. - */ - if (cur && !assigned) - put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, @@ -2866,8 +2873,6 @@ void set_task_rq_fair(struct sched_entity *se, static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); - static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -2914,7 +2919,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) WRITE_ONCE(*ptr, res); \ } while (0) -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ +/** + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages + * @now: current time, as per cfs_rq_clock_task() + * @cfs_rq: cfs_rq to update + * @update_freq: should we call cfs_rq_util_change() or will the call do so + * + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) + * avg. The immediate corollary is that all (fair) tasks must be attached, see + * post_init_entity_util_avg(). + * + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. + * + * Returns true if the load decayed or we removed utilization. It is expected + * that one calls update_tg_load_avg() on this condition, but after you've + * modified the cfs_rq avg (attach/detach), such that we propagate the new + * avg up. + */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { @@ -2969,6 +2990,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) update_tg_load_avg(cfs_rq, 0); } +/** + * attach_entity_load_avg - attach this entity to its cfs_rq load avg + * @cfs_rq: cfs_rq to attach to + * @se: sched_entity to attach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (!sched_feat(ATTACH_AGE_LOAD)) @@ -2977,6 +3006,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* * If we got migrated (either between CPUs or between cgroups) we'll * have aged the average right before clearing @last_update_time. + * + * Or we're fresh through post_init_entity_util_avg(). */ if (se->avg.last_update_time) { __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), @@ -2998,6 +3029,14 @@ skip_aging: cfs_rq_util_change(cfs_rq); } +/** + * detach_entity_load_avg - detach this entity from its cfs_rq load avg + * @cfs_rq: cfs_rq to detach from + * @se: sched_entity to detach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), @@ -3082,11 +3121,14 @@ void remove_entity_load_avg(struct sched_entity *se) u64 last_update_time; /* - * Newly created task or never used group entity should not be removed - * from its (source) cfs_rq + * tasks cannot exit without having gone through wake_up_new_task() -> + * post_init_entity_util_avg() which will have added things to the + * cfs_rq, so we can remove unconditionally. + * + * Similarly for groups, they will have passed through + * post_init_entity_util_avg() before unregister_sched_fair_group() + * calls this. */ - if (se->avg.last_update_time == 0) - return; last_update_time = cfs_rq_last_update_time(cfs_rq); @@ -3109,6 +3151,12 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +{ + return 0; +} + static inline void update_load_avg(struct sched_entity *se, int not_used) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -3698,7 +3746,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) { if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task; + return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; } @@ -3836,13 +3884,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; cfs_rq->throttle_count--; -#ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; } -#endif return 0; } @@ -4195,26 +4241,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - /* Synchronize hierarchical throttle counter: */ - if (unlikely(!cfs_rq->throttle_uptodate)) { - struct rq *rq = rq_of(cfs_rq); - struct cfs_rq *pcfs_rq; - struct task_group *tg; - - cfs_rq->throttle_uptodate = 1; - - /* Get closest up-to-date node, because leaves go first: */ - for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { - pcfs_rq = tg->cfs_rq[cpu_of(rq)]; - if (pcfs_rq->throttle_uptodate) - break; - } - if (tg) { - cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(rq); - } - } - /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -4229,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) throttle_cfs_rq(cfs_rq); } +static void sync_throttle(struct task_group *tg, int cpu) +{ + struct cfs_rq *pcfs_rq, *cfs_rq; + + if (!cfs_bandwidth_used()) + return; + + if (!tg->parent) + return; + + cfs_rq = tg->cfs_rq[cpu]; + pcfs_rq = tg->parent->cfs_rq[cpu]; + + cfs_rq->throttle_count = pcfs_rq->throttle_count; + pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); +} + /* conditionally throttle active cfs_rq's from put_prev_entity() */ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -4368,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} +static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) @@ -4476,7 +4520,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * * note: in the case of encountering a throttled cfs_rq we will * post the final h_nr_running increment below. - */ + */ if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; @@ -8317,31 +8361,17 @@ static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; - int this_cpu = smp_processor_id(); struct rq *rq = this_rq(); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock(&rq->lock); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - - /* - * Not only the cpu but also the task_group of the parent might have - * been changed after parent->se.parent,cfs_rq were copied to - * child->se.parent,cfs_rq. So call __set_task_cpu() to make those - * of child point to valid ones. - */ - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - - update_curr(cfs_rq); - - if (curr) + if (curr) { + update_curr(cfs_rq); se->vruntime = curr->vruntime; + } place_entity(cfs_rq, se, 1); if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { @@ -8354,8 +8384,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); } /* @@ -8411,6 +8440,8 @@ static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; if (!vruntime_normalized(p)) { /* @@ -8422,13 +8453,18 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); } static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8439,7 +8475,10 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; @@ -8499,6 +8538,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED +static void task_set_group_fair(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + + set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? se->parent->depth + 1 : 0; +} + static void task_move_group_fair(struct task_struct *p) { detach_task_cfs_rq(p); @@ -8511,6 +8558,19 @@ static void task_move_group_fair(struct task_struct *p) attach_task_cfs_rq(p); } +static void task_change_group_fair(struct task_struct *p, int type) +{ + switch (type) { + case TASK_SET_GROUP: + task_set_group_fair(p); + break; + + case TASK_MOVE_GROUP: + task_move_group_fair(p); + break; + } +} + void free_fair_sched_group(struct task_group *tg) { int i; @@ -8562,10 +8622,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); - - raw_spin_lock_irq(&rq->lock); - post_init_entity_util_avg(se); - raw_spin_unlock_irq(&rq->lock); } return 1; @@ -8576,6 +8632,23 @@ err: return 0; } +void online_fair_sched_group(struct task_group *tg) +{ + struct sched_entity *se; + struct rq *rq; + int i; + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + se = tg->se[i]; + + raw_spin_lock_irq(&rq->lock); + post_init_entity_util_avg(se); + sync_throttle(tg, i); + raw_spin_unlock_irq(&rq->lock); + } +} + void unregister_fair_sched_group(struct task_group *tg) { unsigned long flags; @@ -8680,6 +8753,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } +void online_fair_sched_group(struct task_group *tg) { } + void unregister_fair_sched_group(struct task_group *tg) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -8739,7 +8814,7 @@ const struct sched_class fair_sched_class = { .update_curr = update_curr_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .task_move_group = task_move_group_fair, + .task_change_group = task_change_group_fair, #endif }; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c5aeedf4e93a..9fb873cfc75c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -201,6 +201,8 @@ exit_idle: */ static void cpu_idle_loop(void) { + int cpu = smp_processor_id(); + while (1) { /* * If the arch has a polling bit, we maintain an invariant: @@ -219,7 +221,7 @@ static void cpu_idle_loop(void) check_pgt_cache(); rmb(); - if (cpu_is_offline(smp_processor_id())) { + if (cpu_is_offline(cpu)) { cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7cbeb92a1cb9..f44da95c70cd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data); extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, @@ -437,7 +438,7 @@ struct cfs_rq { u64 throttled_clock, throttled_clock_task; u64 throttled_clock_task_time; - int throttled, throttle_count, throttle_uptodate; + int throttled, throttle_count; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -1246,8 +1247,11 @@ struct sched_class { void (*update_curr) (struct rq *rq); +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 + #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p); + void (*task_change_group) (struct task_struct *p, int type); #endif }; @@ -1809,16 +1813,3 @@ static inline void cpufreq_trigger_update(u64 time) {} #else /* arch_scale_freq_capacity */ #define arch_scale_freq_invariant() (false) #endif - -static inline void account_reset_rq(struct rq *rq) -{ -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - rq->prev_irq_time = 0; -#endif -#ifdef CONFIG_PARAVIRT - rq->prev_steal_time = 0; -#endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - rq->prev_steal_time_rq = 0; -#endif -}