sched: Simplify cpu-hot-unplug task migration
While discussing the need for sched_idle_next(), Oleg remarked that since
try_to_wake_up() ensures sleeping tasks will end up running on a sane cpu, we
can do away with migrate_live_tasks().

If we then extend the existing hack of migrating current from CPU_DYING to
migrating the full rq worth of tasks from CPU_DYING, the need for the
sched_idle_next() abomination disappears as well, since idle will be the only
possible thread left after the migration thread stops.

This greatly simplifies the hot-unplug task migration path, as can be seen
from the resulting code reduction (and about half the new lines are comments).

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1289851597.2109.547.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
commit 48c5ccae88
parent 92fd4d4d67
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1871,14 +1871,11 @@ extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 
 #ifdef CONFIG_HOTPLUG_CPU
-extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
 extern void idle_task_exit(void);
 #else
 static inline void idle_task_exit(void) {}
 #endif
 
-extern void sched_idle_next(void);
-
 #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
 extern void wake_up_idle_cpu(int cpu);
 #else
kernel/cpu.c (16 lines changed)
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
 }
 
 struct take_cpu_down_param {
-	struct task_struct *caller;
 	unsigned long mod;
 	void *hcpu;
 };
@@ -208,11 +207,6 @@ static int __ref take_cpu_down(void *_param)
 
 	cpu_notify(CPU_DYING | param->mod, param->hcpu);
 
-	if (task_cpu(param->caller) == cpu)
-		move_task_off_dead_cpu(cpu, param->caller);
-	/* Force idle task to run as soon as we yield: it should
-	   immediately notice cpu is offline and die quickly. */
-	sched_idle_next();
 	return 0;
 }
 
@@ -223,7 +217,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	void *hcpu = (void *)(long)cpu;
 	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
 	struct take_cpu_down_param tcd_param = {
-		.caller = current,
 		.mod = mod,
 		.hcpu = hcpu,
 	};
@@ -253,9 +246,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	}
 	BUG_ON(cpu_online(cpu));
 
-	/* Wait for it to sleep (leaving idle task). */
-	while (!idle_cpu(cpu))
-		yield();
+	/*
+	 * The migration_call() CPU_DYING callback will have removed all
+	 * runnable tasks from the cpu, there's only the idle task left now
+	 * that the migration thread is done doing the stop_machine thing.
+	 */
+	BUG_ON(!idle_cpu(cpu));
 
 	/* This actually kills the CPU. */
 	__cpu_die(cpu);
kernel/sched.c (236 lines changed)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2366,18 +2366,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 		return dest_cpu;
 
 	/* No more Mr. Nice Guy. */
-	if (unlikely(dest_cpu >= nr_cpu_ids)) {
-		dest_cpu = cpuset_cpus_allowed_fallback(p);
-		/*
-		 * Don't tell them about moving exiting tasks or
-		 * kernel threads (both mm NULL), since they never
-		 * leave kernel.
-		 */
-		if (p->mm && printk_ratelimit()) {
-			printk(KERN_INFO "process %d (%s) no "
-			       "longer affine to cpu%d\n",
-			       task_pid_nr(p), p->comm, cpu);
-		}
+	dest_cpu = cpuset_cpus_allowed_fallback(p);
+	/*
+	 * Don't tell them about moving exiting tasks or
+	 * kernel threads (both mm NULL), since they never
+	 * leave kernel.
+	 */
+	if (p->mm && printk_ratelimit()) {
+		printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
+				task_pid_nr(p), p->comm, cpu);
 	}
 
 	return dest_cpu;
@@ -5712,96 +5709,6 @@ static int migration_cpu_stop(void *data)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-/*
- * Figure out where task on dead CPU should go, use force if necessary.
- */
-void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
-{
-	struct rq *rq = cpu_rq(dead_cpu);
-	int needs_cpu, uninitialized_var(dest_cpu);
-	unsigned long flags;
-
-	local_irq_save(flags);
-
-	raw_spin_lock(&rq->lock);
-	needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
-	if (needs_cpu)
-		dest_cpu = select_fallback_rq(dead_cpu, p);
-	raw_spin_unlock(&rq->lock);
-	/*
-	 * It can only fail if we race with set_cpus_allowed(),
-	 * in the racer should migrate the task anyway.
-	 */
-	if (needs_cpu)
-		__migrate_task(p, dead_cpu, dest_cpu);
-	local_irq_restore(flags);
-}
-
-/*
- * While a dead CPU has no uninterruptible tasks queued at this point,
- * it might still have a nonzero ->nr_uninterruptible counter, because
- * for performance reasons the counter is not stricly tracking tasks to
- * their home CPUs. So we just add the counter to another CPU's counter,
- * to keep the global sum constant after CPU-down:
- */
-static void migrate_nr_uninterruptible(struct rq *rq_src)
-{
-	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-	unsigned long flags;
-
-	local_irq_save(flags);
-	double_rq_lock(rq_src, rq_dest);
-	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
-	rq_src->nr_uninterruptible = 0;
-	double_rq_unlock(rq_src, rq_dest);
-	local_irq_restore(flags);
-}
-
-/* Run through task list and migrate tasks from the dead cpu. */
-static void migrate_live_tasks(int src_cpu)
-{
-	struct task_struct *p, *t;
-
-	read_lock(&tasklist_lock);
-
-	do_each_thread(t, p) {
-		if (p == current)
-			continue;
-
-		if (task_cpu(p) == src_cpu)
-			move_task_off_dead_cpu(src_cpu, p);
-	} while_each_thread(t, p);
-
-	read_unlock(&tasklist_lock);
-}
-
-/*
- * Schedules idle task to be the next runnable task on current CPU.
- * It does so by boosting its priority to highest possible.
- * Used by CPU offline code.
- */
-void sched_idle_next(void)
-{
-	int this_cpu = smp_processor_id();
-	struct rq *rq = cpu_rq(this_cpu);
-	struct task_struct *p = rq->idle;
-	unsigned long flags;
-
-	/* cpu has to be offline */
-	BUG_ON(cpu_online(this_cpu));
-
-	/*
-	 * Strictly not necessary since rest of the CPUs are stopped by now
-	 * and interrupts disabled on the current cpu.
-	 */
-	raw_spin_lock_irqsave(&rq->lock, flags);
-
-	__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
-
-	activate_task(rq, p, 0);
-
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
 /*
  * Ensures that the idle task is using init_mm right before its cpu goes
@@ -5818,47 +5725,19 @@ void idle_task_exit(void)
 	mmdrop(mm);
 }
 
-/* called under rq->lock with disabled interrupts */
-static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
+/*
+ * While a dead CPU has no uninterruptible tasks queued at this point,
+ * it might still have a nonzero ->nr_uninterruptible counter, because
+ * for performance reasons the counter is not stricly tracking tasks to
+ * their home CPUs. So we just add the counter to another CPU's counter,
+ * to keep the global sum constant after CPU-down:
+ */
+static void migrate_nr_uninterruptible(struct rq *rq_src)
 {
-	struct rq *rq = cpu_rq(dead_cpu);
+	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
 
-	/* Must be exiting, otherwise would be on tasklist. */
-	BUG_ON(!p->exit_state);
-
-	/* Cannot have done final schedule yet: would have vanished. */
-	BUG_ON(p->state == TASK_DEAD);
-
-	get_task_struct(p);
-
-	/*
-	 * Drop lock around migration; if someone else moves it,
-	 * that's OK. No task can be added to this CPU, so iteration is
-	 * fine.
-	 */
-	raw_spin_unlock_irq(&rq->lock);
-	move_task_off_dead_cpu(dead_cpu, p);
-	raw_spin_lock_irq(&rq->lock);
-
-	put_task_struct(p);
-}
-
-/* release_task() removes task from tasklist, so we won't find dead tasks. */
-static void migrate_dead_tasks(unsigned int dead_cpu)
-{
-	struct rq *rq = cpu_rq(dead_cpu);
-	struct task_struct *next;
-
-	for ( ; ; ) {
-		if (!rq->nr_running)
-			break;
-		next = pick_next_task(rq);
-		if (!next)
-			break;
-		next->sched_class->put_prev_task(rq, next);
-		migrate_dead(dead_cpu, next);
-
-	}
+	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
+	rq_src->nr_uninterruptible = 0;
 }
 
 /*
@@ -5869,6 +5748,56 @@ static void calc_global_load_remove(struct rq *rq)
 	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
 	rq->calc_load_active = 0;
 }
+
+/*
+ * Migrate all tasks from the rq, sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we'er in stop_machine() and
+ * there's no concurrency possible, we hold the required locks anyway
+ * because of lock validation efforts.
+ */
+static void migrate_tasks(unsigned int dead_cpu)
+{
+	struct rq *rq = cpu_rq(dead_cpu);
+	struct task_struct *next, *stop = rq->stop;
+	int dest_cpu;
+
+	/*
+	 * Fudge the rq selection such that the below task selection loop
+	 * doesn't get stuck on the currently eligible stop task.
+	 *
+	 * We're currently inside stop_machine() and the rq is either stuck
+	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
+	 * either way we should never end up calling schedule() until we're
+	 * done here.
+	 */
+	rq->stop = NULL;
+
+	for ( ; ; ) {
+		/*
+		 * There's this thread running, bail when that's the only
+		 * remaining thread.
+		 */
+		if (rq->nr_running == 1)
+			break;
+
+		next = pick_next_task(rq);
+		BUG_ON(!next);
+		next->sched_class->put_prev_task(rq, next);
+
+		/* Find suitable destination for @next, with force if needed. */
+		dest_cpu = select_fallback_rq(dead_cpu, next);
+		raw_spin_unlock(&rq->lock);
+
+		__migrate_task(next, dead_cpu, dest_cpu);
+
+		raw_spin_lock(&rq->lock);
+	}
+
+	rq->stop = stop;
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6078,15 +6007,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	unsigned long flags;
 	struct rq *rq = cpu_rq(cpu);
 
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 
 	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
 		rq->calc_load_update = calc_load_update;
 		break;
 
 	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
 		/* Update our root-domain */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
@@ -6098,30 +6025,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 
 #ifdef CONFIG_HOTPLUG_CPU
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		migrate_live_tasks(cpu);
-		/* Idle task back to normal (off runqueue, low prio) */
-		raw_spin_lock_irq(&rq->lock);
-		deactivate_task(rq, rq->idle, 0);
-		__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
-		rq->idle->sched_class = &idle_sched_class;
-		migrate_dead_tasks(cpu);
-		raw_spin_unlock_irq(&rq->lock);
-		migrate_nr_uninterruptible(rq);
-		BUG_ON(rq->nr_running != 0);
-		calc_global_load_remove(rq);
-		break;
-
 	case CPU_DYING:
-	case CPU_DYING_FROZEN:
 		/* Update our root-domain */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 			set_rq_offline(rq);
 		}
+		migrate_tasks(cpu);
+		BUG_ON(rq->nr_running != 1); /* the migration thread */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+		migrate_nr_uninterruptible(rq);
+		calc_global_load_remove(rq);
 		break;
 #endif
 	}
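For readers skimming the hunks above, here is a condensed userspace toy model of the new CPU_DYING drain: pop every task off the dying CPU's runqueue except the currently running stop/migration thread, pick a fallback CPU for each, and leave only idle behind. All names here (toy_rq, toy_task, toy_migrate_tasks, pick_fallback_cpu) are illustrative stand-ins, not kernel symbols; this only mirrors the control flow of migrate_tasks() and select_fallback_rq() in the diff, it is not the implementation.

/*
 * Toy model of the hot-unplug drain introduced above (userspace, not kernel
 * code). Slot 0 of the queue plays the role of the stop/migration thread,
 * which is the one task that must stay put until stop_machine() finishes.
 */
#include <stdio.h>

#define MAX_TASKS 8

struct toy_task {
	char comm[16];
	int cpu;
};

struct toy_rq {
	struct toy_task *queue[MAX_TASKS];
	int nr_running;		/* includes the currently running stop thread */
};

/* Stand-in for select_fallback_rq(): pick any CPU that is still online. */
static int pick_fallback_cpu(int dead_cpu, int nr_cpus)
{
	return (dead_cpu + 1) % nr_cpus;
}

/*
 * Mimics migrate_tasks(): drain the dying CPU's runqueue until only the
 * stop/migration thread (kept in slot 0 here) remains.
 */
static void toy_migrate_tasks(struct toy_rq *rq, int dead_cpu, int nr_cpus)
{
	while (rq->nr_running > 1) {
		struct toy_task *next = rq->queue[--rq->nr_running];
		int dest = pick_fallback_cpu(dead_cpu, nr_cpus);

		next->cpu = dest;
		printf("process %s no longer affine to cpu%d, moved to cpu%d\n",
		       next->comm, dead_cpu, dest);
	}
}

int main(void)
{
	struct toy_task stop = { "migration/2", 2 };
	struct toy_task a = { "worker-a", 2 };
	struct toy_task b = { "worker-b", 2 };
	struct toy_rq rq = { { &stop, &a, &b }, 3 };

	toy_migrate_tasks(&rq, 2, 4);

	/* Mirrors BUG_ON(rq->nr_running != 1): only the stop thread is left. */
	printf("nr_running on the dead cpu: %d (task: %s)\n",
	       rq.nr_running, rq.queue[0]->comm);
	return 0;
}

Sleeping tasks need no such loop at all: as the commit message notes, try_to_wake_up()->select_task_rq() will route them to an online CPU when they wake, which is exactly why migrate_live_tasks() and sched_idle_next() could be deleted.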