cgroups: mechanism to process each task in a cgroup
Provide cgroup_scan_tasks(), which iterates through every task in a cgroup, calling a test function and a process function for each. And call the process function without holding the css_set_lock lock. The idea is David Rientjes', predicting that such a function will make it much easier in the future to extend things that require access to each task in a cgroup without holding the lock, [akpm@linux-foundation.org: cleanup] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Cliff Wickman <cpw@sgi.com> Cc: Paul Menage <menage@google.com> Cc: Paul Jackson <pj@sgi.com> Acked-by: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
dfc05c259e
commit
31a7df01fd
@ -14,6 +14,7 @@
|
||||
#include <linux/nodemask.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/cgroupstats.h>
|
||||
#include <linux/prio_heap.h>
|
||||
|
||||
#ifdef CONFIG_CGROUPS
|
||||
|
||||
@ -207,6 +208,14 @@ struct cftype {
|
||||
int (*release) (struct inode *inode, struct file *file);
|
||||
};
|
||||
|
||||
struct cgroup_scanner {
|
||||
struct cgroup *cg;
|
||||
int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);
|
||||
void (*process_task)(struct task_struct *p,
|
||||
struct cgroup_scanner *scan);
|
||||
struct ptr_heap *heap;
|
||||
};
|
||||
|
||||
/* Add a new file to the given cgroup directory. Should only be
|
||||
* called by subsystems from within a populate() method */
|
||||
int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys,
|
||||
@ -299,11 +308,16 @@ struct cgroup_iter {
|
||||
* returns NULL or until you want to end the iteration
|
||||
*
|
||||
* 3) call cgroup_iter_end() to destroy the iterator.
|
||||
*
|
||||
* Or, call cgroup_scan_tasks() to iterate through every task in a cpuset.
|
||||
* - cgroup_scan_tasks() holds the css_set_lock when calling the test_task()
|
||||
* callback, but not while calling the process_task() callback.
|
||||
*/
|
||||
void cgroup_iter_start(struct cgroup *cont, struct cgroup_iter *it);
|
||||
struct task_struct *cgroup_iter_next(struct cgroup *cont,
|
||||
struct cgroup_iter *it);
|
||||
void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
|
||||
int cgroup_scan_tasks(struct cgroup_scanner *scan);
|
||||
|
||||
#else /* !CONFIG_CGROUPS */
|
||||
|
||||
|
198
kernel/cgroup.c
198
kernel/cgroup.c
@ -1695,6 +1695,29 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
|
||||
it->task = cg->tasks.next;
|
||||
}
|
||||
|
||||
/*
|
||||
* To reduce the fork() overhead for systems that are not actually
|
||||
* using their cgroups capability, we don't maintain the lists running
|
||||
* through each css_set to its tasks until we see the list actually
|
||||
* used - in other words after the first call to cgroup_iter_start().
|
||||
*
|
||||
* The tasklist_lock is not held here, as do_each_thread() and
|
||||
* while_each_thread() are protected by RCU.
|
||||
*/
|
||||
void cgroup_enable_task_cg_lists(void)
|
||||
{
|
||||
struct task_struct *p, *g;
|
||||
write_lock(&css_set_lock);
|
||||
use_task_css_set_links = 1;
|
||||
do_each_thread(g, p) {
|
||||
task_lock(p);
|
||||
if (list_empty(&p->cg_list))
|
||||
list_add(&p->cg_list, &p->cgroups->tasks);
|
||||
task_unlock(p);
|
||||
} while_each_thread(g, p);
|
||||
write_unlock(&css_set_lock);
|
||||
}
|
||||
|
||||
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
|
||||
{
|
||||
/*
|
||||
@ -1702,18 +1725,9 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
|
||||
* we need to enable the list linking each css_set to its
|
||||
* tasks, and fix up all existing tasks.
|
||||
*/
|
||||
if (!use_task_css_set_links) {
|
||||
struct task_struct *p, *g;
|
||||
write_lock(&css_set_lock);
|
||||
use_task_css_set_links = 1;
|
||||
do_each_thread(g, p) {
|
||||
task_lock(p);
|
||||
if (list_empty(&p->cg_list))
|
||||
list_add(&p->cg_list, &p->cgroups->tasks);
|
||||
task_unlock(p);
|
||||
} while_each_thread(g, p);
|
||||
write_unlock(&css_set_lock);
|
||||
}
|
||||
if (!use_task_css_set_links)
|
||||
cgroup_enable_task_cg_lists();
|
||||
|
||||
read_lock(&css_set_lock);
|
||||
it->cg_link = &cgrp->css_sets;
|
||||
cgroup_advance_iter(cgrp, it);
|
||||
@ -1746,6 +1760,166 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
|
||||
read_unlock(&css_set_lock);
|
||||
}
|
||||
|
||||
static inline int started_after_time(struct task_struct *t1,
|
||||
struct timespec *time,
|
||||
struct task_struct *t2)
|
||||
{
|
||||
int start_diff = timespec_compare(&t1->start_time, time);
|
||||
if (start_diff > 0) {
|
||||
return 1;
|
||||
} else if (start_diff < 0) {
|
||||
return 0;
|
||||
} else {
|
||||
/*
|
||||
* Arbitrarily, if two processes started at the same
|
||||
* time, we'll say that the lower pointer value
|
||||
* started first. Note that t2 may have exited by now
|
||||
* so this may not be a valid pointer any longer, but
|
||||
* that's fine - it still serves to distinguish
|
||||
* between two tasks started (effectively) simultaneously.
|
||||
*/
|
||||
return t1 > t2;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is a callback from heap_insert() and is used to order
|
||||
* the heap.
|
||||
* In this case we order the heap in descending task start time.
|
||||
*/
|
||||
static inline int started_after(void *p1, void *p2)
|
||||
{
|
||||
struct task_struct *t1 = p1;
|
||||
struct task_struct *t2 = p2;
|
||||
return started_after_time(t1, &t2->start_time, t2);
|
||||
}
|
||||
|
||||
/**
|
||||
* cgroup_scan_tasks - iterate though all the tasks in a cgroup
|
||||
* @scan: struct cgroup_scanner containing arguments for the scan
|
||||
*
|
||||
* Arguments include pointers to callback functions test_task() and
|
||||
* process_task().
|
||||
* Iterate through all the tasks in a cgroup, calling test_task() for each,
|
||||
* and if it returns true, call process_task() for it also.
|
||||
* The test_task pointer may be NULL, meaning always true (select all tasks).
|
||||
* Effectively duplicates cgroup_iter_{start,next,end}()
|
||||
* but does not lock css_set_lock for the call to process_task().
|
||||
* The struct cgroup_scanner may be embedded in any structure of the caller's
|
||||
* creation.
|
||||
* It is guaranteed that process_task() will act on every task that
|
||||
* is a member of the cgroup for the duration of this call. This
|
||||
* function may or may not call process_task() for tasks that exit
|
||||
* or move to a different cgroup during the call, or are forked or
|
||||
* move into the cgroup during the call.
|
||||
*
|
||||
* Note that test_task() may be called with locks held, and may in some
|
||||
* situations be called multiple times for the same task, so it should
|
||||
* be cheap.
|
||||
* If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
|
||||
* pre-allocated and will be used for heap operations (and its "gt" member will
|
||||
* be overwritten), else a temporary heap will be used (allocation of which
|
||||
* may cause this function to fail).
|
||||
*/
|
||||
int cgroup_scan_tasks(struct cgroup_scanner *scan)
|
||||
{
|
||||
int retval, i;
|
||||
struct cgroup_iter it;
|
||||
struct task_struct *p, *dropped;
|
||||
/* Never dereference latest_task, since it's not refcounted */
|
||||
struct task_struct *latest_task = NULL;
|
||||
struct ptr_heap tmp_heap;
|
||||
struct ptr_heap *heap;
|
||||
struct timespec latest_time = { 0, 0 };
|
||||
|
||||
if (scan->heap) {
|
||||
/* The caller supplied our heap and pre-allocated its memory */
|
||||
heap = scan->heap;
|
||||
heap->gt = &started_after;
|
||||
} else {
|
||||
/* We need to allocate our own heap memory */
|
||||
heap = &tmp_heap;
|
||||
retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
|
||||
if (retval)
|
||||
/* cannot allocate the heap */
|
||||
return retval;
|
||||
}
|
||||
|
||||
again:
|
||||
/*
|
||||
* Scan tasks in the cgroup, using the scanner's "test_task" callback
|
||||
* to determine which are of interest, and using the scanner's
|
||||
* "process_task" callback to process any of them that need an update.
|
||||
* Since we don't want to hold any locks during the task updates,
|
||||
* gather tasks to be processed in a heap structure.
|
||||
* The heap is sorted by descending task start time.
|
||||
* If the statically-sized heap fills up, we overflow tasks that
|
||||
* started later, and in future iterations only consider tasks that
|
||||
* started after the latest task in the previous pass. This
|
||||
* guarantees forward progress and that we don't miss any tasks.
|
||||
*/
|
||||
heap->size = 0;
|
||||
cgroup_iter_start(scan->cg, &it);
|
||||
while ((p = cgroup_iter_next(scan->cg, &it))) {
|
||||
/*
|
||||
* Only affect tasks that qualify per the caller's callback,
|
||||
* if he provided one
|
||||
*/
|
||||
if (scan->test_task && !scan->test_task(p, scan))
|
||||
continue;
|
||||
/*
|
||||
* Only process tasks that started after the last task
|
||||
* we processed
|
||||
*/
|
||||
if (!started_after_time(p, &latest_time, latest_task))
|
||||
continue;
|
||||
dropped = heap_insert(heap, p);
|
||||
if (dropped == NULL) {
|
||||
/*
|
||||
* The new task was inserted; the heap wasn't
|
||||
* previously full
|
||||
*/
|
||||
get_task_struct(p);
|
||||
} else if (dropped != p) {
|
||||
/*
|
||||
* The new task was inserted, and pushed out a
|
||||
* different task
|
||||
*/
|
||||
get_task_struct(p);
|
||||
put_task_struct(dropped);
|
||||
}
|
||||
/*
|
||||
* Else the new task was newer than anything already in
|
||||
* the heap and wasn't inserted
|
||||
*/
|
||||
}
|
||||
cgroup_iter_end(scan->cg, &it);
|
||||
|
||||
if (heap->size) {
|
||||
for (i = 0; i < heap->size; i++) {
|
||||
struct task_struct *p = heap->ptrs[i];
|
||||
if (i == 0) {
|
||||
latest_time = p->start_time;
|
||||
latest_task = p;
|
||||
}
|
||||
/* Process the task per the caller's callback */
|
||||
scan->process_task(p, scan);
|
||||
put_task_struct(p);
|
||||
}
|
||||
/*
|
||||
* If we had to process any tasks at all, scan again
|
||||
* in case some of them were in the middle of forking
|
||||
* children that didn't get processed.
|
||||
* Not the most efficient way to do it, but it avoids
|
||||
* having to take callback_mutex in the fork path
|
||||
*/
|
||||
goto again;
|
||||
}
|
||||
if (heap == &tmp_heap)
|
||||
heap_free(&tmp_heap);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Stuff for reading the 'tasks' file.
|
||||
*
|
||||
|
Loading…
Reference in New Issue
Block a user