cgroup: drop task_lock() protection around task->cgroups
For optimization, task_lock() is additionally used to protect task->cgroups. The optimization is pretty dubious as either css_set_rwsem is grabbed anyway or PF_EXITING already protects task->cgroups. It adds only overhead and confusion at this point. Let's drop task_[un]lock() and update comments accordingly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
commit 0e1d768f1b
parent eaf797abc5
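The rule the patch below encodes: task->cgroups may be dereferenced while holding cgroup_mutex, while holding css_set_rwsem, or on a task whose PF_EXITING is already set, and the task_css_set_check() lockdep annotation is updated to match. As a rough illustration only (not part of this commit), a reader inside kernel/cgroup.c, where css_set_rwsem lives, could satisfy the new annotation by taking css_set_rwsem for read; the helper name inspect_task_css_set() is hypothetical.

/*
 * Illustrative sketch, not from this commit: dereference task->cgroups
 * under css_set_rwsem so the updated task_css_set_check() is satisfied.
 */
static void inspect_task_css_set(struct task_struct *task)
{
        struct css_set *cset;

        down_read(&css_set_rwsem);      /* blocks migration and exit unlinking */
        cset = task_css_set(task);      /* lockdep: css_set_rwsem is held */
        pr_info("task %d: css_set refcount %d\n",
                task_pid_nr(task), atomic_read(&cset->refcount));
        up_read(&css_set_rwsem);
}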
include/linux/cgroup.h

@@ -658,10 +658,12 @@ struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css)
  */
 #ifdef CONFIG_PROVE_RCU
 extern struct mutex cgroup_mutex;
+extern struct rw_semaphore css_set_rwsem;
 #define task_css_set_check(task, __c)                                   \
         rcu_dereference_check((task)->cgroups,                          \
-                              lockdep_is_held(&(task)->alloc_lock) ||   \
-                              lockdep_is_held(&cgroup_mutex) || (__c))
+                              lockdep_is_held(&cgroup_mutex) ||         \
+                              lockdep_is_held(&css_set_rwsem) ||        \
+                              ((task)->flags & PF_EXITING) || (__c))
 #else
 #define task_css_set_check(task, __c)                                   \
         rcu_dereference((task)->cgroups)
kernel/cgroup.c

@@ -80,12 +80,21 @@ static DEFINE_MUTEX(cgroup_tree_mutex);
 /*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
+ *
+ * css_set_rwsem protects task->cgroups pointer, the list of css_set
+ * objects, and the chain of tasks off each css_set.
+ *
+ * These locks are exported if CONFIG_PROVE_RCU so that accessors in
+ * cgroup.h can use them for lockdep annotations.
  */
 #ifdef CONFIG_PROVE_RCU
 DEFINE_MUTEX(cgroup_mutex);
-EXPORT_SYMBOL_GPL(cgroup_mutex);        /* only for lockdep */
+DECLARE_RWSEM(css_set_rwsem);
+EXPORT_SYMBOL_GPL(cgroup_mutex);
+EXPORT_SYMBOL_GPL(css_set_rwsem);
 #else
 static DEFINE_MUTEX(cgroup_mutex);
+static DECLARE_RWSEM(css_set_rwsem);
 #endif
 
 /*
@@ -338,12 +347,6 @@ struct cgrp_cset_link {
 static struct css_set init_css_set;
 static struct cgrp_cset_link init_cgrp_cset_link;
 
-/*
- * css_set_rwsem protects the list of css_set objects, and the chain of
- * tasks off each css_set.
- */
-static DECLARE_RWSEM(css_set_rwsem);
-
 static int css_set_count;
 
 /*
@@ -803,10 +806,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 }
 
 /*
- * There is one global cgroup mutex. We also require taking
- * task_lock() when dereferencing a task's cgroup subsys pointers.
- * See "The task_lock() exception", at the end of this comment.
- *
  * A task must hold cgroup_mutex to modify cgroups.
  *
  * Any task can increment and decrement the count field without lock.
@@ -836,18 +835,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * always has either children cgroups and/or using tasks.  So we don't
  * need a special hack to ensure that top_cgroup cannot be deleted.
  *
- * The task_lock() exception
- *
- * The need for this exception arises from the action of
- * cgroup_attach_task(), which overwrites one task's cgroup pointer with
- * another.  It does so using cgroup_mutex, however there are
- * several performance critical places that need to reference
- * task->cgroup without the expense of grabbing a system global
- * mutex.  Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task's cgroup pointer we use
- * task_lock(), which acts on a spinlock (task->alloc_lock) already in
- * the task_struct routinely used for such matters.
- *
  * P.S.  One more locking exception.  RCU is used to guard the
  * update of a tasks cgroup pointer by cgroup_attach_task()
  */
@@ -1329,8 +1316,6 @@ static void cgroup_enable_task_cg_lists(void)
          */
         read_lock(&tasklist_lock);
         do_each_thread(g, p) {
-                task_lock(p);
-
                 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
                              task_css_set(p) != &init_css_set);
 
@@ -1349,8 +1334,6 @@ static void cgroup_enable_task_cg_lists(void)
                         get_css_set(cset);
                 }
                 spin_unlock_irq(&p->sighand->siglock);
-
-                task_unlock(p);
         } while_each_thread(g, p);
         read_unlock(&tasklist_lock);
 out_unlock:
@@ -1743,11 +1726,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
         old_cset = task_css_set(tsk);
 
         get_css_set(new_cset);
-
-        task_lock(tsk);
         rcu_assign_pointer(tsk->cgroups, new_cset);
-        task_unlock(tsk);
-
         list_move(&tsk->cg_list, &new_cset->mg_tasks);
 
         /*
@@ -1999,8 +1978,7 @@ out_release_tset:
  * @leader: the task or the leader of the threadgroup to be attached
  * @threadgroup: attach the whole threadgroup?
  *
- * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
- * task_lock of @tsk or each thread in the threadgroup individually in turn.
+ * Call holding cgroup_mutex and threadgroup_lock of @leader.
  */
 static int cgroup_attach_task(struct cgroup *dst_cgrp,
                               struct task_struct *leader, bool threadgroup)
@@ -2034,7 +2012,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 /*
  * Find the task_struct of the task to attach by vpid and pass it along to the
  * function to attach either it or all tasks in its threadgroup. Will lock
- * cgroup_mutex and threadgroup; may take task_lock of task.
+ * cgroup_mutex and threadgroup.
  */
 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 {
@@ -4155,12 +4133,6 @@ core_initcall(cgroup_wq_init);
  *  proc_cgroup_show()
  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
  *  - Used for /proc/<pid>/cgroup.
- *  - No need to task_lock(tsk) on this tsk->cgroup reference, as it
- *    doesn't really matter if tsk->cgroup changes after we read it,
- *    and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
- *    anyway.  No need to check that tsk->cgroup != NULL, thanks to
- *    the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
- *    cgroup to top_cgroup.
  */
 
 /* TODO: Use a proper seq_file iterator */
@@ -4310,15 +4282,12 @@ void cgroup_post_fork(struct task_struct *child)
                 struct css_set *cset;
 
                 down_write(&css_set_rwsem);
-                cset = task_css_set_check(current,
-                                          lockdep_is_held(&css_set_rwsem));
-                task_lock(child);
+                cset = task_css_set(current);
                 if (list_empty(&child->cg_list)) {
                         rcu_assign_pointer(child->cgroups, cset);
                         list_add(&child->cg_list, &cset->tasks);
                         get_css_set(cset);
                 }
-                task_unlock(child);
                 up_write(&css_set_rwsem);
         }
 
@@ -4347,27 +4316,13 @@ void cgroup_post_fork(struct task_struct *child)
  * use notify_on_release cgroups where very high task exit scaling
  * is required on large systems.
  *
- * the_top_cgroup_hack:
- *
- *    Set the exiting tasks cgroup to the root cgroup (top_cgroup).
- *
- *    We call cgroup_exit() while the task is still competent to
- *    handle notify_on_release(), then leave the task attached to the
- *    root cgroup in each hierarchy for the remainder of its exit.
- *
- *    To do this properly, we would increment the reference count on
- *    top_cgroup, and near the very end of the kernel/exit.c do_exit()
- *    code we would add a second cgroup function call, to drop that
- *    reference.  This would just create an unnecessary hot spot on
- *    the top_cgroup reference count, to no avail.
- *
- *    Normally, holding a reference to a cgroup without bumping its
- *    count is unsafe.  The cgroup could go away, or someone could
- *    attach us to a different cgroup, decrementing the count on
- *    the first cgroup that we never incremented.  But in this case,
- *    top_cgroup isn't going away, and either task has PF_EXITING set,
- *    which wards off any cgroup_attach_task() attempts, or task is a failed
- *    fork, never visible to cgroup_attach_task.
+ * We set the exiting tasks cgroup to the root cgroup (top_cgroup).  We
+ * call cgroup_exit() while the task is still competent to handle
+ * notify_on_release(), then leave the task attached to the root cgroup in
+ * each hierarchy for the remainder of its exit.  No need to bother with
+ * init_css_set refcnting.  init_css_set never goes away and we can't race
+ * with migration path - either PF_EXITING is visible to migration path or
+ * @tsk never got on the tasklist.
  */
 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 {
@@ -4377,20 +4332,17 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
         int i;
 
         /*
-         * Unlink from the css_set task list if necessary.  Optimistically
-         * check cg_list before taking css_set_rwsem.
+         * Unlink from @tsk from its css_set.  As migration path can't race
+         * with us, we can check cg_list without grabbing css_set_rwsem.
          */
         if (!list_empty(&tsk->cg_list)) {
                 down_write(&css_set_rwsem);
-                if (!list_empty(&tsk->cg_list)) {
-                        list_del_init(&tsk->cg_list);
-                        put_cset = true;
-                }
+                list_del_init(&tsk->cg_list);
                 up_write(&css_set_rwsem);
+                put_cset = true;
         }
 
         /* Reassign the task to the init_css_set. */
-        task_lock(tsk);
         cset = task_css_set(tsk);
         RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
 
@@ -4405,7 +4357,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
                         }
                 }
         }
-        task_unlock(tsk);
 
         if (put_cset)
                 put_css_set(cset, true);