mirror of
https://github.com/torvalds/linux.git
synced 2024-12-25 12:21:37 +00:00
Merge branch 'for-4.21' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo: - Waiman's cgroup2 cpuset support has been finally merged closing one of the last remaining feature gaps. - cgroup.procs could show non-leader threads when cgroup2 threaded mode was used in certain ways. I forgot to push the fix during the last cycle. - A patch to fix mount option parsing when all mount options have been consumed by someone else (LSM). - cgroup_no_v1 boot param can now block named cgroup1 hierarchies too. * 'for-4.21' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: cgroup: Add named hierarchy disabling to cgroup_no_v1 boot param cgroup: fix parsing empty mount option string cpuset: Remove set but not used variable 'cs' cgroup: fix CSS_TASK_ITER_PROCS cgroup: Add .__DEBUG__. prefix to debug file names cpuset: Minor cgroup2 interface updates cpuset: Expose cpuset.cpus.subpartitions with cgroup_debug cpuset: Add documentation about the new "cpuset.sched.partition" flag cpuset: Use descriptive text when reading/writing cpuset.sched.partition cpuset: Expose cpus.effective and mems.effective on cgroup v2 root cpuset: Make generate_sched_domains() work with partition cpuset: Make CPU hotplug work with partition cpuset: Track cpusets that use parent's effective_cpus cpuset: Add an error state to cpuset.sched.partition cpuset: Add new v2 cpuset.sched.partition flag cpuset: Simply allocation and freeing of cpumasks cpuset: Define data structures to support scheduling partition cpuset: Enable cpuset controller in default hierarchy cgroup: remove unnecessary unlikely()
This commit is contained in:
commit
6f9d71c9c7
@ -56,11 +56,13 @@ v1 is available under Documentation/cgroup-v1/.
|
||||
5-3-3-2. IO Latency Interface Files
|
||||
5-4. PID
|
||||
5-4-1. PID Interface Files
|
||||
5-5. Device
|
||||
5-6. RDMA
|
||||
5-6-1. RDMA Interface Files
|
||||
5-7. Misc
|
||||
5-7-1. perf_event
|
||||
5-5. Cpuset
|
||||
5-5-1. Cpuset Interface Files
|
||||
5-6. Device
|
||||
5-7. RDMA
|
||||
5-7-1. RDMA Interface Files
|
||||
5-8. Misc
|
||||
5-8-1. perf_event
|
||||
5-N. Non-normative information
|
||||
5-N-1. CPU controller root cgroup process behaviour
|
||||
5-N-2. IO controller root cgroup process behaviour
|
||||
@ -1610,6 +1612,176 @@ through fork() or clone(). These will return -EAGAIN if the creation
|
||||
of a new process would cause a cgroup policy to be violated.
|
||||
|
||||
|
||||
Cpuset
|
||||
------
|
||||
|
||||
The "cpuset" controller provides a mechanism for constraining
|
||||
the CPU and memory node placement of tasks to only the resources
|
||||
specified in the cpuset interface files in a task's current cgroup.
|
||||
This is especially valuable on large NUMA systems where placing jobs
|
||||
on properly sized subsets of the systems with careful processor and
|
||||
memory placement to reduce cross-node memory access and contention
|
||||
can improve overall system performance.
|
||||
|
||||
The "cpuset" controller is hierarchical. That means the controller
|
||||
cannot use CPUs or memory nodes not allowed in its parent.
|
||||
|
||||
|
||||
Cpuset Interface Files
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
cpuset.cpus
|
||||
A read-write multiple values file which exists on non-root
|
||||
cpuset-enabled cgroups.
|
||||
|
||||
It lists the requested CPUs to be used by tasks within this
|
||||
cgroup. The actual list of CPUs to be granted, however, is
|
||||
subject to constraints imposed by its parent and can differ
|
||||
from the requested CPUs.
|
||||
|
||||
The CPU numbers are comma-separated numbers or ranges.
|
||||
For example:
|
||||
|
||||
# cat cpuset.cpus
|
||||
0-4,6,8-10
|
||||
|
||||
An empty value indicates that the cgroup is using the same
|
||||
setting as the nearest cgroup ancestor with a non-empty
|
||||
"cpuset.cpus" or all the available CPUs if none is found.
|
||||
|
||||
The value of "cpuset.cpus" stays constant until the next update
|
||||
and won't be affected by any CPU hotplug events.
|
||||
|
||||
cpuset.cpus.effective
|
||||
A read-only multiple values file which exists on all
|
||||
cpuset-enabled cgroups.
|
||||
|
||||
It lists the onlined CPUs that are actually granted to this
|
||||
cgroup by its parent. These CPUs are allowed to be used by
|
||||
tasks within the current cgroup.
|
||||
|
||||
If "cpuset.cpus" is empty, the "cpuset.cpus.effective" file shows
|
||||
all the CPUs from the parent cgroup that can be available to
|
||||
be used by this cgroup. Otherwise, it should be a subset of
|
||||
"cpuset.cpus" unless none of the CPUs listed in "cpuset.cpus"
|
||||
can be granted. In this case, it will be treated just like an
|
||||
empty "cpuset.cpus".
|
||||
|
||||
Its value will be affected by CPU hotplug events.
|
||||
|
||||
cpuset.mems
|
||||
A read-write multiple values file which exists on non-root
|
||||
cpuset-enabled cgroups.
|
||||
|
||||
It lists the requested memory nodes to be used by tasks within
|
||||
this cgroup. The actual list of memory nodes granted, however,
|
||||
is subject to constraints imposed by its parent and can differ
|
||||
from the requested memory nodes.
|
||||
|
||||
The memory node numbers are comma-separated numbers or ranges.
|
||||
For example:
|
||||
|
||||
# cat cpuset.mems
|
||||
0-1,3
|
||||
|
||||
An empty value indicates that the cgroup is using the same
|
||||
setting as the nearest cgroup ancestor with a non-empty
|
||||
"cpuset.mems" or all the available memory nodes if none
|
||||
is found.
|
||||
|
||||
The value of "cpuset.mems" stays constant until the next update
|
||||
and won't be affected by any memory node hotplug events.
|
||||
|
||||
cpuset.mems.effective
|
||||
A read-only multiple values file which exists on all
|
||||
cpuset-enabled cgroups.
|
||||
|
||||
It lists the onlined memory nodes that are actually granted to
|
||||
this cgroup by its parent. These memory nodes are allowed to
|
||||
be used by tasks within the current cgroup.
|
||||
|
||||
If "cpuset.mems" is empty, it shows all the memory nodes from the
|
||||
parent cgroup that will be available to be used by this cgroup.
|
||||
Otherwise, it should be a subset of "cpuset.mems" unless none of
|
||||
the memory nodes listed in "cpuset.mems" can be granted. In this
|
||||
case, it will be treated just like an empty "cpuset.mems".
|
||||
|
||||
Its value will be affected by memory node hotplug events.
|
||||
|
||||
cpuset.cpus.partition
|
||||
A read-write single value file which exists on non-root
|
||||
cpuset-enabled cgroups. This flag is owned by the parent cgroup
|
||||
and is not delegatable.
|
||||
|
||||
It accepts only the following input values when written to.
|
||||
|
||||
"root" - a partition root
|
||||
"member" - a non-root member of a partition
|
||||
|
||||
When set to be a partition root, the current cgroup is the
|
||||
root of a new partition or scheduling domain that comprises
|
||||
itself and all its descendants except those that are separate
|
||||
partition roots themselves and their descendants. The root
|
||||
cgroup is always a partition root.
|
||||
|
||||
There are constraints on where a partition root can be set.
|
||||
It can only be set in a cgroup if all the following conditions
|
||||
are true.
|
||||
|
||||
1) The "cpuset.cpus" is not empty and the list of CPUs is
|
||||
exclusive, i.e. they are not shared by any of its siblings.
|
||||
2) The parent cgroup is a partition root.
|
||||
3) The "cpuset.cpus" is also a proper subset of the parent's
|
||||
"cpuset.cpus.effective".
|
||||
4) There are no child cgroups with cpuset enabled. This is for
|
||||
eliminating corner cases that have to be handled if such a
|
||||
condition is allowed.
|
||||
|
||||
Setting it to partition root will take the CPUs away from the
|
||||
effective CPUs of the parent cgroup. Once it is set, this
|
||||
file cannot be reverted back to "member" if there are any child
|
||||
cgroups with cpuset enabled.
|
||||
|
||||
A parent partition cannot distribute all its CPUs to its
|
||||
child partitions. There must be at least one cpu left in the
|
||||
parent partition.
|
||||
|
||||
Once becoming a partition root, changes to "cpuset.cpus" are
|
||||
generally allowed as long as the first condition above is true,
|
||||
the change will not take away all the CPUs from the parent
|
||||
partition and the new "cpuset.cpus" value is a superset of its
|
||||
children's "cpuset.cpus" values.
|
||||
|
||||
Sometimes, external factors like changes to ancestors'
|
||||
"cpuset.cpus" or cpu hotplug can cause the state of the partition
|
||||
root to change. On read, the "cpuset.cpus.partition" file
|
||||
can show the following values.
|
||||
|
||||
"member" Non-root member of a partition
|
||||
"root" Partition root
|
||||
"root invalid" Invalid partition root
|
||||
|
||||
It is a partition root if the first 2 partition root conditions
|
||||
above are true and at least one CPU from "cpuset.cpus" is
|
||||
granted by the parent cgroup.
|
||||
|
||||
A partition root can become invalid if none of the CPUs requested
|
||||
in "cpuset.cpus" can be granted by the parent cgroup or the
|
||||
parent cgroup is no longer a partition root itself. In this
|
||||
case, it is not a real partition even though the restriction
|
||||
of the first partition root condition above will still apply.
|
||||
The cpu affinity of all the tasks in the cgroup will then be
|
||||
associated with CPUs in the nearest ancestor partition.
|
||||
|
||||
An invalid partition root can be transitioned back to a
|
||||
real partition root if at least one of the requested CPUs
|
||||
can now be granted by its parent. In this case, the cpu
|
||||
affinity of all the tasks in the formerly invalid partition
|
||||
will be associated with the CPUs of the newly formed partition.
|
||||
Changing the partition state of an invalid partition root to
|
||||
"member" is always allowed even if child cpusets are present.
|
||||
|
||||
|
||||
Device controller
|
||||
-----------------
|
||||
|
||||
|
@ -486,10 +486,14 @@
|
||||
cut the overhead, others just disable the usage. So
|
||||
only cgroup_disable=memory is actually worthy}
|
||||
|
||||
cgroup_no_v1= [KNL] Disable one, multiple, all cgroup controllers in v1
|
||||
Format: { controller[,controller...] | "all" }
|
||||
cgroup_no_v1= [KNL] Disable cgroup controllers and named hierarchies in v1
|
||||
Format: { { controller | "all" | "named" }
|
||||
[,{ controller | "all" | "named" }...] }
|
||||
Like cgroup_disable, but only applies to cgroup v1;
|
||||
the blacklisted controllers remain available in cgroup2.
|
||||
"all" blacklists all controllers and "named" disables
|
||||
named mounts. Specifying both "all" and "named" disables
|
||||
all v1 hierarchies.
|
||||
|
||||
cgroup.memory= [KNL] Pass options to the cgroup memory controller.
|
||||
Format: <string>
|
||||
|
@ -92,6 +92,7 @@ enum {
|
||||
|
||||
CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */
|
||||
CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */
|
||||
CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */
|
||||
|
||||
/* internal flags, do not use outside cgroup core proper */
|
||||
__CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */
|
||||
|
@ -11,6 +11,8 @@
|
||||
#define TRACE_CGROUP_PATH_LEN 1024
|
||||
extern spinlock_t trace_cgroup_path_lock;
|
||||
extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
|
||||
extern bool cgroup_debug;
|
||||
extern void __init enable_debug_cgroup(void);
|
||||
|
||||
/*
|
||||
* cgroup_path() takes a spin lock. It is good practice not to take
|
||||
|
@ -27,6 +27,9 @@
|
||||
/* Controllers blocked by the commandline in v1 */
|
||||
static u16 cgroup_no_v1_mask;
|
||||
|
||||
/* disable named v1 mounts */
|
||||
static bool cgroup_no_v1_named;
|
||||
|
||||
/*
|
||||
* pidlist destructions need to be flushed on cgroup destruction. Use a
|
||||
* separate workqueue as flush domain.
|
||||
@ -963,6 +966,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
|
||||
}
|
||||
if (!strncmp(token, "name=", 5)) {
|
||||
const char *name = token + 5;
|
||||
|
||||
/* blocked by boot param? */
|
||||
if (cgroup_no_v1_named)
|
||||
return -ENOENT;
|
||||
/* Can't specify an empty name */
|
||||
if (!strlen(name))
|
||||
return -EINVAL;
|
||||
@ -1292,7 +1299,12 @@ static int __init cgroup_no_v1(char *str)
|
||||
|
||||
if (!strcmp(token, "all")) {
|
||||
cgroup_no_v1_mask = U16_MAX;
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!strcmp(token, "named")) {
|
||||
cgroup_no_v1_named = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
for_each_subsys(ss, i) {
|
||||
|
@ -86,6 +86,7 @@ EXPORT_SYMBOL_GPL(css_set_lock);
|
||||
|
||||
DEFINE_SPINLOCK(trace_cgroup_path_lock);
|
||||
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
|
||||
bool cgroup_debug __read_mostly;
|
||||
|
||||
/*
|
||||
* Protects cgroup_idr and css_idr so that IDs can be released without
|
||||
@ -1429,12 +1430,15 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
|
||||
struct cgroup_subsys *ss = cft->ss;
|
||||
|
||||
if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
|
||||
!(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
|
||||
snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
|
||||
cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
|
||||
!(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
|
||||
const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
|
||||
|
||||
snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
|
||||
dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
|
||||
cft->name);
|
||||
else
|
||||
} else {
|
||||
strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
|
||||
}
|
||||
return buf;
|
||||
}
|
||||
|
||||
@ -1774,7 +1778,7 @@ static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
|
||||
|
||||
*root_flags = 0;
|
||||
|
||||
if (!data)
|
||||
if (!data || *data == '\0')
|
||||
return 0;
|
||||
|
||||
while ((token = strsep(&data, ",")) != NULL) {
|
||||
@ -3669,7 +3673,8 @@ restart:
|
||||
continue;
|
||||
if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
|
||||
continue;
|
||||
|
||||
if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
|
||||
continue;
|
||||
if (is_add) {
|
||||
ret = cgroup_add_file(css, cgrp, cft);
|
||||
if (ret) {
|
||||
@ -4232,20 +4237,25 @@ static void css_task_iter_advance(struct css_task_iter *it)
|
||||
|
||||
lockdep_assert_held(&css_set_lock);
|
||||
repeat:
|
||||
/*
|
||||
* Advance iterator to find next entry. cset->tasks is consumed
|
||||
* first and then ->mg_tasks. After ->mg_tasks, we move onto the
|
||||
* next cset.
|
||||
*/
|
||||
next = it->task_pos->next;
|
||||
if (it->task_pos) {
|
||||
/*
|
||||
* Advance iterator to find next entry. cset->tasks is
|
||||
* consumed first and then ->mg_tasks. After ->mg_tasks,
|
||||
* we move onto the next cset.
|
||||
*/
|
||||
next = it->task_pos->next;
|
||||
|
||||
if (next == it->tasks_head)
|
||||
next = it->mg_tasks_head->next;
|
||||
if (next == it->tasks_head)
|
||||
next = it->mg_tasks_head->next;
|
||||
|
||||
if (next == it->mg_tasks_head)
|
||||
if (next == it->mg_tasks_head)
|
||||
css_task_iter_advance_css_set(it);
|
||||
else
|
||||
it->task_pos = next;
|
||||
} else {
|
||||
/* called from start, proceed to the first cset */
|
||||
css_task_iter_advance_css_set(it);
|
||||
else
|
||||
it->task_pos = next;
|
||||
}
|
||||
|
||||
/* if PROCS, skip over tasks which aren't group leaders */
|
||||
if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
|
||||
@ -4285,7 +4295,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
|
||||
|
||||
it->cset_head = it->cset_pos;
|
||||
|
||||
css_task_iter_advance_css_set(it);
|
||||
css_task_iter_advance(it);
|
||||
|
||||
spin_unlock_irq(&css_set_lock);
|
||||
}
|
||||
@ -5773,6 +5783,16 @@ static int __init cgroup_disable(char *str)
|
||||
}
|
||||
__setup("cgroup_disable=", cgroup_disable);
|
||||
|
||||
void __init __weak enable_debug_cgroup(void) { }
|
||||
|
||||
static int __init enable_cgroup_debug(char *str)
|
||||
{
|
||||
cgroup_debug = true;
|
||||
enable_debug_cgroup();
|
||||
return 1;
|
||||
}
|
||||
__setup("cgroup_debug", enable_cgroup_debug);
|
||||
|
||||
/**
|
||||
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
|
||||
* @dentry: directory dentry of interest
|
||||
@ -6008,10 +6028,8 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
|
||||
|
||||
ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
|
||||
|
||||
if (unlikely(ret >= size)) {
|
||||
WARN_ON(1);
|
||||
if (WARN_ON(ret >= size))
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -373,11 +373,9 @@ struct cgroup_subsys debug_cgrp_subsys = {
|
||||
* On v2, debug is an implicit controller enabled by "cgroup_debug" boot
|
||||
* parameter.
|
||||
*/
|
||||
static int __init enable_cgroup_debug(char *str)
|
||||
void __init enable_debug_cgroup(void)
|
||||
{
|
||||
debug_cgrp_subsys.dfl_cftypes = debug_files;
|
||||
debug_cgrp_subsys.implicit_on_dfl = true;
|
||||
debug_cgrp_subsys.threaded = true;
|
||||
return 1;
|
||||
}
|
||||
__setup("cgroup_debug", enable_cgroup_debug);
|
||||
|
Loading…
Reference in New Issue
Block a user