From d6326047576266991d88639e1e9739a9a9a20ef4 Mon Sep 17 00:00:00 2001
From: Chen Ridong
Date: Wed, 24 Jul 2024 10:24:18 +0000
Subject: [PATCH 01/34] cgroup/cpuset: remove child_ecpus_count

The child_ecpus_count variable was previously used to update the sibling
cpumask when the parent's effective_cpus was updated. However, it became
obsolete after commit e2ffe502ba45 ("cgroup/cpuset: Add
cpuset.cpus.exclusive for v2"). It should be removed.

tj: Restored {} for style consistency.

Signed-off-by: Chen Ridong
Acked-by: Waiman Long
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 22 +++-------------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 40ec4abaf440..918268bc03a7 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -188,10 +188,8 @@ struct cpuset {
 	/*
 	 * Default hierarchy only:
 	 * use_parent_ecpus - set if using parent's effective_cpus
-	 * child_ecpus_count - # of children with use_parent_ecpus set
 	 */
 	int use_parent_ecpus;
-	int child_ecpus_count;
 
 	/*
 	 * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
@@ -1512,7 +1510,6 @@ static void reset_partition_data(struct cpuset *cs)
 	if (!cpumask_and(cs->effective_cpus,
 			 parent->effective_cpus, cs->cpus_allowed)) {
 		cs->use_parent_ecpus = true;
-		parent->child_ecpus_count++;
 		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
 	}
 }
@@ -1688,12 +1685,8 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
 	spin_lock_irq(&callback_lock);
 	isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
 	list_add(&cs->remote_sibling, &remote_children);
-	if (cs->use_parent_ecpus) {
-		struct cpuset *parent = parent_cs(cs);
-
+	if (cs->use_parent_ecpus)
 		cs->use_parent_ecpus = false;
-		parent->child_ecpus_count--;
-	}
 	spin_unlock_irq(&callback_lock);
 	update_unbound_workqueue_cpumask(isolcpus_updated);
 
@@ -2318,14 +2311,10 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 		 */
 		if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) {
 			cpumask_copy(tmp->new_cpus, parent->effective_cpus);
-			if (!cp->use_parent_ecpus) {
+			if (!cp->use_parent_ecpus)
 				cp->use_parent_ecpus = true;
-				parent->child_ecpus_count++;
-			}
 		} else if (cp->use_parent_ecpus) {
 			cp->use_parent_ecpus = false;
-			WARN_ON_ONCE(!parent->child_ecpus_count);
-			parent->child_ecpus_count--;
 		}
 
 		if (remote)
@@ -4139,7 +4128,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
 		cs->effective_mems = parent->effective_mems;
 		cs->use_parent_ecpus = true;
-		parent->child_ecpus_count++;
 	}
 	spin_unlock_irq(&callback_lock);
 
@@ -4205,12 +4193,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	    is_sched_load_balance(cs))
 		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
 
-	if (cs->use_parent_ecpus) {
-		struct cpuset *parent = parent_cs(cs);
-
+	if (cs->use_parent_ecpus)
 		cs->use_parent_ecpus = false;
-		parent->child_ecpus_count--;
-	}
 
 	cpuset_dec();
 	clear_bit(CS_ONLINE, &cs->flags);

From d72a00a8485d1cb11ac1a57bf89b02cbd3a405bf Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng
Date: Tue, 30 Jul 2024 03:29:20 +0000
Subject: [PATCH 02/34] cgroup/pids: Avoid spurious event notification

Currently when a process in a group forks and fails due to its parent's
max restriction, all the cgroups from 'pids_forking' to root will
generate event notifications but only the cgroups from 'pids_over_limit'
to root will increase the counter of PIDCG_MAX.
Consider this scenario: there are four groups A, B, C, and D; the
relationships are as follows, and the user is watching C.pids.events:

  root->A->B->C->D

When a process in D forks and fails due to B.max restriction, the user
will get a spurious event notification: when they wake up and read
C.pids.events, they will find that its content has not changed.

To address this issue, only the cgroups from 'pids_over_limit' to root
will have their PIDCG_MAX counters increased and event notifications
generated.

Fixes: 385a635cacfe ("cgroup/pids: Make event counters hierarchical")
Signed-off-by: Xiu Jianfeng
Signed-off-by: Tejun Heo
---
 kernel/cgroup/pids.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index f5cb0ec45b9d..34aa63d7c9c6 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -244,7 +244,6 @@ static void pids_event(struct pids_cgroup *pids_forking,
 		       struct pids_cgroup *pids_over_limit)
 {
 	struct pids_cgroup *p = pids_forking;
-	bool limit = false;
 
 	/* Only log the first time limit is hit. */
 	if (atomic64_inc_return(&p->events_local[PIDCG_FORKFAIL]) == 1) {
@@ -252,20 +251,17 @@ static void pids_event(struct pids_cgroup *pids_forking,
 		pr_cont_cgroup_path(p->css.cgroup);
 		pr_cont("\n");
 	}
-	cgroup_file_notify(&p->events_local_file);
 	if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
-	    cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+	    cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) {
+		cgroup_file_notify(&p->events_local_file);
 		return;
+	}
 
-	for (; parent_pids(p); p = parent_pids(p)) {
-		if (p == pids_over_limit) {
-			limit = true;
-			atomic64_inc(&p->events_local[PIDCG_MAX]);
-			cgroup_file_notify(&p->events_local_file);
-		}
-		if (limit)
-			atomic64_inc(&p->events[PIDCG_MAX]);
+	atomic64_inc(&pids_over_limit->events_local[PIDCG_MAX]);
+	cgroup_file_notify(&pids_over_limit->events_local_file);
+
+	for (p = pids_over_limit; parent_pids(p); p = parent_pids(p)) {
+		atomic64_inc(&p->events[PIDCG_MAX]);
 		cgroup_file_notify(&p->events_file);
 	}
 }

From c149c4a48b19afbf0c383614e57b452d39b154de Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng
Date: Sat, 13 Jul 2024 08:59:16 +0000
Subject: [PATCH 03/34] cgroup/cpuset: Remove cpuset_slab_spread_rotor

Since the SLAB implementation was removed in v6.8, the
cpuset_slab_spread_rotor is no longer used and can be removed.
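For reference, the memory-spread rotor that stays behind follows the same
pattern. A condensed sketch, based on the hunks below and on a simplified
rendering of cpuset_spread_node() (not the verbatim kernel source):

	/* Advance the per-task rotor through current->mems_allowed. */
	static int cpuset_spread_node(int *rotor)
	{
		return *rotor = next_node_in(*rotor, current->mems_allowed);
	}

	int cpuset_mem_spread_node(void)
	{
		/* Lazily seed the rotor with a random node on first use. */
		if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
			current->cpuset_mem_spread_rotor =
				node_random(&current->mems_allowed);

		return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
	}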
Signed-off-by: Xiu Jianfeng Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cpuset.h | 6 ------ include/linux/sched.h | 1 - kernel/cgroup/cpuset.c | 13 ------------- kernel/fork.c | 1 - 4 files changed, 21 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index de4cf0ee96f7..2a6981eeebf8 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -113,7 +113,6 @@ extern int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); extern int cpuset_mem_spread_node(void); -extern int cpuset_slab_spread_node(void); static inline int cpuset_do_page_mem_spread(void) { @@ -246,11 +245,6 @@ static inline int cpuset_mem_spread_node(void) return 0; } -static inline int cpuset_slab_spread_node(void) -{ - return 0; -} - static inline int cpuset_do_page_mem_spread(void) { return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index f8d150343d42..3773c1c8f099 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1243,7 +1243,6 @@ struct task_struct { /* Sequence number to catch updates: */ seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; - int cpuset_slab_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 918268bc03a7..9f9e626dabb0 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -5012,19 +5012,6 @@ int cpuset_mem_spread_node(void) return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); } -/** - * cpuset_slab_spread_node() - On which node to begin search for a slab page - */ -int cpuset_slab_spread_node(void) -{ - if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) - current->cpuset_slab_spread_rotor = - node_random(¤t->mems_allowed); - - return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); -} -EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); - /** * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? * @tsk1: pointer to task_struct of some task. diff --git a/kernel/fork.c b/kernel/fork.c index cc760491f201..6ed634f09621 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2311,7 +2311,6 @@ __latent_entropy struct task_struct *copy_process( #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; - p->cpuset_slab_spread_rotor = NUMA_NO_NODE; seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock); #endif #ifdef CONFIG_TRACE_IRQFLAGS From 4a711dd910d0135b3d262f85612f7e17e0feb989 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 26 Jul 2024 01:05:02 +0000 Subject: [PATCH 04/34] cgroup/cpuset: add decrease attach_in_progress helpers There are several functions to decrease attach_in_progress, and they will wake up cpuset_attach_wq when attach_in_progress is zero. So, add a helper to make it concise. Signed-off-by: Chen Ridong Reviewed-by: Kamalesh Babulal Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 9f9e626dabb0..31aefc3e1a6a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -490,6 +490,26 @@ static inline void check_insane_mems_config(nodemask_t *nodes) } } +/* + * decrease cs->attach_in_progress. + * wake_up cpuset_attach_wq if cs->attach_in_progress==0. 
+ */ +static inline void dec_attach_in_progress_locked(struct cpuset *cs) +{ + lockdep_assert_held(&cpuset_mutex); + + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); +} + +static inline void dec_attach_in_progress(struct cpuset *cs) +{ + mutex_lock(&cpuset_mutex); + dec_attach_in_progress_locked(cs); + mutex_unlock(&cpuset_mutex); +} + /* * Cgroup v2 behavior is used on the "cpus" and "mems" control files when * on default hierarchy or when the cpuset_v2_mode flag is set by mounting @@ -3422,9 +3442,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) cs = css_cs(css); mutex_lock(&cpuset_mutex); - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); + dec_attach_in_progress_locked(cs); if (cs->nr_migrate_dl_tasks) { int cpu = cpumask_any(cs->effective_cpus); @@ -3539,9 +3557,7 @@ out: reset_migrate_dl_data(cs); } - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); + dec_attach_in_progress_locked(cs); mutex_unlock(&cpuset_mutex); } @@ -4284,11 +4300,7 @@ static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) if (same_cs) return; - mutex_lock(&cpuset_mutex); - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); - mutex_unlock(&cpuset_mutex); + dec_attach_in_progress(cs); } /* @@ -4320,10 +4332,7 @@ static void cpuset_fork(struct task_struct *task) guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cpuset_attach_task(cs, task); - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); - + dec_attach_in_progress_locked(cs); mutex_unlock(&cpuset_mutex); } From 93c8332c8373fee415bd79f08d5ba4ba7ca5ad15 Mon Sep 17 00:00:00 2001 From: Xavier Date: Thu, 4 Jul 2024 14:24:43 +0800 Subject: [PATCH 05/34] Union-Find: add a new module in kernel library This patch implements a union-find data structure in the kernel library, which includes operations for allocating nodes, freeing nodes, finding the root of a node, and merging two nodes. Signed-off-by: Xavier Signed-off-by: Tejun Heo --- Documentation/core-api/union_find.rst | 102 ++++++++++++++++++ .../zh_CN/core-api/union_find.rst | 87 +++++++++++++++ MAINTAINERS | 9 ++ include/linux/union_find.h | 41 +++++++ lib/Makefile | 2 +- lib/union_find.c | 49 +++++++++ 6 files changed, 289 insertions(+), 1 deletion(-) create mode 100644 Documentation/core-api/union_find.rst create mode 100644 Documentation/translations/zh_CN/core-api/union_find.rst create mode 100644 include/linux/union_find.h create mode 100644 lib/union_find.c diff --git a/Documentation/core-api/union_find.rst b/Documentation/core-api/union_find.rst new file mode 100644 index 000000000000..2bf0290c9184 --- /dev/null +++ b/Documentation/core-api/union_find.rst @@ -0,0 +1,102 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================== +Union-Find in Linux +==================== + + +:Date: June 21, 2024 +:Author: Xavier + +What is union-find, and what is it used for? +------------------------------------------------ + +Union-find is a data structure used to handle the merging and querying +of disjoint sets. The primary operations supported by union-find are: + + Initialization: Resetting each element as an individual set, with + each set's initial parent node pointing to itself. + Find: Determine which set a particular element belongs to, usually by + returning a “representative element” of that set. This operation + is used to check if two elements are in the same set. 
+  Union: Merge two sets into one.
+
+As a data structure used to maintain sets (groups), union-find is commonly
+utilized to solve problems related to offline queries, dynamic connectivity,
+and graph theory. It is also a key component in Kruskal's algorithm for
+computing the minimum spanning tree, which is crucial in scenarios like
+network routing. Consequently, union-find is widely referenced. Additionally,
+union-find has applications in symbolic computation, register allocation,
+and more.
+
+Space Complexity: O(n), where n is the number of nodes.
+
+Time Complexity: Using path compression can reduce the time complexity of
+the find operation, and using union by rank can reduce the time complexity
+of the union operation. These optimizations reduce the average time
+complexity of each find and union operation to O(α(n)), where α(n) is the
+inverse Ackermann function. This can be roughly considered a constant time
+complexity for practical purposes.
+
+This document covers use of the Linux union-find implementation. For more
+information on the nature and implementation of union-find, see:
+
+  Wikipedia entry on union-find
+    https://en.wikipedia.org/wiki/Disjoint-set_data_structure
+
+Linux implementation of union-find
+-----------------------------------
+
+Linux's union-find implementation resides in the file "lib/union_find.c".
+To use it, "#include <linux/union_find.h>".
+
+The union-find data structure is defined as follows::
+
+	struct uf_node {
+		struct uf_node *parent;
+		unsigned int rank;
+	};
+
+In this structure, parent points to the parent node of the current node.
+The rank field represents the height of the current tree. During a union
+operation, the tree with the smaller rank is attached under the tree with the
+larger rank to maintain balance.
+
+Initializing union-find
+--------------------
+
+You can complete the initialization using either static or initialization
+interface. Initialize the parent pointer to point to itself and set the rank
+to 0.
+Example::
+
+	struct uf_node my_node = UF_INIT_NODE(my_node);
+or
+	uf_node_init(&my_node);
+
+Find the Root Node of union-find
+--------------------------------
+
+This operation is mainly used to determine whether two nodes belong to the same
+set in the union-find. If they have the same root, they are in the same set.
+During the find operation, path compression is performed to improve the
+efficiency of subsequent find operations.
+Example::
+
+	int connected;
+	struct uf_node *root1 = uf_find(&node_1);
+	struct uf_node *root2 = uf_find(&node_2);
+	if (root1 == root2)
+		connected = 1;
+	else
+		connected = 0;
+
+Union Two Sets in union-find
+----------------------------
+
+To union two sets in the union-find, you first find their respective root nodes
+and then link the smaller node to the larger node based on the rank of the root
+nodes.
+Example::
+
+	uf_union(&node_1, &node_2);
diff --git a/Documentation/translations/zh_CN/core-api/union_find.rst b/Documentation/translations/zh_CN/core-api/union_find.rst
new file mode 100644
index 000000000000..a56de57147e9
--- /dev/null
+++ b/Documentation/translations/zh_CN/core-api/union_find.rst
@@ -0,0 +1,87 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/core-api/union_find.rst
+
+===========================
+Linux中的并查集(Union-Find)
+===========================
+
+
+:日期: 2024年6月21日
+:作者: Xavier
+
+何为并查集,它有什么用?
+--------------------- + +并查集是一种数据结构,用于处理一些不交集的合并及查询问题。并查集支持的主要操作: + 初始化:将每个元素初始化为单独的集合,每个集合的初始父节点指向自身 + 查询:查询某个元素属于哪个集合,通常是返回集合中的一个“代表元素”。这个操作是为 + 了判断两个元素是否在同一个集合之中。 + 合并:将两个集合合并为一个。 + +并查集作为一种用于维护集合(组)的数据结构,它通常用于解决一些离线查询、动态连通性和 +图论等相关问题,同时也是用于计算最小生成树的克鲁斯克尔算法中的关键,由于最小生成树在 +网络路由等场景下十分重要,并查集也得到了广泛的引用。此外,并查集在符号计算,寄存器分 +配等方面也有应用。 + +空间复杂度: O(n),n为节点数。 + +时间复杂度:使用路径压缩可以减少查找操作的时间复杂度,使用按秩合并可以减少合并操作的 +时间复杂度,使得并查集每个查询和合并操作的平均时间复杂度仅为O(α(n)),其中α(n)是反阿 +克曼函数,可以粗略地认为并查集的操作有常数的时间复杂度。 + +本文档涵盖了对Linux并查集实现的使用方法。更多关于并查集的性质和实现的信息,参见: + + 维基百科并查集词条 + https://en.wikipedia.org/wiki/Disjoint-set_data_structure + +并查集的Linux实现 +---------------- + +Linux的并查集实现在文件“lib/union_find.c”中。要使用它,需要 +“#include ”。 + +并查集的数据结构定义如下:: + + struct uf_node { + struct uf_node *parent; + unsigned int rank; + }; +其中parent为当前节点的父节点,rank为当前树的高度,在合并时将rank小的节点接到rank大 +的节点下面以增加平衡性。 + +初始化并查集 +--------- + +可以采用静态或初始化接口完成初始化操作。初始化时,parent 指针指向自身,rank 设置 +为 0。 +示例:: + + struct uf_node my_node = UF_INIT_NODE(my_node); +或 + uf_node_init(&my_node); + +查找并查集的根节点 +---------------- + +主要用于判断两个并查集是否属于一个集合,如果根相同,那么他们就是一个集合。在查找过程中 +会对路径进行压缩,提高后续查找效率。 +示例:: + + int connected; + struct uf_node *root1 = uf_find(&node_1); + struct uf_node *root2 = uf_find(&node_2); + if (root1 == root2) + connected = 1; + else + connected = 0; + +合并两个并查集 +------------- + +对于两个相交的并查集进行合并,会首先查找它们各自的根节点,然后根据根节点秩大小,将小的 +节点连接到大的节点下面。 +示例:: + + uf_union(&node_1, &node_2); diff --git a/MAINTAINERS b/MAINTAINERS index 42decde38320..82e3924816d2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23458,6 +23458,15 @@ F: drivers/cdrom/cdrom.c F: include/linux/cdrom.h F: include/uapi/linux/cdrom.h +UNION-FIND +M: Xavier +L: linux-kernel@vger.kernel.org +S: Maintained +F: Documentation/core-api/union_find.rst +F: Documentation/translations/zh_CN/core-api/union_find.rst +F: include/linux/union_find.h +F: lib/union_find.c + UNIVERSAL FLASH STORAGE HOST CONTROLLER DRIVER R: Alim Akhtar R: Avri Altman diff --git a/include/linux/union_find.h b/include/linux/union_find.h new file mode 100644 index 000000000000..cfd49263c138 --- /dev/null +++ b/include/linux/union_find.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_UNION_FIND_H +#define __LINUX_UNION_FIND_H +/** + * union_find.h - union-find data structure implementation + * + * This header provides functions and structures to implement the union-find + * data structure. The union-find data structure is used to manage disjoint + * sets and supports efficient union and find operations. + * + * See Documentation/core-api/union_find.rst for documentation and samples. + */ + +struct uf_node { + struct uf_node *parent; + unsigned int rank; +}; + +/* This macro is used for static initialization of a union-find node. */ +#define UF_INIT_NODE(node) {.parent = &node, .rank = 0} + +/** + * uf_node_init - Initialize a union-find node + * @node: pointer to the union-find node to be initialized + * + * This function sets the parent of the node to itself and + * initializes its rank to 0. 
+ */
+static inline void uf_node_init(struct uf_node *node)
+{
+	node->parent = node;
+	node->rank = 0;
+}
+
+/* find the root of a node */
+struct uf_node *uf_find(struct uf_node *node);
+
+/* Merge two intersecting nodes */
+void uf_union(struct uf_node *node1, struct uf_node *node2);
+
+#endif /* __LINUX_UNION_FIND_H */
diff --git a/lib/Makefile b/lib/Makefile
index 322bb127b4dc..a5e3c1d5b6f9 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -34,7 +34,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
 	 nmi_backtrace.o win_minmax.o memcat_p.o \
-	 buildid.o objpool.o
+	 buildid.o objpool.o union_find.o
 
 lib-$(CONFIG_PRINTK) += dump_stack.o
 lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/union_find.c b/lib/union_find.c
new file mode 100644
index 000000000000..413b0f8adf7a
--- /dev/null
+++ b/lib/union_find.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/union_find.h>
+
+/**
+ * uf_find - Find the root of a node and perform path compression
+ * @node: the node to find the root of
+ *
+ * This function returns the root of the node by following the parent
+ * pointers. It also performs path compression, making the tree shallower.
+ *
+ * Returns the root node of the set containing node.
+ */
+struct uf_node *uf_find(struct uf_node *node)
+{
+	struct uf_node *parent;
+
+	while (node->parent != node) {
+		parent = node->parent;
+		node->parent = parent->parent;
+		node = parent;
+	}
+	return node;
+}
+
+/**
+ * uf_union - Merge two sets, using union by rank
+ * @node1: the first node
+ * @node2: the second node
+ *
+ * This function merges the sets containing node1 and node2, by comparing
+ * the ranks to keep the tree balanced.
+ */
+void uf_union(struct uf_node *node1, struct uf_node *node2)
+{
+	struct uf_node *root1 = uf_find(node1);
+	struct uf_node *root2 = uf_find(node2);
+
+	if (root1 == root2)
+		return;
+
+	if (root1->rank < root2->rank) {
+		root1->parent = root2;
+	} else if (root1->rank > root2->rank) {
+		root2->parent = root1;
+	} else {
+		root2->parent = root1;
+		root1->rank++;
+	}
+}

From 8a895c2e6a7ed264a1b917616db205ed934e8306 Mon Sep 17 00:00:00 2001
From: Xavier
Date: Thu, 4 Jul 2024 14:24:44 +0800
Subject: [PATCH 06/34] cpuset: use Union-Find to optimize the merging of
 cpumasks

The process of constructing scheduling domains involves multiple loops
and repeated evaluations, leading to numerous redundant and ineffective
assessments that impact code efficiency. Here, we use union-find to
optimize the merging of cpumasks. By employing path compression and
union by rank, we effectively reduce the number of lookups and merge
comparisons.
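As a quick illustration of the API this series builds on, here is a
hypothetical caller (not part of this patch) that uses only the
uf_node_init(), uf_find() and uf_union() interfaces added above:

	#include <linux/bug.h>
	#include <linux/union_find.h>

	static void union_find_example(void)
	{
		struct uf_node a, b, c;

		/* Three singleton sets: {a} {b} {c}. */
		uf_node_init(&a);
		uf_node_init(&b);
		uf_node_init(&c);

		uf_union(&a, &b);	/* {a, b} {c} */
		uf_union(&b, &c);	/* {a, b, c} */

		/* All three nodes now resolve to the same root. */
		WARN_ON(uf_find(&a) != uf_find(&c));
	}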
Signed-off-by: Xavier Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 112 ++++++++++++++++------------------------- 1 file changed, 43 insertions(+), 69 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 31aefc3e1a6a..9066f9b4af24 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -46,6 +46,7 @@ #include #include #include +#include DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); @@ -173,9 +174,6 @@ struct cpuset { */ int attach_in_progress; - /* partition number for rebuild_sched_domains() */ - int pn; - /* for custom sched domain */ int relax_domain_level; @@ -207,6 +205,9 @@ struct cpuset { /* Remote partition silbling list anchored at remote_children */ struct list_head remote_sibling; + + /* Used to merge intersecting subsets for generate_sched_domains */ + struct uf_node node; }; /* @@ -1007,18 +1008,15 @@ static inline int nr_cpusets(void) * were changed (added or removed.) * * Finding the best partition (set of domains): - * The triple nested loops below over i, j, k scan over the - * load balanced cpusets (using the array of cpuset pointers in - * csa[]) looking for pairs of cpusets that have overlapping - * cpus_allowed, but which don't have the same 'pn' partition - * number and gives them in the same partition number. It keeps - * looping on the 'restart' label until it can no longer find - * any such pairs. + * The double nested loops below over i, j scan over the load + * balanced cpusets (using the array of cpuset pointers in csa[]) + * looking for pairs of cpusets that have overlapping cpus_allowed + * and merging them using a union-find algorithm. + * + * The union of the cpus_allowed masks from the set of all cpusets + * having the same root then form the one element of the partition + * (one sched domain) to be passed to partition_sched_domains(). * - * The union of the cpus_allowed masks from the set of - * all cpusets having the same 'pn' value then form the one - * element of the partition (one sched domain) to be passed to - * partition_sched_domains(). */ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) @@ -1026,7 +1024,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ - int i, j, k; /* indices for partition finding loops */ + int i, j; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. 
sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ @@ -1034,6 +1032,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct cgroup_subsys_state *pos_css; bool root_load_balance = is_sched_load_balance(&top_cpuset); bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); + int nslot_update; doms = NULL; dattr = NULL; @@ -1121,31 +1120,25 @@ v2: if (root_load_balance && (csn == 1)) goto single_root_domain; - for (i = 0; i < csn; i++) - csa[i]->pn = i; - ndoms = csn; + if (!cgrpv2) { + for (i = 0; i < csn; i++) + uf_node_init(&csa[i]->node); -restart: - /* Find the best partition (set of sched domains) */ - for (i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - int apn = a->pn; - - for (j = 0; j < csn; j++) { - struct cpuset *b = csa[j]; - int bpn = b->pn; - - if (apn != bpn && cpusets_overlap(a, b)) { - for (k = 0; k < csn; k++) { - struct cpuset *c = csa[k]; - - if (c->pn == bpn) - c->pn = apn; - } - ndoms--; /* one less element */ - goto restart; + /* Merge overlapping cpusets */ + for (i = 0; i < csn; i++) { + for (j = i + 1; j < csn; j++) { + if (cpusets_overlap(csa[i], csa[j])) + uf_union(&csa[i]->node, &csa[j]->node); } } + + /* Count the total number of domains */ + for (i = 0; i < csn; i++) { + if (uf_find(&csa[i]->node) == &csa[i]->node) + ndoms++; + } + } else { + ndoms = csn; } /* @@ -1178,44 +1171,25 @@ restart: } for (nslot = 0, i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - struct cpumask *dp; - int apn = a->pn; - - if (apn < 0) { - /* Skip completed partitions */ - continue; - } - - dp = doms[nslot]; - - if (nslot == ndoms) { - static int warnings = 10; - if (warnings) { - pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", - nslot, ndoms, csn, i, apn); - warnings--; - } - continue; - } - - cpumask_clear(dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; + nslot_update = 0; for (j = i; j < csn; j++) { - struct cpuset *b = csa[j]; + if (uf_find(&csa[j]->node) == &csa[i]->node) { + struct cpumask *dp = doms[nslot]; - if (apn == b->pn) { - cpumask_or(dp, dp, b->effective_cpus); + if (i == j) { + nslot_update = 1; + cpumask_clear(dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + } + cpumask_or(dp, dp, csa[j]->effective_cpus); cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (dattr) - update_domain_attr_tree(dattr + nslot, b); - - /* Done with this partition */ - b->pn = -1; + update_domain_attr_tree(dattr + nslot, csa[j]); } } - nslot++; + if (nslot_update) + nslot++; } BUG_ON(nslot != ndoms); From ab03125268679e058e1e7b6612f6d12610761769 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 15 Jul 2024 11:00:34 -0400 Subject: [PATCH 07/34] cgroup: Show # of subsystem CSSes in cgroup.stat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cgroup subsystem state (CSS) is an abstraction in the cgroup layer to help manage different structures in various cgroup subsystems by being an embedded element inside a larger structure like cpuset or mem_cgroup. The /proc/cgroups file shows the number of cgroups for each of the subsystems. With cgroup v1, the number of CSSes is the same as the number of cgroups. That is not the case anymore with cgroup v2. The /proc/cgroups file cannot show the actual number of CSSes for the subsystems that are bound to cgroup v2. 
So if a v2 cgroup subsystem is leaking cgroups (usually memory cgroup),
we can't tell by looking at /proc/cgroups which cgroup subsystems may
be responsible.

As cgroup v2 has deprecated the use of /proc/cgroups, the hierarchical
cgroup.stat file is now being extended to show the number of live and
dying CSSes associated with all the non-inhibited cgroup subsystems
that have been bound to cgroup v2. The number includes CSSes in the
current cgroup as well as in all the descendants underneath it. This
will help us pinpoint which subsystems are responsible for the
increasing number of dying (nr_dying_descendants) cgroups.

The CSSes dying counts are stored in the cgroup structure itself
instead of inside the CSS as suggested by Johannes. This will allow us
to accurately track dying counts of cgroup subsystems that have
recently been disabled in a cgroup. It is now possible that a zero
subsystem number is coupled with a non-zero dying subsystem number.

The cgroup-v2.rst file is updated to discuss this new behavior.

With this patch applied, a sample output from the root cgroup.stat file
is shown below.

	nr_descendants 56
	nr_subsys_cpuset 1
	nr_subsys_cpu 43
	nr_subsys_io 43
	nr_subsys_memory 56
	nr_subsys_perf_event 57
	nr_subsys_hugetlb 1
	nr_subsys_pids 56
	nr_subsys_rdma 1
	nr_subsys_misc 1
	nr_dying_descendants 30
	nr_dying_subsys_cpuset 0
	nr_dying_subsys_cpu 0
	nr_dying_subsys_io 0
	nr_dying_subsys_memory 30
	nr_dying_subsys_perf_event 0
	nr_dying_subsys_hugetlb 0
	nr_dying_subsys_pids 0
	nr_dying_subsys_rdma 0
	nr_dying_subsys_misc 0

Another sample output from system.slice/cgroup.stat was:

	nr_descendants 34
	nr_subsys_cpuset 0
	nr_subsys_cpu 32
	nr_subsys_io 32
	nr_subsys_memory 34
	nr_subsys_perf_event 35
	nr_subsys_hugetlb 0
	nr_subsys_pids 34
	nr_subsys_rdma 0
	nr_subsys_misc 0
	nr_dying_descendants 30
	nr_dying_subsys_cpuset 0
	nr_dying_subsys_cpu 0
	nr_dying_subsys_io 0
	nr_dying_subsys_memory 30
	nr_dying_subsys_perf_event 0
	nr_dying_subsys_hugetlb 0
	nr_dying_subsys_pids 0
	nr_dying_subsys_rdma 0
	nr_dying_subsys_misc 0

Note that the 'debug' controller wasn't used to provide this
information because the controller is not recommended in production
kernels, and many of them won't enable CONFIG_CGROUP_DEBUG by default.
Similar information could be retrieved with debuggers like drgn but
that's also not always available (e.g. lockdown) and the additional
cost of runtime tracking here is deemed marginal.

tj: Added Michal's paragraphs on why this is not added to the debug
controller to the commit message.

Signed-off-by: Waiman Long
Acked-by: Johannes Weiner
Acked-by: Roman Gushchin
Reviewed-by: Kamalesh Babulal
Cc: Michal Koutný
Link: http://lkml.kernel.org/r/20240715150034.2583772-1-longman@redhat.com
Signed-off-by: Tejun Heo
---
 Documentation/admin-guide/cgroup-v2.rst | 12 +++++-
 include/linux/cgroup-defs.h | 14 +++++++
 kernel/cgroup/cgroup.c | 55 ++++++++++++++++++++++++-
 3 files changed, 77 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 86311c2907cd..70cefccd07ce 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -981,6 +981,14 @@ All cgroup core files are prefixed with "cgroup."
 	A dying cgroup can consume system resources not exceeding
 	limits, which were active at the moment of cgroup deletion.
 
+  nr_subsys_<cgroup_subsys>
+	Total number of live cgroup subsystems (e.g. memory
+	cgroup) at and beneath the current cgroup.
+
+  nr_dying_subsys_<cgroup_subsys>
+	Total number of dying cgroup subsystems (e.g.
memory + cgroup) at and beneath the current cgroup. + cgroup.freeze A read-write single value file which exists on non-root cgroups. Allowed values are "0" and "1". The default is "0". @@ -2939,8 +2947,8 @@ Deprecated v1 Core Features - "cgroup.clone_children" is removed. -- /proc/cgroups is meaningless for v2. Use "cgroup.controllers" file - at the root instead. +- /proc/cgroups is meaningless for v2. Use "cgroup.controllers" or + "cgroup.stat" files at the root instead. Issues with v1 and Rationales for v2 diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ae04035b6cbe..eb0f6f349496 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -210,6 +210,14 @@ struct cgroup_subsys_state { * fields of the containing structure. */ struct cgroup_subsys_state *parent; + + /* + * Keep track of total numbers of visible descendant CSSes. + * The total number of dying CSSes is tracked in + * css->cgroup->nr_dying_subsys[ssid]. + * Protected by cgroup_mutex. + */ + int nr_descendants; }; /* @@ -470,6 +478,12 @@ struct cgroup { /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; + /* + * Keep track of total number of dying CSSes at and below this cgroup. + * Protected by cgroup_mutex. + */ + int nr_dying_subsys[CGROUP_SUBSYS_COUNT]; + struct cgroup_root *root; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c8e4b62b436a..601600afdd20 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3669,12 +3669,40 @@ static int cgroup_events_show(struct seq_file *seq, void *v) static int cgroup_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; + struct cgroup_subsys_state *css; + int dying_cnt[CGROUP_SUBSYS_COUNT]; + int ssid; seq_printf(seq, "nr_descendants %d\n", cgroup->nr_descendants); + + /* + * Show the number of live and dying csses associated with each of + * non-inhibited cgroup subsystems that is bound to cgroup v2. + * + * Without proper lock protection, racing is possible. So the + * numbers may not be consistent when that happens. + */ + rcu_read_lock(); + for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { + dying_cnt[ssid] = -1; + if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) || + (cgroup_subsys[ssid]->root != &cgrp_dfl_root)) + continue; + css = rcu_dereference_raw(cgroup->subsys[ssid]); + dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid]; + seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name, + css ? 
(css->nr_descendants + 1) : 0); + } + seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); - + for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { + if (dying_cnt[ssid] >= 0) + seq_printf(seq, "nr_dying_subsys_%s %d\n", + cgroup_subsys[ssid]->name, dying_cnt[ssid]); + } + rcu_read_unlock(); return 0; } @@ -5424,6 +5452,8 @@ static void css_release_work_fn(struct work_struct *work) list_del_rcu(&css->sibling); if (ss) { + struct cgroup *parent_cgrp; + /* css release path */ if (!list_empty(&css->rstat_css_node)) { cgroup_rstat_flush(cgrp); @@ -5433,6 +5463,14 @@ static void css_release_work_fn(struct work_struct *work) cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); + + cgrp->nr_dying_subsys[ss->id]--; + WARN_ON_ONCE(css->nr_descendants || cgrp->nr_dying_subsys[ss->id]); + parent_cgrp = cgroup_parent(cgrp); + while (parent_cgrp) { + parent_cgrp->nr_dying_subsys[ss->id]--; + parent_cgrp = cgroup_parent(parent_cgrp); + } } else { struct cgroup *tcgrp; @@ -5517,8 +5555,11 @@ static int online_css(struct cgroup_subsys_state *css) rcu_assign_pointer(css->cgroup->subsys[ss->id], css); atomic_inc(&css->online_cnt); - if (css->parent) + if (css->parent) { atomic_inc(&css->parent->online_cnt); + while ((css = css->parent)) + css->nr_descendants++; + } } return ret; } @@ -5540,6 +5581,16 @@ static void offline_css(struct cgroup_subsys_state *css) RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); + + css->cgroup->nr_dying_subsys[ss->id]++; + /* + * Parent css and cgroup cannot be freed until after the freeing + * of child css, see css_free_rwork_fn(). + */ + while ((css = css->parent)) { + css->nr_descendants--; + css->cgroup->nr_dying_subsys[ss->id]++; + } } /** From 563ea1f5f85171b68f9075f5c3c22c4b521f1b5e Mon Sep 17 00:00:00 2001 From: Xavier Date: Fri, 2 Aug 2024 11:33:46 +0800 Subject: [PATCH 08/34] Documentation: Fix the compilation errors in union_find.rst Fix the compilation errors and warnings caused by merging Documentation/core-api/union_find.rst and Documentation/translations/zh_CN/core-api/union_find.rst. Signed-off-by: Xavier Signed-off-by: Tejun Heo --- Documentation/core-api/index.rst | 1 + Documentation/core-api/union_find.rst | 6 +++++- .../translations/zh_CN/core-api/index.rst | 1 + .../zh_CN/core-api/union_find.rst | 21 ++++++++++++------- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index f147854700e4..e18a2ffe0787 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -49,6 +49,7 @@ Library functionality that is used throughout the kernel. wrappers/atomic_t wrappers/atomic_bitops floating-point + union_find Low level entry and exit ======================== diff --git a/Documentation/core-api/union_find.rst b/Documentation/core-api/union_find.rst index 2bf0290c9184..6df8b94fdb5a 100644 --- a/Documentation/core-api/union_find.rst +++ b/Documentation/core-api/union_find.rst @@ -16,9 +16,11 @@ of disjoint sets. The primary operations supported by union-find are: Initialization: Resetting each element as an individual set, with each set's initial parent node pointing to itself. + Find: Determine which set a particular element belongs to, usually by returning a “representative element” of that set. This operation is used to check if two elements are in the same set. + Union: Merge two sets into one. 
 As a data structure used to maintain sets (groups), union-find is commonly
@@ -63,7 +65,7 @@ operation, the tree with the smaller rank is attached under the tree with the
 larger rank to maintain balance.
 
 Initializing union-find
---------------------
+-----------------------
 
 You can complete the initialization using either static or initialization
 interface. Initialize the parent pointer to point to itself and set the rank
@@ -71,7 +73,9 @@ to 0.
 Example::
 
 	struct uf_node my_node = UF_INIT_NODE(my_node);
+
 or
+
 	uf_node_init(&my_node);
 
 Find the Root Node of union-find
diff --git a/Documentation/translations/zh_CN/core-api/index.rst b/Documentation/translations/zh_CN/core-api/index.rst
index 922cabf7b5dd..453a02cd1f44 100644
--- a/Documentation/translations/zh_CN/core-api/index.rst
+++ b/Documentation/translations/zh_CN/core-api/index.rst
@@ -49,6 +49,7 @@
    generic-radix-tree
    packing
    this_cpu_ops
+   union_find
 
 =======
diff --git a/Documentation/translations/zh_CN/core-api/union_find.rst b/Documentation/translations/zh_CN/core-api/union_find.rst
index a56de57147e9..bb93fa8c6533 100644
--- a/Documentation/translations/zh_CN/core-api/union_find.rst
+++ b/Documentation/translations/zh_CN/core-api/union_find.rst
@@ -3,21 +3,23 @@
 
 :Original: Documentation/core-api/union_find.rst
 
-===========================
+=============================
 Linux中的并查集(Union-Find)
-===========================
+=============================
 
 
 :日期: 2024年6月21日
 :作者: Xavier
 
 何为并查集,它有什么用?
----------------------
+------------------------
 
 并查集是一种数据结构,用于处理一些不交集的合并及查询问题。并查集支持的主要操作:
-	初始化:将每个元素初始化为单独的集合,每个集合的初始父节点指向自身
+	初始化:将每个元素初始化为单独的集合,每个集合的初始父节点指向自身。
+
 	查询:查询某个元素属于哪个集合,通常是返回集合中的一个“代表元素”。这个操作是为
 	了判断两个元素是否在同一个集合之中。
+
 	合并:将两个集合合并为一个。
 
 并查集作为一种用于维护集合(组)的数据结构,它通常用于解决一些离线查询、动态连通性和
@@ -37,7 +39,7 @@ Linux中的并查集(Union-Find)
 	https://en.wikipedia.org/wiki/Disjoint-set_data_structure
 
 并查集的Linux实现
-----------------
+------------------
 
 Linux的并查集实现在文件“lib/union_find.c”中。要使用它,需要
 “#include <linux/union_find.h>”。
@@ -48,22 +50,25 @@ Linux的并查集实现在文件“lib/union_find.c”中。要使用它,需
 	struct uf_node *parent;
 	unsigned int rank;
 	};
+
 其中parent为当前节点的父节点,rank为当前树的高度,在合并时将rank小的节点接到rank大
 的节点下面以增加平衡性。
 
 初始化并查集
----------
+-------------
 
 可以采用静态或初始化接口完成初始化操作。初始化时,parent 指针指向自身,rank 设置
 为 0。
 示例::
 
 	struct uf_node my_node = UF_INIT_NODE(my_node);
+
 或
+
 	uf_node_init(&my_node);
 
 查找并查集的根节点
----------------
+------------------
 
 主要用于判断两个并查集是否属于一个集合,如果根相同,那么他们就是一个集合。在查找过程中
 会对路径进行压缩,提高后续查找效率。
@@ -78,7 +83,7 @@ Linux的并查集实现在文件“lib/union_find.c”中。要使用它,需
 	connected = 0;
 
 合并两个并查集
-------------
+--------------
 
 对于两个相交的并查集进行合并,会首先查找它们各自的根节点,然后根据根节点秩大小,将小的
 节点连接到大的节点下面。

From 4980f712023a1c0c26a12f5212ced34587d81663 Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng
Date: Mon, 5 Aug 2024 00:43:04 +0000
Subject: [PATCH 09/34] cgroup/pids: Remove unreachable paths of
 pids_{can,cancel}_fork

According to the implementation of cgroup_css_set_fork(), it will fail
if cset cannot be found and the can_fork/cancel_fork methods will not
be called in this case, which means that the argument 'cset' for these
methods must not be NULL, so remove the unreachable paths in them.
Signed-off-by: Xiu Jianfeng Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/pids.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 34aa63d7c9c6..8f61114c36dd 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -272,15 +272,10 @@ static void pids_event(struct pids_cgroup *pids_forking, */ static int pids_can_fork(struct task_struct *task, struct css_set *cset) { - struct cgroup_subsys_state *css; struct pids_cgroup *pids, *pids_over_limit; int err; - if (cset) - css = cset->subsys[pids_cgrp_id]; - else - css = task_css_check(current, pids_cgrp_id, true); - pids = css_pids(css); + pids = css_pids(cset->subsys[pids_cgrp_id]); err = pids_try_charge(pids, 1, &pids_over_limit); if (err) pids_event(pids, pids_over_limit); @@ -290,14 +285,9 @@ static int pids_can_fork(struct task_struct *task, struct css_set *cset) static void pids_cancel_fork(struct task_struct *task, struct css_set *cset) { - struct cgroup_subsys_state *css; struct pids_cgroup *pids; - if (cset) - css = cset->subsys[pids_cgrp_id]; - else - css = task_css_check(current, pids_cgrp_id, true); - pids = css_pids(css); + pids = css_pids(cset->subsys[pids_cgrp_id]); pids_uncharge(pids, 1); } From 99570300d3b4c8a1463491754d58e7a8d87cacef Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 4 Aug 2024 21:30:18 -0400 Subject: [PATCH 10/34] cgroup/cpuset: Check for partition roots with overlapping CPUs With the previous commit that eliminates the overlapping partition root corner cases in the hotplug code, the partition roots passed down to generate_sched_domains() should not have overlapping CPUs. Enable overlapping cpuset check for v2 and warn if that happens. This patch also has the benefit of increasing test coverage of the new Union-Find cpuset merging code to cgroup v2. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e070e391d7a8..e34fd6108b06 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1127,25 +1127,27 @@ v2: if (root_load_balance && (csn == 1)) goto single_root_domain; - if (!cgrpv2) { - for (i = 0; i < csn; i++) - uf_node_init(&csa[i]->node); + for (i = 0; i < csn; i++) + uf_node_init(&csa[i]->node); - /* Merge overlapping cpusets */ - for (i = 0; i < csn; i++) { - for (j = i + 1; j < csn; j++) { - if (cpusets_overlap(csa[i], csa[j])) - uf_union(&csa[i]->node, &csa[j]->node); + /* Merge overlapping cpusets */ + for (i = 0; i < csn; i++) { + for (j = i + 1; j < csn; j++) { + if (cpusets_overlap(csa[i], csa[j])) { + /* + * Cgroup v2 shouldn't pass down overlapping + * partition root cpusets. + */ + WARN_ON_ONCE(cgrpv2); + uf_union(&csa[i]->node, &csa[j]->node); } } + } - /* Count the total number of domains */ - for (i = 0; i < csn; i++) { - if (uf_find(&csa[i]->node) == &csa[i]->node) - ndoms++; - } - } else { - ndoms = csn; + /* Count the total number of domains */ + for (i = 0; i < csn; i++) { + if (uf_find(&csa[i]->node) == &csa[i]->node) + ndoms++; } /* From 92841d6e23de09f180f948f99ddb40135775cedc Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 4 Aug 2024 21:30:19 -0400 Subject: [PATCH 11/34] selftest/cgroup: Add new test cases to test_cpuset_prs.sh Add new test cases to test_cpuset_prs.sh to cover corner cases reported in previous fix commits. 
Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_cpuset_prs.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 7c08cc153367..7295424502b9 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -321,7 +321,7 @@ TEST_MATRIX=( # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # - # Incorrect change to cpuset.cpus invalidates partition root + # Incorrect change to cpuset.cpus[.exclusive] invalidates partition root # # Adding CPUs to partition root that are not in parent's # cpuset.cpus is allowed, but those extra CPUs are ignored. @@ -365,6 +365,16 @@ TEST_MATRIX=( # cpuset.cpus can overlap with sibling cpuset.cpus.exclusive but not subsumed by it " C0-3 . . C4-5 X5 . . . 0 A1:0-3,B1:4-5" + # Child partition root that try to take all CPUs from parent partition + # with tasks will remain invalid. + " C1-4:P1:S+ P1 . . . . . . 0 A1:1-4,A2:1-4 A1:P1,A2:P-1" + " C1-4:P1:S+ P1 . . . C1-4 . . 0 A1,A2:1-4 A1:P1,A2:P1" + " C1-4:P1:S+ P1 . . T C1-4 . . 0 A1:1-4,A2:1-4 A1:P1,A2:P-1" + + # Clearing of cpuset.cpus with a preset cpuset.cpus.exclusive shouldn't + # affect cpuset.cpus.exclusive.effective. + " C1-4:X3:S+ C1:X3 . . . C . . 0 A2:1-4,XA2:3" + # old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS # ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ -------- # Failure cases: From 9b103943ab281d137df1cdb48dcef329a87e0a06 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 8 Aug 2024 23:22:59 -0400 Subject: [PATCH 12/34] cgroup: Fix incorrect WARN_ON_ONCE() in css_release_work_fn() It turns out that the WARN_ON_ONCE() call in css_release_work_fn introduced by commit ab0312526867 ("cgroup: Show # of subsystem CSSes in cgroup.stat") is incorrect. Although css->nr_descendants must be 0 when a css is released and ready to be freed, the corresponding cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem is activated and deactivated multiple times with one or more of its previous activation leaving behind dying csses. Fix the incorrect warning by removing the cgrp->nr_dying_subsys check. Fixes: ab0312526867 ("cgroup: Show # of subsystem CSSes in cgroup.stat") Closes: https://lore.kernel.org/cgroups/6f301773-2fce-4602-a391-8af7ef00b2fb@redhat.com/T/#t Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 601600afdd20..244ec600b4d8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5465,7 +5465,14 @@ static void css_release_work_fn(struct work_struct *work) ss->css_released(css); cgrp->nr_dying_subsys[ss->id]--; - WARN_ON_ONCE(css->nr_descendants || cgrp->nr_dying_subsys[ss->id]); + /* + * When a css is released and ready to be freed, its + * nr_descendants must be zero. However, the corresponding + * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem + * is activated and deactivated multiple times with one or + * more of its previous activation leaving behind dying csses. 
+		 */
+		WARN_ON_ONCE(css->nr_descendants);
 		parent_cgrp = cgroup_parent(cgrp);
 		while (parent_cgrp) {
 			parent_cgrp->nr_dying_subsys[ss->id]--;

From d1a92d2d6c5dbeba9a87bfb57fa0142cdae7b206 Mon Sep 17 00:00:00 2001
From: Chen Ridong
Date: Thu, 15 Aug 2024 13:14:08 +0000
Subject: [PATCH 13/34] cgroup: update some statements about delegation

The comment in cgroup_file_write is missing some interfaces, such as
'cgroup.threads'. All delegatable files are listed in
'/sys/kernel/cgroup/delegate', so update the comment in
cgroup_file_write. Also, add a statement that files outside the
namespace shouldn't be visible from inside the delegated namespace.

tj: Reflowed text for consistency.

Signed-off-by: Chen Ridong
Signed-off-by: Tejun Heo
---
 Documentation/admin-guide/cgroup-v2.rst | 10 ++++++----
 kernel/cgroup/cgroup.c | 2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 70cefccd07ce..75f8c403f77a 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -533,10 +533,12 @@ cgroup namespace on namespace creation.
 Because the resource control interface files in a given directory
 control the distribution of the parent's resources, the delegatee
 shouldn't be allowed to write to them.  For the first method, this is
-achieved by not granting access to these files.  For the second, the
-kernel rejects writes to all files other than "cgroup.procs" and
-"cgroup.subtree_control" on a namespace root from inside the
-namespace.
+achieved by not granting access to these files.  For the second, files
+outside the namespace should be hidden from the delegatee by the means
+of at least mount namespacing, and the kernel rejects writes to all
+files on a namespace root from inside the cgroup namespace, except for
+those files listed in "/sys/kernel/cgroup/delegate" (including
+"cgroup.procs", "cgroup.threads", "cgroup.subtree_control", etc.).
 
 The end results are equivalent for both delegation types.  Once
 delegated, the user can build sub-hierarchy under the directory,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 244ec600b4d8..c72e18ffbfd8 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4124,7 +4124,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
 	 * If namespaces are delegation boundaries, disallow writes to
 	 * files in an non-init namespace root from inside the namespace
 	 * except for the files explicitly marked delegatable -
-	 * cgroup.procs and cgroup.subtree_control.
+	 * e.g. cgroup.procs, cgroup.threads and cgroup.subtree_control.
 	 */
 	if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
 	    !(cft->flags & CFTYPE_NS_DELEGATABLE) &&

From e55f45b4bafee0ef488e815aa88368444ec5c79c Mon Sep 17 00:00:00 2001
From: Chen Ridong
Date: Tue, 20 Aug 2024 03:01:24 +0000
Subject: [PATCH 14/34] cgroup/cpuset: Correct invalid remote partition prs

When enabling a remote partition, I found that:

  cd /sys/fs/cgroup/
  mkdir test
  mkdir test/test1
  echo +cpuset > cgroup.subtree_control
  echo +cpuset > test/cgroup.subtree_control
  echo 3 > test/test1/cpuset.cpus
  echo root > test/test1/cpuset.cpus.partition
  cat test/test1/cpuset.cpus.partition
  root invalid (Parent is not a partition root)

The parent of a remote partition does not have to be a partition root,
so this message is misleading. The failure is actually due to the empty
effective_xcpus. It would be better to prompt the message "invalid cpu
list in cpuset.cpus.exclusive".
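In condensed form, the enable path after this change reads as follows
(abridged from the update_prstate() hunk below; error handling and the
disable paths are omitted):

	if (!old_prs) {
		/*
		 * Local partition if the parent is a valid partition root,
		 * remote partition (CPUs taken from the top cpuset) otherwise.
		 */
		if (is_partition_valid(parent)) {
			enum partition_cmd cmd = (new_prs == PRS_ROOT)
				? partcmd_enable : partcmd_enablei;

			err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
		} else {
			err = remote_partition_enable(cs, new_prs, &tmpmask);
		}
	}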
Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e34fd6108b06..0ae68e0e5733 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -80,6 +80,7 @@ enum prs_errcode { PERR_HOTPLUG, PERR_CPUSEMPTY, PERR_HKEEPING, + PERR_ACCESS, }; static const char * const perr_strings[] = { @@ -91,6 +92,7 @@ static const char * const perr_strings[] = { [PERR_HOTPLUG] = "No cpu available due to hotplug", [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", + [PERR_ACCESS] = "Enable partition not permitted", }; struct cpuset { @@ -1655,7 +1657,7 @@ static inline bool is_local_partition(struct cpuset *cs) * @cs: the cpuset to update * @new_prs: new partition_root_state * @tmp: temparary masks - * Return: 1 if successful, 0 if error + * Return: 0 if successful, errcode if error * * Enable the current cpuset to become a remote partition root taking CPUs * directly from the top cpuset. cpuset_mutex must be held by the caller. @@ -1669,7 +1671,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, * The user must have sysadmin privilege. */ if (!capable(CAP_SYS_ADMIN)) - return 0; + return PERR_ACCESS; /* * The requested exclusive_cpus must not be allocated to other @@ -1683,7 +1685,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, if (cpumask_empty(tmp->new_cpus) || cpumask_intersects(tmp->new_cpus, subpartitions_cpus) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) - return 0; + return PERR_INVCPUS; spin_lock_irq(&callback_lock); isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); @@ -1698,7 +1700,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, */ update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); - return 1; + return 0; } /* @@ -3151,9 +3153,6 @@ static int update_prstate(struct cpuset *cs, int new_prs) goto out; if (!old_prs) { - enum partition_cmd cmd = (new_prs == PRS_ROOT) - ? partcmd_enable : partcmd_enablei; - /* * cpus_allowed and exclusive_cpus cannot be both empty. */ @@ -3162,13 +3161,18 @@ static int update_prstate(struct cpuset *cs, int new_prs) goto out; } - err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); /* - * If an attempt to become local partition root fails, - * try to become a remote partition root instead. + * If parent is valid partition, enable local partiion. + * Otherwise, enable a remote partition. */ - if (err && remote_partition_enable(cs, new_prs, &tmpmask)) - err = 0; + if (is_partition_valid(parent)) { + enum partition_cmd cmd = (new_prs == PRS_ROOT) + ? partcmd_enable : partcmd_enablei; + + err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); + } else { + err = remote_partition_enable(cs, new_prs, &tmpmask); + } } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. From 9414f68d454529ff7e68f0c2aefe0a007060c66a Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 20 Aug 2024 03:01:25 +0000 Subject: [PATCH 15/34] cgroup/cpuset: remove fetch_xcpus Both fetch_xcpus and user_xcpus functions are used to retrieve the value of exclusive_cpus. If exclusive_cpus is not set, cpus_allowed is the implicit value used as exclusive in a local partition. 
I cannot imagine a scenario where effective_xcpus is not empty when
exclusive_cpus is empty. Therefore, I suggest removing the fetch_xcpus
function.

Signed-off-by: Chen Ridong
Reviewed-by: Waiman Long
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0ae68e0e5733..92e79ddc8188 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -771,13 +771,6 @@ static inline bool xcpus_empty(struct cpuset *cs)
 		cpumask_empty(cs->exclusive_cpus);
 }
 
-static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
-{
-	return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus :
-	       cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed
-						  : cs->effective_xcpus;
-}
-
 /*
  * cpusets_are_exclusive() - check if two cpusets are exclusive
  *
@@ -785,8 +778,8 @@ static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
  */
 static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
 {
-	struct cpumask *xcpus1 = fetch_xcpus(cs1);
-	struct cpumask *xcpus2 = fetch_xcpus(cs2);
+	struct cpumask *xcpus1 = user_xcpus(cs1);
+	struct cpumask *xcpus2 = user_xcpus(cs2);
 
 	if (cpumask_intersects(xcpus1, xcpus2))
 		return false;
@@ -2585,7 +2578,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		invalidate = true;
 		rcu_read_lock();
 		cpuset_for_each_child(cp, css, parent) {
-			struct cpumask *xcpus = fetch_xcpus(trialcs);
+			struct cpumask *xcpus = user_xcpus(trialcs);
 
 			if (is_partition_valid(cp) &&
 			    cpumask_intersects(xcpus, cp->effective_xcpus)) {

From 3c2acae88844e7423a50b5cbe0a2c9d430fcd20c Mon Sep 17 00:00:00 2001
From: Chen Ridong
Date: Tue, 20 Aug 2024 03:01:26 +0000
Subject: [PATCH 16/34] cgroup/cpuset: remove use_parent_ecpus of cpuset

use_parent_ecpus is used to track whether the children are using the
parent's effective_cpus. When a parent's effective_cpus is changed due
to changes in a child partition's effective_xcpus, any child using the
parent's effective_cpus must call update_cpumasks_hier. However, if a
child is not a valid partition, it is sufficient to determine whether
to call update_cpumasks_hier based on whether the child's
effective_cpus is going to change. To make the code more succinct, it
is suggested to remove use_parent_ecpus.

Signed-off-by: Chen Ridong
Reviewed-by: Waiman Long
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 30 ++++--------------------------
 1 file changed, 4 insertions(+), 26 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 92e79ddc8188..7db55eed63cf 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -185,12 +185,6 @@ struct cpuset {
 	/* partition root state */
 	int partition_root_state;
 
-	/*
-	 * Default hierarchy only:
-	 * use_parent_ecpus - set if using parent's effective_cpus
-	 */
-	int use_parent_ecpus;
-
 	/*
 	 * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
@@ -1505,11 +1499,8 @@ static void reset_partition_data(struct cpuset *cs) if (is_cpu_exclusive(cs)) clear_bit(CS_CPU_EXCLUSIVE, &cs->flags); } - if (!cpumask_and(cs->effective_cpus, - parent->effective_cpus, cs->cpus_allowed)) { - cs->use_parent_ecpus = true; + if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed)) cpumask_copy(cs->effective_cpus, parent->effective_cpus); - } } /* @@ -1683,8 +1674,6 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, spin_lock_irq(&callback_lock); isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); list_add(&cs->remote_sibling, &remote_children); - if (cs->use_parent_ecpus) - cs->use_parent_ecpus = false; spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); @@ -2309,13 +2298,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * it is a partition root that has explicitly distributed * out all its CPUs. */ - if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) { + if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) cpumask_copy(tmp->new_cpus, parent->effective_cpus); - if (!cp->use_parent_ecpus) - cp->use_parent_ecpus = true; - } else if (cp->use_parent_ecpus) { - cp->use_parent_ecpus = false; - } if (remote) goto get_css; @@ -2452,8 +2436,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * Check all its siblings and call update_cpumasks_hier() * if their effective_cpus will need to be changed. * - * With the addition of effective_xcpus which is a subset of - * cpus_allowed. It is possible a change in parent's effective_cpus + * It is possible a change in parent's effective_cpus * due to a change in a child partition's effective_xcpus will impact * its siblings even if they do not inherit parent's effective_cpus * directly. @@ -2467,8 +2450,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, cpuset_for_each_child(sibling, pos_css, parent) { if (sibling == cs) continue; - if (!sibling->use_parent_ecpus && - !is_partition_valid(sibling)) { + if (!is_partition_valid(sibling)) { compute_effective_cpumask(tmp->new_cpus, sibling, parent); if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) @@ -4128,7 +4110,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; - cs->use_parent_ecpus = true; } spin_unlock_irq(&callback_lock); @@ -4194,9 +4175,6 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - if (cs->use_parent_ecpus) - cs->use_parent_ecpus = false; - cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); From c188f33c864e3dba49a1ad0dc9fddf2f49ac42ae Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 20 Aug 2024 15:55:35 -0400 Subject: [PATCH 17/34] cgroup/cpuset: Account for boot time isolated CPUs With the "isolcpus" boot command line parameter, we are able to create isolated CPUs at boot time. These isolated CPUs aren't fully accounted for in the cpuset code. For instance, the root cgroup's "cpuset.cpus.isolated" control file does not include the boot time isolated CPUs. Fix that by looking for pre-isolated CPUs at init time. The prstate_housekeeping_conflict() function does check the HK_TYPE_DOMAIN housekeeping cpumask to make sure that CPUs outside of it can only be used in isolated partition. 
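(Illustration, not part of the patch: on a hypothetical 8-CPU machine booted with "isolcpus=2-3", the init-time accounting added by this patch would leave

	boot_hk_cpus  = 0-1,4-7
	isolated_cpus = 2-3

so the root cgroup's "cpuset.cpus.isolated" file now reports the boot-time isolated CPUs 2-3 as well, and prstate_housekeeping_conflict() compares new partition requests against the saved boot-time mask rather than the live housekeeping mask.)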
Since we are going to make the housekeeping cpumasks dynamic, the current check may not be right anymore. Save the boot time HK_TYPE_DOMAIN cpumask and check against it instead of the upcoming dynamic HK_TYPE_DOMAIN housekeeping cpumask. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 7db55eed63cf..8b40df89c3c1 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -224,6 +224,12 @@ static cpumask_var_t subpartitions_cpus; */ static cpumask_var_t isolated_cpus; +/* + * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot + */ +static cpumask_var_t boot_hk_cpus; +static bool have_boot_isolcpus; + /* List of remote partition root children */ static struct list_head remote_children; @@ -1823,15 +1829,15 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, * @new_cpus: cpu mask * Return: true if there is conflict, false otherwise * - * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in - * an isolated partition. + * CPUs outside of boot_hk_cpus, if defined, can only be used in an + * isolated partition. */ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) { - const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN); - bool all_in_hk = cpumask_subset(new_cpus, hk_domain); + if (!have_boot_isolcpus) + return false; - if (!all_in_hk && (prstate != PRS_ISOLATED)) + if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) return true; return false; @@ -4345,6 +4351,13 @@ int __init cpuset_init(void) BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); + have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN); + if (have_boot_isolcpus) { + BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL)); + cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus); + } + return 0; } From 43a17fcfcc4294f53ece15425ba362f5e28664c2 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 20 Aug 2024 15:55:36 -0400 Subject: [PATCH 18/34] selftest/cgroup: Make test_cpuset_prs.sh deal with pre-isolated CPUs Since isolated CPUs can be reserved at boot time via the "isolcpus" boot command line option, these pre-isolated CPUs may interfere with testing done by test_cpuset_prs.sh. With the previous commit that incorporates those boot time isolated CPUs into "cpuset.cpus.isolated", we can check for those before testing is started to make sure that there will be no interference. If interference is possible, the test will be skipped, since incorrect test failures could otherwise happen. As "cpuset.cpus.isolated" is now available in a non-cgroup_debug kernel, we don't need to check for its existence anymore. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- .../selftests/cgroup/test_cpuset_prs.sh | 44 ++++++++++++++----- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index 7295424502b9..03c1bdaed2c3 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -84,6 +84,20 @@ echo member > test/cpuset.cpus.partition echo "" > test/cpuset.cpus [[ $RESULT -eq 0 ]] && skip_test "Child cgroups are using cpuset!" +# +# If isolated CPUs have been reserved at boot time (as shown in +# cpuset.cpus.isolated), these isolated CPUs should be outside of CPUs 0-7 +# that will be used by this script for testing purpose. If not, some of +# the tests may fail incorrectly. These isolated CPUs will also be removed +# before being compared with the expected results. +# +BOOT_ISOLCPUS=$(cat $CGROUP2/cpuset.cpus.isolated) +if [[ -n "$BOOT_ISOLCPUS" ]] +then + [[ $(echo $BOOT_ISOLCPUS | sed -e "s/[,-].*//") -le 7 ]] && + skip_test "Pre-isolated CPUs ($BOOT_ISOLCPUS) overlap CPUs to be tested" + echo "Pre-isolated CPUs: $BOOT_ISOLCPUS" +fi cleanup() { online_cpus @@ -642,7 +656,8 @@ check_cgroup_states() # Note that isolated CPUs from the sched/domains context include offline # CPUs as well as CPUs in non-isolated 1-CPU partition. Those CPUs may # not be included in the cpuset.cpus.isolated control file which contains -# only CPUs in isolated partitions. +# only CPUs in isolated partitions as well as those that are isolated at +# boot time. # # $1 - expected isolated cpu list(s) {,} # - expected sched/domains value @@ -669,18 +684,21 @@ check_isolcpus() fi # - # Check the debug isolated cpumask, if present + # Check cpuset.cpus.isolated cpumask # - [[ -f $ISCPUS ]] && { + if [[ -z "$BOOT_ISOLCPUS" ]] + then + ISOLCPUS=$(cat $ISCPUS) + else + ISOLCPUS=$(cat $ISCPUS | sed -e "s/,*$BOOT_ISOLCPUS//") + fi + [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && { + # Take a 50ms pause and try again + pause 0.05 ISOLCPUS=$(cat $ISCPUS) - [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && { - # Take a 50ms pause and try again - pause 0.05 - ISOLCPUS=$(cat $ISCPUS) - } - [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && return 1 - ISOLCPUS= } + [[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && return 1 + ISOLCPUS= # # Use the sched domain in debugfs to check isolated CPUs, if available @@ -713,6 +731,9 @@ check_isolcpus() fi done [[ "$ISOLCPUS" = *- ]] && ISOLCPUS=${ISOLCPUS}$LASTISOLCPU + [[ -n "$BOOT_ISOLCPUS" ]] && + ISOLCPUS=$(echo $ISOLCPUS | sed -e "s/,*$BOOT_ISOLCPUS//") + [[ "$EXPECT_VAL" = "$ISOLCPUS" ]] } @@ -730,7 +751,8 @@ test_fail() } # -# Check to see if there are unexpected isolated CPUs left +# Check to see if there are unexpected isolated CPUs left beyond the boot +# time isolated ones. # null_isolcpus_check() { From 71e934a80863c2a9f4d27ee3360dc17e0a609aa6 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 30 Aug 2024 10:02:18 +0000 Subject: [PATCH 19/34] cgroup/cpuset: introduce cpuset-v1.c This patch introduces the cgroup/cpuset-v1.c source file which will be used for all legacy (cgroup v1) cpuset cgroup code. It also introduces cgroup/cpuset-internal.h to keep declarations shared between cgroup/cpuset.c and cgroup/cpuset-v1.c. As of now, let's compile it if CONFIG_CPUSETS is set. Later on it can be switched to use a separate config option, so that the legacy code won't be compiled if not required.
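(To illustrate the plan above: the Makefile rule could later become something like the following, where CONFIG_CPUSETS_V1 is a hypothetical option name that nothing in this patch adds:

	obj-$(CONFIG_CPUSETS) += cpuset.o
	obj-$(CONFIG_CPUSETS_V1) += cpuset-v1.o

with the new option presumably defaulting to the value of CONFIG_CPUSETS so that existing configurations keep the legacy behavior.)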
Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- MAINTAINERS | 2 ++ kernel/cgroup/Makefile | 2 +- kernel/cgroup/cpuset-internal.h | 6 ++++++ kernel/cgroup/cpuset-v1.c | 3 +++ 4 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 kernel/cgroup/cpuset-internal.h create mode 100644 kernel/cgroup/cpuset-v1.c diff --git a/MAINTAINERS b/MAINTAINERS index 82e3924816d2..3b5ec1cafd95 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5698,6 +5698,8 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git F: Documentation/admin-guide/cgroup-v1/cpusets.rst F: include/linux/cpuset.h +F: kernel/cgroup/cpuset-internal.h +F: kernel/cgroup/cpuset-v1.c F: kernel/cgroup/cpuset.c F: tools/testing/selftests/cgroup/test_cpuset.c F: tools/testing/selftests/cgroup/test_cpuset_prs.sh diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 12f8457ad1f9..005ac4c675cb 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -4,6 +4,6 @@ obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o freezer.o obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o -obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_CPUSETS) += cpuset.o cpuset-v1.o obj-$(CONFIG_CGROUP_MISC) += misc.o obj-$(CONFIG_CGROUP_DEBUG) += debug.o diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h new file mode 100644 index 000000000000..034de3cbf3ad --- /dev/null +++ b/kernel/cgroup/cpuset-internal.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef __CPUSET_INTERNAL_H +#define __CPUSET_INTERNAL_H + +#endif /* __CPUSET_INTERNAL_H */ diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c new file mode 100644 index 000000000000..bdec4b196986 --- /dev/null +++ b/kernel/cgroup/cpuset-v1.c @@ -0,0 +1,3 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "cpuset-internal.h" From 619a33efa0b0cd566f928d60d128033673e478cf Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 30 Aug 2024 10:02:19 +0000 Subject: [PATCH 20/34] cgroup/cpuset: move common code to cpuset-internal.h Move some declarations that will be used for cpuset v1 and v2, including 'cpuset struct', 'cpuset_flagbits_t', cpuset_filetype_t,etc. No logical change. Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 235 +++++++++++++++++++++++++++++++ kernel/cgroup/cpuset.c | 236 +------------------------------- 2 files changed, 236 insertions(+), 235 deletions(-) diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 034de3cbf3ad..ffea3eefebdf 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -3,4 +3,239 @@ #ifndef __CPUSET_INTERNAL_H #define __CPUSET_INTERNAL_H +#include +#include +#include +#include +#include + +/* See "Frequency meter" comments, below. 
*/ + +struct fmeter { + int cnt; /* unprocessed events count */ + int val; /* most recent output value */ + time64_t time; /* clock (secs) when val computed */ + spinlock_t lock; /* guards read or write of above */ +}; + +/* + * Invalid partition error code + */ +enum prs_errcode { + PERR_NONE = 0, + PERR_INVCPUS, + PERR_INVPARENT, + PERR_NOTPART, + PERR_NOTEXCL, + PERR_NOCPUS, + PERR_HOTPLUG, + PERR_CPUSEMPTY, + PERR_HKEEPING, + PERR_ACCESS, +}; + +/* bits in struct cpuset flags field */ +typedef enum { + CS_ONLINE, + CS_CPU_EXCLUSIVE, + CS_MEM_EXCLUSIVE, + CS_MEM_HARDWALL, + CS_MEMORY_MIGRATE, + CS_SCHED_LOAD_BALANCE, + CS_SPREAD_PAGE, + CS_SPREAD_SLAB, +} cpuset_flagbits_t; + +/* The various types of files and directories in a cpuset file system */ + +typedef enum { + FILE_MEMORY_MIGRATE, + FILE_CPULIST, + FILE_MEMLIST, + FILE_EFFECTIVE_CPULIST, + FILE_EFFECTIVE_MEMLIST, + FILE_SUBPARTS_CPULIST, + FILE_EXCLUSIVE_CPULIST, + FILE_EFFECTIVE_XCPULIST, + FILE_ISOLATED_CPULIST, + FILE_CPU_EXCLUSIVE, + FILE_MEM_EXCLUSIVE, + FILE_MEM_HARDWALL, + FILE_SCHED_LOAD_BALANCE, + FILE_PARTITION_ROOT, + FILE_SCHED_RELAX_DOMAIN_LEVEL, + FILE_MEMORY_PRESSURE_ENABLED, + FILE_MEMORY_PRESSURE, + FILE_SPREAD_PAGE, + FILE_SPREAD_SLAB, +} cpuset_filetype_t; + +struct cpuset { + struct cgroup_subsys_state css; + + unsigned long flags; /* "unsigned long" so bitops work */ + + /* + * On default hierarchy: + * + * The user-configured masks can only be changed by writing to + * cpuset.cpus and cpuset.mems, and won't be limited by the + * parent masks. + * + * The effective masks is the real masks that apply to the tasks + * in the cpuset. They may be changed if the configured masks are + * changed or hotplug happens. + * + * effective_mask == configured_mask & parent's effective_mask, + * and if it ends up empty, it will inherit the parent's mask. + * + * + * On legacy hierarchy: + * + * The user-configured masks are always the same with effective masks. + */ + + /* user-configured CPUs and Memory Nodes allow to tasks */ + cpumask_var_t cpus_allowed; + nodemask_t mems_allowed; + + /* effective CPUs and Memory Nodes allow to tasks */ + cpumask_var_t effective_cpus; + nodemask_t effective_mems; + + /* + * Exclusive CPUs dedicated to current cgroup (default hierarchy only) + * + * The effective_cpus of a valid partition root comes solely from its + * effective_xcpus and some of the effective_xcpus may be distributed + * to sub-partitions below & hence excluded from its effective_cpus. + * For a valid partition root, its effective_cpus have no relationship + * with cpus_allowed unless its exclusive_cpus isn't set. + * + * This value will only be set if either exclusive_cpus is set or + * when this cpuset becomes a local partition root. + */ + cpumask_var_t effective_xcpus; + + /* + * Exclusive CPUs as requested by the user (default hierarchy only) + * + * Its value is independent of cpus_allowed and designates the set of + * CPUs that can be granted to the current cpuset or its children when + * it becomes a valid partition root. The effective set of exclusive + * CPUs granted (effective_xcpus) depends on whether those exclusive + * CPUs are passed down by its ancestors and not yet taken up by + * another sibling partition root along the way. + * + * If its value isn't set, it defaults to cpus_allowed. + */ + cpumask_var_t exclusive_cpus; + + /* + * This is old Memory Nodes tasks took on. + * + * - top_cpuset.old_mems_allowed is initialized to mems_allowed. 
+ * - A new cpuset's old_mems_allowed is initialized when some + * task is moved into it. + * - old_mems_allowed is used in cpuset_migrate_mm() when we change + * cpuset.mems_allowed and have tasks' nodemask updated, and + * then old_mems_allowed is updated to mems_allowed. + */ + nodemask_t old_mems_allowed; + + struct fmeter fmeter; /* memory_pressure filter */ + + /* + * Tasks are being attached to this cpuset. Used to prevent + * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). + */ + int attach_in_progress; + + /* for custom sched domain */ + int relax_domain_level; + + /* number of valid local child partitions */ + int nr_subparts; + + /* partition root state */ + int partition_root_state; + + /* + * number of SCHED_DEADLINE tasks attached to this cpuset, so that we + * know when to rebuild associated root domain bandwidth information. + */ + int nr_deadline_tasks; + int nr_migrate_dl_tasks; + u64 sum_migrate_dl_bw; + + /* Invalid partition error code, not lock protected */ + enum prs_errcode prs_err; + + /* Handle for cpuset.cpus.partition */ + struct cgroup_file partition_file; + + /* Remote partition silbling list anchored at remote_children */ + struct list_head remote_sibling; + + /* Used to merge intersecting subsets for generate_sched_domains */ + struct uf_node node; +}; + +static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct cpuset, css) : NULL; +} + +/* Retrieve the cpuset for a task */ +static inline struct cpuset *task_cs(struct task_struct *task) +{ + return css_cs(task_css(task, cpuset_cgrp_id)); +} + +static inline struct cpuset *parent_cs(struct cpuset *cs) +{ + return css_cs(cs->css.parent); +} + +/* convenient tests for these bits */ +static inline bool is_cpuset_online(struct cpuset *cs) +{ + return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); +} + +static inline int is_cpu_exclusive(const struct cpuset *cs) +{ + return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); +} + +static inline int is_mem_exclusive(const struct cpuset *cs) +{ + return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); +} + +static inline int is_mem_hardwall(const struct cpuset *cs) +{ + return test_bit(CS_MEM_HARDWALL, &cs->flags); +} + +static inline int is_sched_load_balance(const struct cpuset *cs) +{ + return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); +} + +static inline int is_memory_migrate(const struct cpuset *cs) +{ + return test_bit(CS_MEMORY_MIGRATE, &cs->flags); +} + +static inline int is_spread_page(const struct cpuset *cs) +{ + return test_bit(CS_SPREAD_PAGE, &cs->flags); +} + +static inline int is_spread_slab(const struct cpuset *cs) +{ + return test_bit(CS_SPREAD_SLAB, &cs->flags); +} + #endif /* __CPUSET_INTERNAL_H */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 8b40df89c3c1..143d27e58482 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -22,11 +22,9 @@ * distribution for more details. */ #include "cgroup-internal.h" +#include "cpuset-internal.h" #include -#include -#include -#include #include #include #include @@ -40,13 +38,10 @@ #include #include #include -#include #include #include -#include #include #include -#include DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); @@ -58,31 +53,6 @@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); */ DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); -/* See "Frequency meter" comments, below. 
*/ - -struct fmeter { - int cnt; /* unprocessed events count */ - int val; /* most recent output value */ - time64_t time; /* clock (secs) when val computed */ - spinlock_t lock; /* guards read or write of above */ -}; - -/* - * Invalid partition error code - */ -enum prs_errcode { - PERR_NONE = 0, - PERR_INVCPUS, - PERR_INVPARENT, - PERR_NOTPART, - PERR_NOTEXCL, - PERR_NOCPUS, - PERR_HOTPLUG, - PERR_CPUSEMPTY, - PERR_HKEEPING, - PERR_ACCESS, -}; - static const char * const perr_strings[] = { [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive", [PERR_INVPARENT] = "Parent is an invalid partition root", @@ -95,117 +65,6 @@ static const char * const perr_strings[] = { [PERR_ACCESS] = "Enable partition not permitted", }; -struct cpuset { - struct cgroup_subsys_state css; - - unsigned long flags; /* "unsigned long" so bitops work */ - - /* - * On default hierarchy: - * - * The user-configured masks can only be changed by writing to - * cpuset.cpus and cpuset.mems, and won't be limited by the - * parent masks. - * - * The effective masks is the real masks that apply to the tasks - * in the cpuset. They may be changed if the configured masks are - * changed or hotplug happens. - * - * effective_mask == configured_mask & parent's effective_mask, - * and if it ends up empty, it will inherit the parent's mask. - * - * - * On legacy hierarchy: - * - * The user-configured masks are always the same with effective masks. - */ - - /* user-configured CPUs and Memory Nodes allow to tasks */ - cpumask_var_t cpus_allowed; - nodemask_t mems_allowed; - - /* effective CPUs and Memory Nodes allow to tasks */ - cpumask_var_t effective_cpus; - nodemask_t effective_mems; - - /* - * Exclusive CPUs dedicated to current cgroup (default hierarchy only) - * - * The effective_cpus of a valid partition root comes solely from its - * effective_xcpus and some of the effective_xcpus may be distributed - * to sub-partitions below & hence excluded from its effective_cpus. - * For a valid partition root, its effective_cpus have no relationship - * with cpus_allowed unless its exclusive_cpus isn't set. - * - * This value will only be set if either exclusive_cpus is set or - * when this cpuset becomes a local partition root. - */ - cpumask_var_t effective_xcpus; - - /* - * Exclusive CPUs as requested by the user (default hierarchy only) - * - * Its value is independent of cpus_allowed and designates the set of - * CPUs that can be granted to the current cpuset or its children when - * it becomes a valid partition root. The effective set of exclusive - * CPUs granted (effective_xcpus) depends on whether those exclusive - * CPUs are passed down by its ancestors and not yet taken up by - * another sibling partition root along the way. - * - * If its value isn't set, it defaults to cpus_allowed. - */ - cpumask_var_t exclusive_cpus; - - /* - * This is old Memory Nodes tasks took on. - * - * - top_cpuset.old_mems_allowed is initialized to mems_allowed. - * - A new cpuset's old_mems_allowed is initialized when some - * task is moved into it. - * - old_mems_allowed is used in cpuset_migrate_mm() when we change - * cpuset.mems_allowed and have tasks' nodemask updated, and - * then old_mems_allowed is updated to mems_allowed. - */ - nodemask_t old_mems_allowed; - - struct fmeter fmeter; /* memory_pressure filter */ - - /* - * Tasks are being attached to this cpuset. Used to prevent - * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). 
- */ - int attach_in_progress; - - /* for custom sched domain */ - int relax_domain_level; - - /* number of valid local child partitions */ - int nr_subparts; - - /* partition root state */ - int partition_root_state; - - /* - * number of SCHED_DEADLINE tasks attached to this cpuset, so that we - * know when to rebuild associated root domain bandwidth information. - */ - int nr_deadline_tasks; - int nr_migrate_dl_tasks; - u64 sum_migrate_dl_bw; - - /* Invalid partition error code, not lock protected */ - enum prs_errcode prs_err; - - /* Handle for cpuset.cpus.partition */ - struct cgroup_file partition_file; - - /* Remote partition silbling list anchored at remote_children */ - struct list_head remote_sibling; - - /* Used to merge intersecting subsets for generate_sched_domains */ - struct uf_node node; -}; - /* * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously */ @@ -280,22 +139,6 @@ struct tmpmasks { cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ }; -static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct cpuset, css) : NULL; -} - -/* Retrieve the cpuset for a task */ -static inline struct cpuset *task_cs(struct task_struct *task) -{ - return css_cs(task_css(task, cpuset_cgrp_id)); -} - -static inline struct cpuset *parent_cs(struct cpuset *cs) -{ - return css_cs(cs->css.parent); -} - void inc_dl_tasks_cs(struct task_struct *p) { struct cpuset *cs = task_cs(p); @@ -310,59 +153,6 @@ void dec_dl_tasks_cs(struct task_struct *p) cs->nr_deadline_tasks--; } -/* bits in struct cpuset flags field */ -typedef enum { - CS_ONLINE, - CS_CPU_EXCLUSIVE, - CS_MEM_EXCLUSIVE, - CS_MEM_HARDWALL, - CS_MEMORY_MIGRATE, - CS_SCHED_LOAD_BALANCE, - CS_SPREAD_PAGE, - CS_SPREAD_SLAB, -} cpuset_flagbits_t; - -/* convenient tests for these bits */ -static inline bool is_cpuset_online(struct cpuset *cs) -{ - return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); -} - -static inline int is_cpu_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); -} - -static inline int is_mem_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); -} - -static inline int is_mem_hardwall(const struct cpuset *cs) -{ - return test_bit(CS_MEM_HARDWALL, &cs->flags); -} - -static inline int is_sched_load_balance(const struct cpuset *cs) -{ - return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); -} - -static inline int is_memory_migrate(const struct cpuset *cs) -{ - return test_bit(CS_MEMORY_MIGRATE, &cs->flags); -} - -static inline int is_spread_page(const struct cpuset *cs) -{ - return test_bit(CS_SPREAD_PAGE, &cs->flags); -} - -static inline int is_spread_slab(const struct cpuset *cs) -{ - return test_bit(CS_SPREAD_SLAB, &cs->flags); -} - static inline int is_partition_valid(const struct cpuset *cs) { return cs->partition_root_state > 0; @@ -3535,30 +3325,6 @@ out: mutex_unlock(&cpuset_mutex); } -/* The various types of files and directories in a cpuset file system */ - -typedef enum { - FILE_MEMORY_MIGRATE, - FILE_CPULIST, - FILE_MEMLIST, - FILE_EFFECTIVE_CPULIST, - FILE_EFFECTIVE_MEMLIST, - FILE_SUBPARTS_CPULIST, - FILE_EXCLUSIVE_CPULIST, - FILE_EFFECTIVE_XCPULIST, - FILE_ISOLATED_CPULIST, - FILE_CPU_EXCLUSIVE, - FILE_MEM_EXCLUSIVE, - FILE_MEM_HARDWALL, - FILE_SCHED_LOAD_BALANCE, - FILE_PARTITION_ROOT, - FILE_SCHED_RELAX_DOMAIN_LEVEL, - FILE_MEMORY_PRESSURE_ENABLED, - FILE_MEMORY_PRESSURE, - FILE_SPREAD_PAGE, - FILE_SPREAD_SLAB, -} cpuset_filetype_t; - static int 
cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { From 49434094efbbe360dd7fcf934fef239a01a95bb7 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 30 Aug 2024 10:02:20 +0000 Subject: [PATCH 21/34] cgroup/cpuset: move memory_pressure to cpuset-v1.c Collection of memory_pressure can be enabled by writing 1 to the cpuset file 'memory_pressure_enabled', which is only for cpuset-v1. Therefore, move the corresponding code to cpuset-v1.c. Currently, the 'fmeter_init' and 'fmeter_getrate' functions are called at cpuset.c, so expose them to cpuset.c. Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 7 ++ kernel/cgroup/cpuset-v1.c | 134 ++++++++++++++++++++++++++++++++ kernel/cgroup/cpuset.c | 134 -------------------------------- 3 files changed, 141 insertions(+), 134 deletions(-) diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index ffea3eefebdf..7911c86bf012 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -238,4 +238,11 @@ static inline int is_spread_slab(const struct cpuset *cs) return test_bit(CS_SPREAD_SLAB, &cs->flags); } +/* + * cpuset-v1.c + */ + +void fmeter_init(struct fmeter *fmp); +int fmeter_getrate(struct fmeter *fmp); + #endif /* __CPUSET_INTERNAL_H */ diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index bdec4b196986..e7d137ff57cf 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -1,3 +1,137 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "cpuset-internal.h" + +/* + * Frequency meter - How fast is some event occurring? + * + * These routines manage a digitally filtered, constant time based, + * event frequency meter. There are four routines: + * fmeter_init() - initialize a frequency meter. + * fmeter_markevent() - called each time the event happens. + * fmeter_getrate() - returns the recent rate of such events. + * fmeter_update() - internal routine used to update fmeter. + * + * A common data structure is passed to each of these routines, + * which is used to keep track of the state required to manage the + * frequency meter and its digital filter. + * + * The filter works on the number of events marked per unit time. + * The filter is single-pole low-pass recursive (IIR). The time unit + * is 1 second. Arithmetic is done using 32-bit integers scaled to + * simulate 3 decimal digits of precision (multiplied by 1000). + * + * With an FM_COEF of 933, and a time base of 1 second, the filter + * has a half-life of 10 seconds, meaning that if the events quit + * happening, then the rate returned from the fmeter_getrate() + * will be cut in half each 10 seconds, until it converges to zero. + * + * It is not worth doing a real infinitely recursive filter. If more + * than FM_MAXTICKS ticks have elapsed since the last filter event, + * just compute FM_MAXTICKS ticks worth, by which point the level + * will be stable. + * + * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid + * arithmetic overflow in the fmeter_update() routine. + * + * Given the simple 32 bit integer arithmetic used, this meter works + * best for reporting rates between one per millisecond (msec) and + * one per 32 (approx) seconds. At constant rates faster than one + * per msec it maxes out at values just under 1,000,000. At constant + * rates between one per msec, and one per second it will stabilize + * to a value N*1000, where N is the rate of events per second. 
+ * At constant rates between one per second and one per 32 seconds, + * it will be choppy, moving up on the seconds that have an event, + * and then decaying until the next event. At rates slower than + * about one in 32 seconds, it decays all the way back to zero between + * each event. + */ + +#define FM_COEF 933 /* coefficient for half-life of 10 secs */ +#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ +#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ +#define FM_SCALE 1000 /* faux fixed point scale */ + +/* Initialize a frequency meter */ +void fmeter_init(struct fmeter *fmp) +{ + fmp->cnt = 0; + fmp->val = 0; + fmp->time = 0; + spin_lock_init(&fmp->lock); +} + +/* Internal meter update - process cnt events and update value */ +static void fmeter_update(struct fmeter *fmp) +{ + time64_t now; + u32 ticks; + + now = ktime_get_seconds(); + ticks = now - fmp->time; + + if (ticks == 0) + return; + + ticks = min(FM_MAXTICKS, ticks); + while (ticks-- > 0) + fmp->val = (FM_COEF * fmp->val) / FM_SCALE; + fmp->time = now; + + fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; + fmp->cnt = 0; +} + +/* Process any previous ticks, then bump cnt by one (times scale). */ +static void fmeter_markevent(struct fmeter *fmp) +{ + spin_lock(&fmp->lock); + fmeter_update(fmp); + fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); + spin_unlock(&fmp->lock); +} + +/* Process any previous ticks, then return current value. */ +int fmeter_getrate(struct fmeter *fmp) +{ + int val; + + spin_lock(&fmp->lock); + fmeter_update(fmp); + val = fmp->val; + spin_unlock(&fmp->lock); + return val; +} + +/* + * Collection of memory_pressure is suppressed unless + * this flag is enabled by writing "1" to the special + * cpuset file 'memory_pressure_enabled' in the root cpuset. + */ + +int cpuset_memory_pressure_enabled __read_mostly; + +/* + * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. + * + * Keep a running average of the rate of synchronous (direct) + * page reclaim efforts initiated by tasks in each cpuset. + * + * This represents the rate at which some task in the cpuset + * ran low on memory on all nodes it was allowed to use, and + * had to enter the kernels page reclaim code in an effort to + * create more free memory by tossing clean pages or swapping + * or writing dirty pages. + * + * Display to user space in the per-cpuset read-only file + * "memory_pressure". Value displayed is an integer + * representing the recent rate of entry into the synchronous + * (direct) page reclaim by any task attached to the cpuset. + */ + +void __cpuset_memory_pressure_bump(void) +{ + rcu_read_lock(); + fmeter_markevent(&task_cs(current)->fmeter); + rcu_read_unlock(); +} diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 143d27e58482..c5026a296ede 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2996,107 +2996,6 @@ out: return 0; } -/* - * Frequency meter - How fast is some event occurring? - * - * These routines manage a digitally filtered, constant time based, - * event frequency meter. There are four routines: - * fmeter_init() - initialize a frequency meter. - * fmeter_markevent() - called each time the event happens. - * fmeter_getrate() - returns the recent rate of such events. - * fmeter_update() - internal routine used to update fmeter. - * - * A common data structure is passed to each of these routines, - * which is used to keep track of the state required to manage the - * frequency meter and its digital filter. 
- * - * The filter works on the number of events marked per unit time. - * The filter is single-pole low-pass recursive (IIR). The time unit - * is 1 second. Arithmetic is done using 32-bit integers scaled to - * simulate 3 decimal digits of precision (multiplied by 1000). - * - * With an FM_COEF of 933, and a time base of 1 second, the filter - * has a half-life of 10 seconds, meaning that if the events quit - * happening, then the rate returned from the fmeter_getrate() - * will be cut in half each 10 seconds, until it converges to zero. - * - * It is not worth doing a real infinitely recursive filter. If more - * than FM_MAXTICKS ticks have elapsed since the last filter event, - * just compute FM_MAXTICKS ticks worth, by which point the level - * will be stable. - * - * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid - * arithmetic overflow in the fmeter_update() routine. - * - * Given the simple 32 bit integer arithmetic used, this meter works - * best for reporting rates between one per millisecond (msec) and - * one per 32 (approx) seconds. At constant rates faster than one - * per msec it maxes out at values just under 1,000,000. At constant - * rates between one per msec, and one per second it will stabilize - * to a value N*1000, where N is the rate of events per second. - * At constant rates between one per second and one per 32 seconds, - * it will be choppy, moving up on the seconds that have an event, - * and then decaying until the next event. At rates slower than - * about one in 32 seconds, it decays all the way back to zero between - * each event. - */ - -#define FM_COEF 933 /* coefficient for half-life of 10 secs */ -#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ -#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ -#define FM_SCALE 1000 /* faux fixed point scale */ - -/* Initialize a frequency meter */ -static void fmeter_init(struct fmeter *fmp) -{ - fmp->cnt = 0; - fmp->val = 0; - fmp->time = 0; - spin_lock_init(&fmp->lock); -} - -/* Internal meter update - process cnt events and update value */ -static void fmeter_update(struct fmeter *fmp) -{ - time64_t now; - u32 ticks; - - now = ktime_get_seconds(); - ticks = now - fmp->time; - - if (ticks == 0) - return; - - ticks = min(FM_MAXTICKS, ticks); - while (ticks-- > 0) - fmp->val = (FM_COEF * fmp->val) / FM_SCALE; - fmp->time = now; - - fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; - fmp->cnt = 0; -} - -/* Process any previous ticks, then bump cnt by one (times scale). */ -static void fmeter_markevent(struct fmeter *fmp) -{ - spin_lock(&fmp->lock); - fmeter_update(fmp); - fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); - spin_unlock(&fmp->lock); -} - -/* Process any previous ticks, then return current value. */ -static int fmeter_getrate(struct fmeter *fmp) -{ - int val; - - spin_lock(&fmp->lock); - fmeter_update(fmp); - val = fmp->val; - spin_unlock(&fmp->lock); - return val; -} - static struct cpuset *cpuset_attach_old_cs; /* @@ -4793,39 +4692,6 @@ void cpuset_print_current_mems_allowed(void) rcu_read_unlock(); } -/* - * Collection of memory_pressure is suppressed unless - * this flag is enabled by writing "1" to the special - * cpuset file 'memory_pressure_enabled' in the root cpuset. - */ - -int cpuset_memory_pressure_enabled __read_mostly; - -/* - * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. - * - * Keep a running average of the rate of synchronous (direct) - * page reclaim efforts initiated by tasks in each cpuset. 
- * - * This represents the rate at which some task in the cpuset - * ran low on memory on all nodes it was allowed to use, and - * had to enter the kernels page reclaim code in an effort to - * create more free memory by tossing clean pages or swapping - * or writing dirty pages. - * - * Display to user space in the per-cpuset read-only file - * "memory_pressure". Value displayed is an integer - * representing the recent rate of entry into the synchronous - * (direct) page reclaim by any task attached to the cpuset. - */ - -void __cpuset_memory_pressure_bump(void) -{ - rcu_read_lock(); - fmeter_markevent(&task_cs(current)->fmeter); - rcu_read_unlock(); -} - #ifdef CONFIG_PROC_PID_CPUSET /* * proc_cpuset_show() From 047b830974488f39e320a3f7403890762e527d27 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 30 Aug 2024 10:02:21 +0000 Subject: [PATCH 22/34] cgroup/cpuset: move relax_domain_level to cpuset-v1.c Setting the domain level is not supported in cpuset v2, so move the corresponding code into cpuset-v1.c. The 'cpuset_write_s64' and 'cpuset_read_s64' functions are only used for setting the domain level, so move them to cpuset-v1.c. For now, they are exposed to cpuset.c; after the cpuset legacy interface files are moved to cpuset-v1.c, they can be made static. The 'rebuild_sched_domains_locked' function is exposed to cpuset-v1.c. The change from the original code is that the 'cpuset_lock' and 'cpuset_unlock' functions are used to lock and unlock cpuset_mutex. Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 6 +++- kernel/cgroup/cpuset-v1.c | 59 +++++++++++++++++++++++++++++++ kernel/cgroup/cpuset.c | 62 ++------------------------------- 3 files changed, 66 insertions(+), 61 deletions(-) diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 7911c86bf012..1058a45f05ec 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -238,11 +238,15 @@ static inline int is_spread_slab(const struct cpuset *cs) return test_bit(CS_SPREAD_SLAB, &cs->flags); } +void rebuild_sched_domains_locked(void); + /* * cpuset-v1.c */ - void fmeter_init(struct fmeter *fmp); int fmeter_getrate(struct fmeter *fmp); +int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val); +s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft); #endif /* __CPUSET_INTERNAL_H */ diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index e7d137ff57cf..c7b321029cbb 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -135,3 +135,62 @@ void __cpuset_memory_pressure_bump(void) fmeter_markevent(&task_cs(current)->fmeter); rcu_read_unlock(); } + +static int update_relax_domain_level(struct cpuset *cs, s64 val) +{ +#ifdef CONFIG_SMP + if (val < -1 || val > sched_domain_level_max + 1) + return -EINVAL; +#endif + + if (val != cs->relax_domain_level) { + cs->relax_domain_level = val; + if (!cpumask_empty(cs->cpus_allowed) && + is_sched_load_balance(cs)) + rebuild_sched_domains_locked(); + } + + return 0; +} + +int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + int retval = -ENODEV; + + cpus_read_lock(); + cpuset_lock(); + if (!is_cpuset_online(cs)) + goto out_unlock; + + switch (type) { + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + retval = update_relax_domain_level(cs, val); + break; + default: + retval = -EINVAL; + break; + } +out_unlock: + cpuset_unlock(); + cpus_read_unlock(); + return retval; +} + +s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + + switch (type) { + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + return cs->relax_domain_level; + default: + BUG(); + } + + /* Unreachable but makes gcc happy */ + return 0; +} diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c5026a296ede..c4d165fc0f70 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1075,7 +1075,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], * * Call with cpuset_mutex held. Takes cpus_read_lock(). */ -static void rebuild_sched_domains_locked(void) +void rebuild_sched_domains_locked(void) { struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; @@ -1127,7 +1127,7 @@ static void rebuild_sched_domains_locked(void) partition_and_rebuild_sched_domains(ndoms, doms, attr); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_locked(void) +void rebuild_sched_domains_locked(void) { } #endif /* CONFIG_SMP */ @@ -2794,23 +2794,6 @@ bool current_cpuset_is_being_rebound(void) return ret; } -static int update_relax_domain_level(struct cpuset *cs, s64 val) -{ -#ifdef CONFIG_SMP - if (val < -1 || val > sched_domain_level_max + 1) - return -EINVAL; -#endif - - if (val != cs->relax_domain_level) { - cs->relax_domain_level = val; - if (!cpumask_empty(cs->cpus_allowed) && - is_sched_load_balance(cs)) - rebuild_sched_domains_locked(); - } - - return 0; -} - /** * update_tasks_flags - update the spread flags of tasks in the cpuset. * @cs: the cpuset in which each task's spread flags needs to be changed @@ -3273,32 +3256,6 @@ out_unlock: return retval; } -static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, - s64 val) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - int retval = -ENODEV; - - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) - goto out_unlock; - - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - retval = update_relax_domain_level(cs, val); - break; - default: - retval = -EINVAL; - break; - } -out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - return retval; -} - /* * Common handling for a write to a "cpus" or "mems" file. */ @@ -3449,21 +3406,6 @@ static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) return 0; } -static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - return cs->relax_domain_level; - default: - BUG(); - } - - /* Unreachable but makes gcc happy */ - return 0; -} - static int sched_partition_show(struct seq_file *seq, void *v) { struct cpuset *cs = css_cs(seq_css(seq)); From 90eec9548da6e8aa2eb9f13396a0b617856a38e6 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 30 Aug 2024 10:02:22 +0000 Subject: [PATCH 23/34] cgroup/cpuset: move memory_spread to cpuset-v1.c 'memory_spread' is only set in cpuset v1, so move the corresponding code into cpuset-v1.c. Currently, 'cpuset_update_task_spread_flags' and 'update_tasks_flags' are exposed to cpuset.c.
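(Why these two helpers still need to be non-static: cpuset.c calls them from attach paths shared by v1 and v2. A condensed and approximate sketch of one such caller, not part of this diff:

	static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
	{
		/* cpumask setup elided */
		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
		cpuset_update_task_spread_flags(cs, task);
	}

Since cpuset_update_task_spread_flags() returns early on the default hierarchy via cgroup_subsys_on_dfl(), the shared caller needs no v1/v2 branch of its own.)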
Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 3 +++ kernel/cgroup/cpuset-v1.c | 42 +++++++++++++++++++++++++++++++++ kernel/cgroup/cpuset.c | 42 --------------------------------- 3 files changed, 45 insertions(+), 42 deletions(-) diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 1058a45f05ec..02c4b0c74fa9 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -248,5 +248,8 @@ int fmeter_getrate(struct fmeter *fmp); int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 val); s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft); +void cpuset_update_task_spread_flags(struct cpuset *cs, + struct task_struct *tsk); +void update_tasks_flags(struct cpuset *cs); #endif /* __CPUSET_INTERNAL_H */ diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index c7b321029cbb..ca973b4de38a 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -194,3 +194,45 @@ s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) /* Unreachable but makes gcc happy */ return 0; } + +/* + * update task's spread flag if cpuset's page/slab spread flag is set + * + * Call with callback_lock or cpuset_mutex held. The check can be skipped + * if on default hierarchy. + */ +void cpuset_update_task_spread_flags(struct cpuset *cs, + struct task_struct *tsk) +{ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + return; + + if (is_spread_page(cs)) + task_set_spread_page(tsk); + else + task_clear_spread_page(tsk); + + if (is_spread_slab(cs)) + task_set_spread_slab(tsk); + else + task_clear_spread_slab(tsk); +} + +/** + * update_tasks_flags - update the spread flags of tasks in the cpuset. + * @cs: the cpuset in which each task's spread flags needs to be changed + * + * Iterate through each task of @cs updating its spread flags. As this + * function is called with cpuset_mutex held, cpuset membership stays + * stable. + */ +void update_tasks_flags(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, 0, &it); + while ((task = css_task_iter_next(&it))) + cpuset_update_task_spread_flags(cs, task); + css_task_iter_end(&it); +} diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c4d165fc0f70..c77f35fb8347 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -407,29 +407,6 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); } -/* - * update task's spread flag if cpuset's page/slab spread flag is set - * - * Call with callback_lock or cpuset_mutex held. The check can be skipped - * if on default hierarchy. - */ -static void cpuset_update_task_spread_flags(struct cpuset *cs, - struct task_struct *tsk) -{ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) - return; - - if (is_spread_page(cs)) - task_set_spread_page(tsk); - else - task_clear_spread_page(tsk); - - if (is_spread_slab(cs)) - task_set_spread_slab(tsk); - else - task_clear_spread_slab(tsk); -} - /* * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? * @@ -2794,25 +2771,6 @@ bool current_cpuset_is_being_rebound(void) return ret; } -/** - * update_tasks_flags - update the spread flags of tasks in the cpuset. - * @cs: the cpuset in which each task's spread flags needs to be changed - * - * Iterate through each task of @cs updating its spread flags. 
As this - * function is called with cpuset_mutex held, cpuset membership stays - * stable. - */ -static void update_tasks_flags(struct cpuset *cs) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&cs->css, 0, &it); - while ((task = css_task_iter_next(&it))) - cpuset_update_task_spread_flags(cs, task); - css_task_iter_end(&it); -} - /* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (see cpuset_flagbits_t) From 530020f28f55238cfcc9d9af4e90bc06327f6542 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 30 Aug 2024 10:02:23 +0000 Subject: [PATCH 24/34] cgroup/cpuset: add callback_lock helper To modify a cpuset, both cpuset_mutex and callback_lock are needed. Add helpers for cpuset-v1 to take callback_lock. Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 2 ++ kernel/cgroup/cpuset.c | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 02c4b0c74fa9..9a60dd6681e4 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -239,6 +239,8 @@ static inline int is_spread_slab(const struct cpuset *cs) } void rebuild_sched_domains_locked(void); +void callback_lock_irq(void); +void callback_unlock_irq(void); /* * cpuset-v1.c diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c77f35fb8347..8a9a7fe1ec1e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -275,6 +275,16 @@ void cpuset_unlock(void) static DEFINE_SPINLOCK(callback_lock); +void callback_lock_irq(void) +{ + spin_lock_irq(&callback_lock); +} + +void callback_unlock_irq(void) +{ + spin_unlock_irq(&callback_lock); +} + static struct workqueue_struct *cpuset_migrate_mm_wq; static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); From 23ca5237e3d10c899de7a8311d13e38ed7d2f2d5 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 30 Aug 2024 10:02:24 +0000 Subject: [PATCH 25/34] cgroup/cpuset: move legacy hotplug update to cpuset-v1.c There are some differences in the hotplug update between cpuset v1 and cpuset v2. Move the legacy code to cpuset-v1.c. 'update_tasks_cpumask' and 'update_tasks_nodemask' are used in both cpuset v1 and cpuset v2, so declare them in cpuset-internal.h. The change from the original code is that the callback_lock helpers are used to lock and unlock callback_lock.
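(For context, a condensed sketch of the call site, which is not part of this diff: the hotplug path picks the v1 or v2 handler in cpuset_hotplug_update_tasks(), which is why only the legacy half can move out of cpuset.c:

	if (is_in_v2_mode())
		hotplug_update_tasks(cs, &new_cpus, &new_mems,
				     cpus_updated, mems_updated);
	else
		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
					    cpus_updated, mems_updated);

)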
Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 5 ++ kernel/cgroup/cpuset-v1.c | 91 +++++++++++++++++++++++++++++++ kernel/cgroup/cpuset.c | 96 +-------------------------------- 3 files changed, 98 insertions(+), 94 deletions(-) diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 9a60dd6681e4..7cd30ad809d5 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -241,6 +241,8 @@ static inline int is_spread_slab(const struct cpuset *cs) void rebuild_sched_domains_locked(void); void callback_lock_irq(void); void callback_unlock_irq(void); +void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus); +void update_tasks_nodemask(struct cpuset *cs); /* * cpuset-v1.c @@ -253,5 +255,8 @@ s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft); void cpuset_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk); void update_tasks_flags(struct cpuset *cs); +void hotplug_update_tasks_legacy(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated); #endif /* __CPUSET_INTERNAL_H */ diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index ca973b4de38a..ebc71c5d2568 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -2,6 +2,14 @@ #include "cpuset-internal.h" +/* + * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously + */ +struct cpuset_remove_tasks_struct { + struct work_struct work; + struct cpuset *cs; +}; + /* * Frequency meter - How fast is some event occurring? * @@ -236,3 +244,86 @@ void update_tasks_flags(struct cpuset *cs) cpuset_update_task_spread_flags(cs, task); css_task_iter_end(&it); } + +/* + * If CPU and/or memory hotplug handlers, below, unplug any CPUs + * or memory nodes, we need to walk over the cpuset hierarchy, + * removing that CPU or node from all cpusets. If this removes the + * last CPU or node from a cpuset, then move the tasks in the empty + * cpuset to its next-highest non-empty parent. + */ +static void remove_tasks_in_empty_cpuset(struct cpuset *cs) +{ + struct cpuset *parent; + + /* + * Find its next-highest non-empty parent, (top cpuset + * has online cpus, so can't be empty). + */ + parent = parent_cs(cs); + while (cpumask_empty(parent->cpus_allowed) || + nodes_empty(parent->mems_allowed)) + parent = parent_cs(parent); + + if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { + pr_err("cpuset: failed to transfer tasks out of empty cpuset "); + pr_cont_cgroup_name(cs->css.cgroup); + pr_cont("\n"); + } +} + +static void cpuset_migrate_tasks_workfn(struct work_struct *work) +{ + struct cpuset_remove_tasks_struct *s; + + s = container_of(work, struct cpuset_remove_tasks_struct, work); + remove_tasks_in_empty_cpuset(s->cs); + css_put(&s->cs->css); + kfree(s); +} + +void hotplug_update_tasks_legacy(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated) +{ + bool is_empty; + + callback_lock_irq(); + cpumask_copy(cs->cpus_allowed, new_cpus); + cpumask_copy(cs->effective_cpus, new_cpus); + cs->mems_allowed = *new_mems; + cs->effective_mems = *new_mems; + callback_unlock_irq(); + + /* + * Don't call update_tasks_cpumask() if the cpuset becomes empty, + * as the tasks will be migrated to an ancestor. 
+ */ + if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) + update_tasks_cpumask(cs, new_cpus); + if (mems_updated && !nodes_empty(cs->mems_allowed)) + update_tasks_nodemask(cs); + + is_empty = cpumask_empty(cs->cpus_allowed) || + nodes_empty(cs->mems_allowed); + + /* + * Move tasks to the nearest ancestor with execution resources, + * This is full cgroup operation which will also call back into + * cpuset. Execute it asynchronously using workqueue. + */ + if (is_empty && cs->css.cgroup->nr_populated_csets && + css_tryget_online(&cs->css)) { + struct cpuset_remove_tasks_struct *s; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (WARN_ON_ONCE(!s)) { + css_put(&cs->css); + return; + } + + s->cs = cs; + INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); + schedule_work(&s->work); + } +} diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 8a9a7fe1ec1e..1270c7913af9 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -65,14 +65,6 @@ static const char * const perr_strings[] = { [PERR_ACCESS] = "Enable partition not permitted", }; -/* - * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously - */ -struct cpuset_remove_tasks_struct { - struct work_struct work; - struct cpuset *cs; -}; - /* * Exclusive CPUs distributed out to sub-partitions of top_cpuset */ @@ -1144,7 +1136,7 @@ void rebuild_sched_domains(void) * is used instead of effective_cpus to make sure all offline CPUs are also * included as hotplug code won't update cpumasks for tasks in top_cpuset. */ -static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) +void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { struct css_task_iter it; struct task_struct *task; @@ -2597,7 +2589,7 @@ static void *cpuset_being_rebound; * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. */ -static void update_tasks_nodemask(struct cpuset *cs) +void update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ struct css_task_iter it; @@ -3936,90 +3928,6 @@ int __init cpuset_init(void) return 0; } -/* - * If CPU and/or memory hotplug handlers, below, unplug any CPUs - * or memory nodes, we need to walk over the cpuset hierarchy, - * removing that CPU or node from all cpusets. If this removes the - * last CPU or node from a cpuset, then move the tasks in the empty - * cpuset to its next-highest non-empty parent. - */ -static void remove_tasks_in_empty_cpuset(struct cpuset *cs) -{ - struct cpuset *parent; - - /* - * Find its next-highest non-empty parent, (top cpuset - * has online cpus, so can't be empty). 
- */ - parent = parent_cs(cs); - while (cpumask_empty(parent->cpus_allowed) || - nodes_empty(parent->mems_allowed)) - parent = parent_cs(parent); - - if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { - pr_err("cpuset: failed to transfer tasks out of empty cpuset "); - pr_cont_cgroup_name(cs->css.cgroup); - pr_cont("\n"); - } -} - -static void cpuset_migrate_tasks_workfn(struct work_struct *work) -{ - struct cpuset_remove_tasks_struct *s; - - s = container_of(work, struct cpuset_remove_tasks_struct, work); - remove_tasks_in_empty_cpuset(s->cs); - css_put(&s->cs->css); - kfree(s); -} - -static void -hotplug_update_tasks_legacy(struct cpuset *cs, - struct cpumask *new_cpus, nodemask_t *new_mems, - bool cpus_updated, bool mems_updated) -{ - bool is_empty; - - spin_lock_irq(&callback_lock); - cpumask_copy(cs->cpus_allowed, new_cpus); - cpumask_copy(cs->effective_cpus, new_cpus); - cs->mems_allowed = *new_mems; - cs->effective_mems = *new_mems; - spin_unlock_irq(&callback_lock); - - /* - * Don't call update_tasks_cpumask() if the cpuset becomes empty, - * as the tasks will be migrated to an ancestor. - */ - if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) - update_tasks_cpumask(cs, new_cpus); - if (mems_updated && !nodes_empty(cs->mems_allowed)) - update_tasks_nodemask(cs); - - is_empty = cpumask_empty(cs->cpus_allowed) || - nodes_empty(cs->mems_allowed); - - /* - * Move tasks to the nearest ancestor with execution resources, - * This is full cgroup operation which will also call back into - * cpuset. Execute it asynchronously using workqueue. - */ - if (is_empty && cs->css.cgroup->nr_populated_csets && - css_tryget_online(&cs->css)) { - struct cpuset_remove_tasks_struct *s; - - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (WARN_ON_ONCE(!s)) { - css_put(&cs->css); - return; - } - - s->cs = cs; - INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); - schedule_work(&s->work); - } -} - static void hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, From be126b5b1bd893e514920bbe7a2e00ffabcc9ce0 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 30 Aug 2024 10:02:26 +0000 Subject: [PATCH 26/34] cgroup/cpuset: move validate_change_legacy to cpuset-v1.c The validate_change_legacy function is used only for v1, so move it to cpuset-v1.c. The two macros 'cpuset_for_each_child' and 'cpuset_for_each_descendant_pre' are common to v1 and v2, so move them to cpuset-internal.h. Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 29 +++++++++++++ kernel/cgroup/cpuset-v1.c | 45 ++++++++++++++++++++ kernel/cgroup/cpuset.c | 73 --------------------------------- 3 files changed, 74 insertions(+), 73 deletions(-) diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 7cd30ad809d5..07551ff0812e 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -238,6 +238,34 @@ static inline int is_spread_slab(const struct cpuset *cs) return test_bit(CS_SPREAD_SLAB, &cs->flags); } +/** + * cpuset_for_each_child - traverse online children of a cpuset + * @child_cs: loop cursor pointing to the current child + * @pos_css: used for iteration + * @parent_cs: target cpuset to walk children of + * + * Walk @child_cs through the online children of @parent_cs. Must be used + * with RCU read locked.
+ */ +#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ + css_for_each_child((pos_css), &(parent_cs)->css) \ + if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) + +/** + * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants + * @des_cs: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @root_cs: target cpuset to walk ancestor of + * + * Walk @des_cs through the online descendants of @root_cs. Must be used + * with RCU read locked. The caller may modify @pos_css by calling + * css_rightmost_descendant() to skip subtree. @root_cs is included in the + * iteration and the first node to be visited. + */ +#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ + css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ + if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) + void rebuild_sched_domains_locked(void); void callback_lock_irq(void); void callback_unlock_irq(void); @@ -258,5 +286,6 @@ void update_tasks_flags(struct cpuset *cs); void hotplug_update_tasks_legacy(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated); +int validate_change_legacy(struct cpuset *cur, struct cpuset *trial); #endif /* __CPUSET_INTERNAL_H */ diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index ebc71c5d2568..c9e6c5590117 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -327,3 +327,48 @@ void hotplug_update_tasks_legacy(struct cpuset *cs, schedule_work(&s->work); } } + +/* + * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? + * + * One cpuset is a subset of another if all its allowed CPUs and + * Memory Nodes are a subset of the other, and its exclusive flags + * are only set if the other's are set. Call holding cpuset_mutex. + */ + +static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) +{ + return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && + nodes_subset(p->mems_allowed, q->mems_allowed) && + is_cpu_exclusive(p) <= is_cpu_exclusive(q) && + is_mem_exclusive(p) <= is_mem_exclusive(q); +} + +/* + * validate_change_legacy() - Validate conditions specific to legacy (v1) + * behavior. + */ +int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) +{ + struct cgroup_subsys_state *css; + struct cpuset *c, *par; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* Each of our child cpusets must be a subset of us */ + ret = -EBUSY; + cpuset_for_each_child(c, css, cur) + if (!is_cpuset_subset(c, trial)) + goto out; + + /* On legacy hierarchy, we must be a subset of our parent cpuset. */ + ret = -EACCES; + par = parent_cs(cur); + if (par && !is_cpuset_subset(trial, par)) + goto out; + + ret = 0; +out: + return ret; +} diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 1270c7913af9..175eaa491f21 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -186,34 +186,6 @@ static struct cpuset top_cpuset = { .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), }; -/** - * cpuset_for_each_child - traverse online children of a cpuset - * @child_cs: loop cursor pointing to the current child - * @pos_css: used for iteration - * @parent_cs: target cpuset to walk children of - * - * Walk @child_cs through the online children of @parent_cs. Must be used - * with RCU read locked. 
- */
-#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
- css_for_each_child((pos_css), &(parent_cs)->css) \
- if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
-
-/**
- * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
- * @des_cs: loop cursor pointing to the current descendant
- * @pos_css: used for iteration
- * @root_cs: target cpuset to walk ancestor of
- *
- * Walk @des_cs through the online descendants of @root_cs. Must be used
- * with RCU read locked. The caller may modify @pos_css by calling
- * css_rightmost_descendant() to skip subtree. @root_cs is included in the
- * iteration and the first node to be visited.
- */
-#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
- css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
- if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
-
 /*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock. We also require taking task_lock() when dereferencing a
@@ -409,22 +381,6 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
 }

-/*
- * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
- *
- * One cpuset is a subset of another if all its allowed CPUs and
- * Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cpuset_mutex.
- */
-
-static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
-{
- return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
- nodes_subset(p->mems_allowed, q->mems_allowed) &&
- is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
- is_mem_exclusive(p) <= is_mem_exclusive(q);
-}
-
 /**
 * alloc_cpumasks - allocate three cpumasks for cpuset
 * @cs: the cpuset that have cpumasks to be allocated.
@@ -555,35 +511,6 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
 return true;
 }

-/*
- * validate_change_legacy() - Validate conditions specific to legacy (v1)
- * behavior.
- */
-static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
-{
- struct cgroup_subsys_state *css;
- struct cpuset *c, *par;
- int ret;
-
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- /* Each of our child cpusets must be a subset of us */
- ret = -EBUSY;
- cpuset_for_each_child(c, css, cur)
- if (!is_cpuset_subset(c, trial))
- goto out;
-
- /* On legacy hierarchy, we must be a subset of our parent cpuset. */
- ret = -EACCES;
- par = parent_cs(cur);
- if (par && !is_cpuset_subset(trial, par))
- goto out;
-
- ret = 0;
-out:
- return ret;
-}
-
 /*
 * validate_change() - Used to validate that any proposed cpuset change
 * follows the structural rules for cpusets.

From b0ced9d378d4995f0b84463fa11ce13908dd5f21 Mon Sep 17 00:00:00 2001
From: Chen Ridong
Date: Fri, 30 Aug 2024 10:02:26 +0000
Subject: [PATCH 27/34] cgroup/cpuset: move v1 interfaces to cpuset-v1.c

Move the legacy cpuset controller interface files and the corresponding
code into cpuset-v1.c. 'update_flag', 'cpuset_write_resmask' and
'cpuset_common_seq_show' are also used by v1, so declare them in
cpuset-internal.h.

'cpuset_write_s64', 'cpuset_read_s64' and 'fmeter_getrate' are now only
used in cpuset-v1.c, so make them static.
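
For context, both file sets stay attached to the same subsystem
descriptor; a minimal sketch of the resulting shape (field names as
they appear elsewhere in this series, all other fields omitted):

	struct cgroup_subsys cpuset_cgrp_subsys = {
		/* callbacks and remaining fields omitted in this sketch */
		.legacy_cftypes	= legacy_files,	/* v1 files, now defined in cpuset-v1.c */
		.dfl_cftypes	= dfl_files,	/* v2 (default hierarchy) files, kept in cpuset.c */
	};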
Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 9 +- kernel/cgroup/cpuset-v1.c | 194 ++++++++++++++++++++++++++++++- kernel/cgroup/cpuset.c | 195 +------------------------------- 3 files changed, 199 insertions(+), 199 deletions(-) diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 07551ff0812e..a6c71c86e58d 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -271,15 +271,16 @@ void callback_lock_irq(void); void callback_unlock_irq(void); void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus); void update_tasks_nodemask(struct cpuset *cs); +int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on); +ssize_t cpuset_write_resmask(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int cpuset_common_seq_show(struct seq_file *sf, void *v); /* * cpuset-v1.c */ +extern struct cftype legacy_files[]; void fmeter_init(struct fmeter *fmp); -int fmeter_getrate(struct fmeter *fmp); -int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, - s64 val); -s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft); void cpuset_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk); void update_tasks_flags(struct cpuset *cs); diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index c9e6c5590117..0ccc440c468a 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -100,7 +100,7 @@ static void fmeter_markevent(struct fmeter *fmp) } /* Process any previous ticks, then return current value. */ -int fmeter_getrate(struct fmeter *fmp) +static int fmeter_getrate(struct fmeter *fmp) { int val; @@ -161,7 +161,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) return 0; } -int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, +static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 val) { struct cpuset *cs = css_cs(css); @@ -187,7 +187,7 @@ out_unlock: return retval; } -s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) +static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; @@ -372,3 +372,191 @@ int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) out: return ret; } + +static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + + switch (type) { + case FILE_CPU_EXCLUSIVE: + return is_cpu_exclusive(cs); + case FILE_MEM_EXCLUSIVE: + return is_mem_exclusive(cs); + case FILE_MEM_HARDWALL: + return is_mem_hardwall(cs); + case FILE_SCHED_LOAD_BALANCE: + return is_sched_load_balance(cs); + case FILE_MEMORY_MIGRATE: + return is_memory_migrate(cs); + case FILE_MEMORY_PRESSURE_ENABLED: + return cpuset_memory_pressure_enabled; + case FILE_MEMORY_PRESSURE: + return fmeter_getrate(&cs->fmeter); + case FILE_SPREAD_PAGE: + return is_spread_page(cs); + case FILE_SPREAD_SLAB: + return is_spread_slab(cs); + default: + BUG(); + } + + /* Unreachable but makes gcc happy */ + return 0; +} + +static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + int retval = 0; + + cpus_read_lock(); + cpuset_lock(); + if (!is_cpuset_online(cs)) { + retval = -ENODEV; + goto 
out_unlock; + } + + switch (type) { + case FILE_CPU_EXCLUSIVE: + retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); + break; + case FILE_MEM_EXCLUSIVE: + retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); + break; + case FILE_MEM_HARDWALL: + retval = update_flag(CS_MEM_HARDWALL, cs, val); + break; + case FILE_SCHED_LOAD_BALANCE: + retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); + break; + case FILE_MEMORY_MIGRATE: + retval = update_flag(CS_MEMORY_MIGRATE, cs, val); + break; + case FILE_MEMORY_PRESSURE_ENABLED: + cpuset_memory_pressure_enabled = !!val; + break; + case FILE_SPREAD_PAGE: + retval = update_flag(CS_SPREAD_PAGE, cs, val); + break; + case FILE_SPREAD_SLAB: + retval = update_flag(CS_SPREAD_SLAB, cs, val); + break; + default: + retval = -EINVAL; + break; + } +out_unlock: + cpuset_unlock(); + cpus_read_unlock(); + return retval; +} + +/* + * for the common functions, 'private' gives the type of file + */ + +struct cftype legacy_files[] = { + { + .name = "cpus", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), + .private = FILE_CPULIST, + }, + + { + .name = "mems", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * MAX_NUMNODES), + .private = FILE_MEMLIST, + }, + + { + .name = "effective_cpus", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_CPULIST, + }, + + { + .name = "effective_mems", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_MEMLIST, + }, + + { + .name = "cpu_exclusive", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_CPU_EXCLUSIVE, + }, + + { + .name = "mem_exclusive", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEM_EXCLUSIVE, + }, + + { + .name = "mem_hardwall", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEM_HARDWALL, + }, + + { + .name = "sched_load_balance", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SCHED_LOAD_BALANCE, + }, + + { + .name = "sched_relax_domain_level", + .read_s64 = cpuset_read_s64, + .write_s64 = cpuset_write_s64, + .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, + }, + + { + .name = "memory_migrate", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_MIGRATE, + }, + + { + .name = "memory_pressure", + .read_u64 = cpuset_read_u64, + .private = FILE_MEMORY_PRESSURE, + }, + + { + .name = "memory_spread_page", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SPREAD_PAGE, + }, + + { + /* obsolete, may be removed in the future */ + .name = "memory_spread_slab", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SPREAD_SLAB, + }, + + { + .name = "memory_pressure_enabled", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_PRESSURE_ENABLED, + }, + + { } /* terminate */ +}; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 175eaa491f21..f81c8fdbc18d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1113,8 +1113,6 @@ enum partition_cmd { partcmd_invalidate, /* Make partition invalid */ }; -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, - int turning_on); static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct tmpmasks *tmp); @@ -2709,7 +2707,7 @@ bool current_cpuset_is_being_rebound(void) * Call with cpuset_mutex held. 
*/ -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, +int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { struct cpuset *trialcs; @@ -3094,59 +3092,10 @@ out: mutex_unlock(&cpuset_mutex); } -static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - int retval = 0; - - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) { - retval = -ENODEV; - goto out_unlock; - } - - switch (type) { - case FILE_CPU_EXCLUSIVE: - retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); - break; - case FILE_MEM_EXCLUSIVE: - retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); - break; - case FILE_MEM_HARDWALL: - retval = update_flag(CS_MEM_HARDWALL, cs, val); - break; - case FILE_SCHED_LOAD_BALANCE: - retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); - break; - case FILE_MEMORY_MIGRATE: - retval = update_flag(CS_MEMORY_MIGRATE, cs, val); - break; - case FILE_MEMORY_PRESSURE_ENABLED: - cpuset_memory_pressure_enabled = !!val; - break; - case FILE_SPREAD_PAGE: - retval = update_flag(CS_SPREAD_PAGE, cs, val); - break; - case FILE_SPREAD_SLAB: - retval = update_flag(CS_SPREAD_SLAB, cs, val); - break; - default: - retval = -EINVAL; - break; - } -out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - return retval; -} - /* * Common handling for a write to a "cpus" or "mems" file. */ -static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, +ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); @@ -3221,7 +3170,7 @@ out_unlock: * and since these maps can change value dynamically, one could read * gibberish by doing partial reads while a list was changing. 
*/ -static int cpuset_common_seq_show(struct seq_file *sf, void *v) +int cpuset_common_seq_show(struct seq_file *sf, void *v) { struct cpuset *cs = css_cs(seq_css(sf)); cpuset_filetype_t type = seq_cft(sf)->private; @@ -3262,37 +3211,6 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) return ret; } -static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_CPU_EXCLUSIVE: - return is_cpu_exclusive(cs); - case FILE_MEM_EXCLUSIVE: - return is_mem_exclusive(cs); - case FILE_MEM_HARDWALL: - return is_mem_hardwall(cs); - case FILE_SCHED_LOAD_BALANCE: - return is_sched_load_balance(cs); - case FILE_MEMORY_MIGRATE: - return is_memory_migrate(cs); - case FILE_MEMORY_PRESSURE_ENABLED: - return cpuset_memory_pressure_enabled; - case FILE_MEMORY_PRESSURE: - return fmeter_getrate(&cs->fmeter); - case FILE_SPREAD_PAGE: - return is_spread_page(cs); - case FILE_SPREAD_SLAB: - return is_spread_slab(cs); - default: - BUG(); - } - - /* Unreachable but makes gcc happy */ - return 0; -} - static int sched_partition_show(struct seq_file *seq, void *v) { struct cpuset *cs = css_cs(seq_css(seq)); @@ -3356,113 +3274,6 @@ out_unlock: return retval ?: nbytes; } -/* - * for the common functions, 'private' gives the type of file - */ - -static struct cftype legacy_files[] = { - { - .name = "cpus", - .seq_show = cpuset_common_seq_show, - .write = cpuset_write_resmask, - .max_write_len = (100U + 6 * NR_CPUS), - .private = FILE_CPULIST, - }, - - { - .name = "mems", - .seq_show = cpuset_common_seq_show, - .write = cpuset_write_resmask, - .max_write_len = (100U + 6 * MAX_NUMNODES), - .private = FILE_MEMLIST, - }, - - { - .name = "effective_cpus", - .seq_show = cpuset_common_seq_show, - .private = FILE_EFFECTIVE_CPULIST, - }, - - { - .name = "effective_mems", - .seq_show = cpuset_common_seq_show, - .private = FILE_EFFECTIVE_MEMLIST, - }, - - { - .name = "cpu_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_CPU_EXCLUSIVE, - }, - - { - .name = "mem_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_EXCLUSIVE, - }, - - { - .name = "mem_hardwall", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_HARDWALL, - }, - - { - .name = "sched_load_balance", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SCHED_LOAD_BALANCE, - }, - - { - .name = "sched_relax_domain_level", - .read_s64 = cpuset_read_s64, - .write_s64 = cpuset_write_s64, - .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, - }, - - { - .name = "memory_migrate", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_MIGRATE, - }, - - { - .name = "memory_pressure", - .read_u64 = cpuset_read_u64, - .private = FILE_MEMORY_PRESSURE, - }, - - { - .name = "memory_spread_page", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_PAGE, - }, - - { - /* obsolete, may be removed in the future */ - .name = "memory_spread_slab", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_SLAB, - }, - - { - .name = "memory_pressure_enabled", - .flags = CFTYPE_ONLY_ON_ROOT, - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_PRESSURE_ENABLED, - }, - - { } /* terminate */ -}; - /* * This is currently a minimal set for the default hierarchy. 
It can be
 * expanded later on by migrating more features and control files from v1.

From 381b53c3b5494026199a11a2744074086e352b2b Mon Sep 17 00:00:00 2001
From: Chen Ridong
Date: Fri, 30 Aug 2024 10:02:27 +0000
Subject: [PATCH 28/34] cgroup/cpuset: rename functions shared between v1 and
 v2

Some of the function names declared in cpuset-internal.h are generic.
To avoid conflicting with other identifiers of the same name, rename
these functions with a cpuset_/cpuset1_ prefix to make them unique to
cpuset.

Signed-off-by: Chen Ridong
Acked-by: Waiman Long
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset-internal.h | 20 ++++++------
 kernel/cgroup/cpuset-v1.c | 40 ++++++++++++------------
 kernel/cgroup/cpuset.c | 52 ++++++++++++++++-----------------
 3 files changed, 56 insertions(+), 56 deletions(-)

diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index a6c71c86e58d..f36419d688bd 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -267,11 +267,11 @@ static inline int is_spread_slab(const struct cpuset *cs)
 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

 void rebuild_sched_domains_locked(void);
-void callback_lock_irq(void);
-void callback_unlock_irq(void);
-void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus);
-void update_tasks_nodemask(struct cpuset *cs);
-int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on);
+void cpuset_callback_lock_irq(void);
+void cpuset_callback_unlock_irq(void);
+void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus);
+void cpuset_update_tasks_nodemask(struct cpuset *cs);
+int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on);
 ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 char *buf, size_t nbytes, loff_t off);
 int cpuset_common_seq_show(struct seq_file *sf, void *v);
@@ -279,14 +279,14 @@ int cpuset_common_seq_show(struct seq_file *sf, void *v);
 /*
 * cpuset-v1.c
 */
-extern struct cftype legacy_files[];
+extern struct cftype cpuset1_files[];
 void fmeter_init(struct fmeter *fmp);
-void cpuset_update_task_spread_flags(struct cpuset *cs,
+void cpuset1_update_task_spread_flags(struct cpuset *cs,
 struct task_struct *tsk);
-void update_tasks_flags(struct cpuset *cs);
-void hotplug_update_tasks_legacy(struct cpuset *cs,
+void cpuset1_update_tasks_flags(struct cpuset *cs);
+void cpuset1_hotplug_update_tasks(struct cpuset *cs,
 struct cpumask *new_cpus, nodemask_t *new_mems,
 bool cpus_updated, bool mems_updated);
-int validate_change_legacy(struct cpuset *cur, struct cpuset *trial);
+int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);

 #endif /* __CPUSET_INTERNAL_H */
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index 0ccc440c468a..25c1d7b77e2f 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -209,7 +209,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
 * Call with callback_lock or cpuset_mutex held. The check can be skipped
 * if on default hierarchy.
 */
-void cpuset_update_task_spread_flags(struct cpuset *cs,
+void cpuset1_update_task_spread_flags(struct cpuset *cs,
 struct task_struct *tsk)
 {
 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
@@ -227,21 +227,21 @@ void cpuset_update_task_spread_flags(struct cpuset *cs,
 }

 /**
- * update_tasks_flags - update the spread flags of tasks in the cpuset.
+ * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
* @cs: the cpuset in which each task's spread flags needs to be changed * * Iterate through each task of @cs updating its spread flags. As this * function is called with cpuset_mutex held, cpuset membership stays * stable. */ -void update_tasks_flags(struct cpuset *cs) +void cpuset1_update_tasks_flags(struct cpuset *cs) { struct css_task_iter it; struct task_struct *task; css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) - cpuset_update_task_spread_flags(cs, task); + cpuset1_update_task_spread_flags(cs, task); css_task_iter_end(&it); } @@ -282,27 +282,27 @@ static void cpuset_migrate_tasks_workfn(struct work_struct *work) kfree(s); } -void hotplug_update_tasks_legacy(struct cpuset *cs, +void cpuset1_hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated) { bool is_empty; - callback_lock_irq(); + cpuset_callback_lock_irq(); cpumask_copy(cs->cpus_allowed, new_cpus); cpumask_copy(cs->effective_cpus, new_cpus); cs->mems_allowed = *new_mems; cs->effective_mems = *new_mems; - callback_unlock_irq(); + cpuset_callback_unlock_irq(); /* - * Don't call update_tasks_cpumask() if the cpuset becomes empty, + * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty, * as the tasks will be migrated to an ancestor. */ if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) - update_tasks_cpumask(cs, new_cpus); + cpuset_update_tasks_cpumask(cs, new_cpus); if (mems_updated && !nodes_empty(cs->mems_allowed)) - update_tasks_nodemask(cs); + cpuset_update_tasks_nodemask(cs); is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); @@ -345,10 +345,10 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) } /* - * validate_change_legacy() - Validate conditions specific to legacy (v1) + * cpuset1_validate_change() - Validate conditions specific to legacy (v1) * behavior. 
*/ -int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) +int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; @@ -421,28 +421,28 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, switch (type) { case FILE_CPU_EXCLUSIVE: - retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); + retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val); break; case FILE_MEM_EXCLUSIVE: - retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); + retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val); break; case FILE_MEM_HARDWALL: - retval = update_flag(CS_MEM_HARDWALL, cs, val); + retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val); break; case FILE_SCHED_LOAD_BALANCE: - retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); + retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val); break; case FILE_MEMORY_MIGRATE: - retval = update_flag(CS_MEMORY_MIGRATE, cs, val); + retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val); break; case FILE_MEMORY_PRESSURE_ENABLED: cpuset_memory_pressure_enabled = !!val; break; case FILE_SPREAD_PAGE: - retval = update_flag(CS_SPREAD_PAGE, cs, val); + retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val); break; case FILE_SPREAD_SLAB: - retval = update_flag(CS_SPREAD_SLAB, cs, val); + retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val); break; default: retval = -EINVAL; @@ -458,7 +458,7 @@ out_unlock: * for the common functions, 'private' gives the type of file */ -struct cftype legacy_files[] = { +struct cftype cpuset1_files[] = { { .name = "cpus", .seq_show = cpuset_common_seq_show, diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f81c8fdbc18d..c14fe8283a73 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -239,12 +239,12 @@ void cpuset_unlock(void) static DEFINE_SPINLOCK(callback_lock); -void callback_lock_irq(void) +void cpuset_callback_lock_irq(void) { spin_lock_irq(&callback_lock); } -void callback_unlock_irq(void) +void cpuset_callback_unlock_irq(void) { spin_unlock_irq(&callback_lock); } @@ -540,7 +540,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) rcu_read_lock(); if (!is_in_v2_mode()) - ret = validate_change_legacy(cur, trial); + ret = cpuset1_validate_change(cur, trial); if (ret) goto out; @@ -1053,7 +1053,7 @@ void rebuild_sched_domains(void) } /** - * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. + * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * @new_cpus: the temp variable for the new effective_cpus mask * @@ -1063,7 +1063,7 @@ void rebuild_sched_domains(void) * is used instead of effective_cpus to make sure all offline CPUs are also * included as hotplug code won't update cpumasks for tasks in top_cpuset. 
*/ -void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) +void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { struct css_task_iter it; struct task_struct *task; @@ -1126,11 +1126,11 @@ static int update_partition_exclusive(struct cpuset *cs, int new_prs) bool exclusive = (new_prs > PRS_MEMBER); if (exclusive && !is_cpu_exclusive(cs)) { - if (update_flag(CS_CPU_EXCLUSIVE, cs, 1)) + if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1)) return PERR_NOTEXCL; } else if (!exclusive && is_cpu_exclusive(cs)) { /* Turning off CS_CPU_EXCLUSIVE will not return error */ - update_flag(CS_CPU_EXCLUSIVE, cs, 0); + cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0); } return 0; } @@ -1380,7 +1380,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. */ - update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); return 0; } @@ -1416,7 +1416,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. */ - update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); } @@ -1468,7 +1468,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. */ - update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); return; @@ -1840,7 +1840,7 @@ write_error: update_partition_exclusive(cs, new_prs); if (adding || deleting) { - update_tasks_cpumask(parent, tmp->addmask); + cpuset_update_tasks_cpumask(parent, tmp->addmask); update_sibling_cpumasks(parent, cs, tmp); } @@ -2023,7 +2023,7 @@ update_parent_effective: /* * update_parent_effective_cpumask() should have been called * for cs already in update_cpumask(). We should also call - * update_tasks_cpumask() again for tasks in the parent + * cpuset_update_tasks_cpumask() again for tasks in the parent * cpuset if the parent's effective_cpus changes. */ if ((cp != cs) && old_prs) { @@ -2080,7 +2080,7 @@ get_css: WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); - update_tasks_cpumask(cp, cp->effective_cpus); + cpuset_update_tasks_cpumask(cp, cp->effective_cpus); /* * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE @@ -2507,14 +2507,14 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, static void *cpuset_being_rebound; /** - * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. + * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed * * Iterate through each task of @cs updating its mems_allowed to the * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. 
*/
-void update_tasks_nodemask(struct cpuset *cs)
+void cpuset_update_tasks_nodemask(struct cpuset *cs)
 {
 static nodemask_t newmems; /* protected by cpuset_mutex */
 struct css_task_iter it;
@@ -2612,7 +2612,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 WARN_ON(!is_in_v2_mode() &&
 !nodes_equal(cp->mems_allowed, cp->effective_mems));

- update_tasks_nodemask(cp);
+ cpuset_update_tasks_nodemask(cp);

 rcu_read_lock();
 css_put(&cp->css);
@@ -2699,7 +2699,7 @@ bool current_cpuset_is_being_rebound(void)
 }

 /*
- * update_flag - read a 0 or a 1 in a file and update associated flag
+ * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag
 * bit: the bit to update (see cpuset_flagbits_t)
 * cs: the cpuset to update
 * turning_on: whether the flag is being set or cleared
@@ -2707,7 +2707,7 @@ bool current_cpuset_is_being_rebound(void)
 * Call with cpuset_mutex held.
 */

-int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 int turning_on)
 {
 struct cpuset *trialcs;
@@ -2743,7 +2743,7 @@ int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 rebuild_sched_domains_locked();

 if (spread_flag_changed)
- update_tasks_flags(cs);
+ cpuset1_update_tasks_flags(cs);
 out:
 free_cpuset(trialcs);
 return err;
@@ -3008,7 +3008,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));

 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
- cpuset_update_task_spread_flags(cs, task);
+ cpuset1_update_task_spread_flags(cs, task);
 }

 static void cpuset_attach(struct cgroup_taskset *tset)
@@ -3484,7 +3484,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)

 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
 is_sched_load_balance(cs))
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

 cpuset_dec();
 clear_bit(CS_ONLINE, &cs->flags);
@@ -3623,7 +3623,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
 .can_fork = cpuset_can_fork,
 .cancel_fork = cpuset_cancel_fork,
 .fork = cpuset_fork,
- .legacy_cftypes = legacy_files,
+ .legacy_cftypes = cpuset1_files,
 .dfl_cftypes = dfl_files,
 .early_init = true,
 .threaded = true,
@@ -3683,9 +3683,9 @@ hotplug_update_tasks(struct cpuset *cs,
 spin_unlock_irq(&callback_lock);

 if (cpus_updated)
- update_tasks_cpumask(cs, new_cpus);
+ cpuset_update_tasks_cpumask(cs, new_cpus);
 if (mems_updated)
- update_tasks_nodemask(cs);
+ cpuset_update_tasks_nodemask(cs);
 }

 void cpuset_force_rebuild(void)
@@ -3786,7 +3786,7 @@ update_tasks:
 hotplug_update_tasks(cs, &new_cpus, &new_mems,
 cpus_updated, mems_updated);
 else
- hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
+ cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems,
 cpus_updated, mems_updated);

 unlock:
@@ -3871,7 +3871,7 @@ static void cpuset_handle_hotplug(void)
 top_cpuset.mems_allowed = new_mems;
 top_cpuset.effective_mems = new_mems;
 spin_unlock_irq(&callback_lock);
- update_tasks_nodemask(&top_cpuset);
+ cpuset_update_tasks_nodemask(&top_cpuset);
 }

 mutex_unlock(&cpuset_mutex);

From 1abab1ba0775036bb67c6c57945c637be644c04f Mon Sep 17 00:00:00 2001
From: Chen Ridong
Date: Fri, 30 Aug 2024 10:02:28 +0000
Subject: [PATCH 29/34] cgroup/cpuset: guard cpuset-v1 code under
 CONFIG_CPUSETS_V1

Introduce CONFIG_CPUSETS_V1 and guard the cpuset-v1 code under it.
CONFIG_CPUSETS_V1 defaults to N, so that users who have adopted v2
don't have to 'pay' for cpuset v1.
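
The guard relies on the usual static-inline stub pattern, so common
code can keep calling the v1 hooks without any #ifdef of its own; a
minimal sketch (declarations as in cpuset-internal.h below, caller as
in validate_change()):

	#ifdef CONFIG_CPUSETS_V1
	int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
	#else
	static inline int cpuset1_validate_change(struct cpuset *cur,
						  struct cpuset *trial) { return 0; }
	#endif

	/* the common-path caller compiles down to a no-op when v1 is disabled */
	if (!is_in_v2_mode())
		ret = cpuset1_validate_change(cur, trial);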
Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cpuset.h | 4 ++++ init/Kconfig | 13 +++++++++++++ kernel/cgroup/Makefile | 3 ++- kernel/cgroup/cpuset-internal.h | 12 ++++++++++++ kernel/cgroup/cpuset.c | 2 ++ 5 files changed, 33 insertions(+), 1 deletion(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 2a6981eeebf8..835e7b793f6a 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -99,6 +99,7 @@ static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, const struct task_struct *tsk2); +#ifdef CONFIG_CPUSETS_V1 #define cpuset_memory_pressure_bump() \ do { \ if (cpuset_memory_pressure_enabled) \ @@ -106,6 +107,9 @@ extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, } while (0) extern int cpuset_memory_pressure_enabled; extern void __cpuset_memory_pressure_bump(void); +#else +static inline void cpuset_memory_pressure_bump(void) { } +#endif extern void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task); diff --git a/init/Kconfig b/init/Kconfig index a465ea9525bd..8bf091354bea 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1143,6 +1143,19 @@ config CPUSETS Say N if unsure. +config CPUSETS_V1 + bool "Legacy cgroup v1 cpusets controller" + depends on CPUSETS + default n + help + Legacy cgroup v1 cpusets controller which has been deprecated by + cgroup v2 implementation. The v1 is there for legacy applications + which haven't migrated to the new cgroup v2 interface yet. If you + do not have any such application then you are completely fine leaving + this option disabled. + + Say N if unsure. + config PROC_PID_CPUSET bool "Include legacy /proc//cpuset file" depends on CPUSETS diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 005ac4c675cb..a5c9359d516f 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -4,6 +4,7 @@ obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o freezer.o obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o -obj-$(CONFIG_CPUSETS) += cpuset.o cpuset-v1.o +obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_CPUSETS_V1) += cpuset-v1.o obj-$(CONFIG_CGROUP_MISC) += misc.o obj-$(CONFIG_CGROUP_DEBUG) += debug.o diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index f36419d688bd..8c113d46ddd3 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -279,6 +279,7 @@ int cpuset_common_seq_show(struct seq_file *sf, void *v); /* * cpuset-v1.c */ +#ifdef CONFIG_CPUSETS_V1 extern struct cftype cpuset1_files[]; void fmeter_init(struct fmeter *fmp); void cpuset1_update_task_spread_flags(struct cpuset *cs, @@ -288,5 +289,16 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated); int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial); +#else +static inline void fmeter_init(struct fmeter *fmp) {} +static inline void cpuset1_update_task_spread_flags(struct cpuset *cs, + struct task_struct *tsk) {} +static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {} +static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated) {} +static inline int cpuset1_validate_change(struct cpuset *cur, + struct cpuset *trial) { return 0; } +#endif /* 
CONFIG_CPUSETS_V1 */

 #endif /* __CPUSET_INTERNAL_H */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c14fe8283a73..13016ad284a1 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -3623,7 +3623,9 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
 .can_fork = cpuset_can_fork,
 .cancel_fork = cpuset_cancel_fork,
 .fork = cpuset_fork,
+#ifdef CONFIG_CPUSETS_V1
 .legacy_cftypes = cpuset1_files,
+#endif
 .dfl_cftypes = dfl_files,
 .early_init = true,
 .threaded = true,

From 3f9319c6914c45a2d406faf3aa8cd0dee8bc4c2f Mon Sep 17 00:00:00 2001
From: Chen Ridong
Date: Fri, 30 Aug 2024 10:02:29 +0000
Subject: [PATCH 30/34] cgroup/cpuset: add selftest for cpuset v1

There is only a hotplug test for cpuset v1, so add a basic read/write
test for the cpuset v1 interfaces.

Signed-off-by: Chen Ridong
Acked-by: Waiman Long
Signed-off-by: Tejun Heo
---
 MAINTAINERS | 1 +
 .../selftests/cgroup/test_cpuset_v1_base.sh | 77 +++++++++++++++++++
 2 files changed, 78 insertions(+)
 create mode 100755 tools/testing/selftests/cgroup/test_cpuset_v1_base.sh

diff --git a/MAINTAINERS b/MAINTAINERS
index 3b5ec1cafd95..b59f54e1e30d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5703,6 +5703,7 @@ F: kernel/cgroup/cpuset-v1.c
 F: kernel/cgroup/cpuset.c
 F: tools/testing/selftests/cgroup/test_cpuset.c
 F: tools/testing/selftests/cgroup/test_cpuset_prs.sh
+F: tools/testing/selftests/cgroup/test_cpuset_v1_base.sh

 CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG)
 M: Johannes Weiner
diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh
new file mode 100755
index 000000000000..42a6628fb8bc
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Basic test for cpuset v1 interfaces write/read
+#
+
+skip_test() {
+ echo "$1"
+ echo "Test SKIPPED"
+ exit 4 # ksft_skip
+}
+
+write_test() {
+ dir=$1
+ interface=$2
+ value=$3
+ original=$(cat $dir/$interface)
+ echo "testing $interface $value"
+ echo $value > $dir/$interface
+ new=$(cat $dir/$interface)
+ [[ $value -ne $(cat $dir/$interface) ]] && {
+ echo "$interface write $value failed: new:$new"
+ exit 1
+ }
+}
+
+[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!"
+
+# Find cpuset v1 mount point
+CPUSET=$(mount -t cgroup | grep cpuset | head -1 | awk '{print $3}')
+[[ -n "$CPUSET" ]] || skip_test "cpuset v1 mount point not found!"
+
+
+#
+# Create a test cpuset, read write test
+#
+TDIR=test$$
+[[ -d $CPUSET/$TDIR ]] || mkdir $CPUSET/$TDIR
+
+ITF_MATRIX=(
+ #interface value expect root_only
+ 'cpuset.cpus 0-1 0-1 0'
+ 'cpuset.mem_exclusive 1 1 0'
+ 'cpuset.mem_exclusive 0 0 0'
+ 'cpuset.mem_hardwall 1 1 0'
+ 'cpuset.mem_hardwall 0 0 0'
+ 'cpuset.memory_migrate 1 1 0'
+ 'cpuset.memory_migrate 0 0 0'
+ 'cpuset.memory_spread_page 1 1 0'
+ 'cpuset.memory_spread_page 0 0 0'
+ 'cpuset.memory_spread_slab 1 1 0'
+ 'cpuset.memory_spread_slab 0 0 0'
+ 'cpuset.mems 0 0 0'
+ 'cpuset.sched_load_balance 1 1 0'
+ 'cpuset.sched_load_balance 0 0 0'
+ 'cpuset.sched_relax_domain_level 2 2 0'
+ 'cpuset.memory_pressure_enabled 1 1 1'
+ 'cpuset.memory_pressure_enabled 0 0 1'
+)
+
+run_test()
+{
+ cnt="${ITF_MATRIX[@]}"
+ for i in "${ITF_MATRIX[@]}" ; do
+ args=($i)
+ root_only=${args[3]}
+ [[ $root_only -eq 1 ]] && {
+ write_test "$CPUSET" "${args[0]}" "${args[1]}" "${args[2]}"
+ continue
+ }
+ write_test "$CPUSET/$TDIR" "${args[0]}" "${args[1]}" "${args[2]}"
+ done
+}
+
+run_test
+rmdir $CPUSET/$TDIR
+echo "Test PASSED"
+exit 0

From 8c7e22fc917a0d76794ebf3fcd81f9d91cee4f5d Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Sat, 31 Aug 2024 09:57:03 -0400
Subject: [PATCH 31/34] cgroup/cpuset: Move cpu.h include to cpuset-internal.h

The newly created cpuset-v1.c file uses the cpus_read_lock/unlock()
functions which are defined in cpu.h but not yet included in
cpuset-internal.h, leading to a compilation error under certain kernel
configurations. Fix it by moving the cpu.h include from cpuset.c to
cpuset-internal.h.

While at it, sort the include files in alphabetical order.

Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202408311612.mQTuO946-lkp@intel.com/
Fixes: 047b83097448 ("cgroup/cpuset: move relax_domain_level to cpuset-v1.c")
Signed-off-by: Waiman Long
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset-internal.h | 9 +++++----
 kernel/cgroup/cpuset.c | 1 -
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 8c113d46ddd3..976a8bc3ff60 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -3,11 +3,12 @@
 #ifndef __CPUSET_INTERNAL_H
 #define __CPUSET_INTERNAL_H

-#include <linux/cpuset.h>
-#include <linux/spinlock.h>
-#include <linux/union_find.h>
-#include <linux/cpumask.h>
 #include <linux/cgroup.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/spinlock.h>
+#include <linux/union_find.h>

 /* See "Frequency meter" comments, below. */

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 13016ad284a1..a4dd285cdf39 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -24,7 +24,6 @@
 #include "cgroup-internal.h"
 #include "cpuset-internal.h"

-#include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>

From 659f90f863a628c007c49aff97b30cb5908a0480 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Koutn=C3=BD?=
Date: Mon, 9 Sep 2024 18:32:21 +0200
Subject: [PATCH 32/34] cgroup/cpuset: Expose cpuset filesystem with cpuset v1
 only
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cpuset filesystem is a legacy interface to the cpuset controller
with (pre-)v1 features. It makes little sense to co-mount it on systems
without cpuset v1, so do not build it when cpuset v1 is not built
either.
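
With CONFIG_CPUSETS_V1=n the cpuset filesystem type is no longer
registered at all, so a legacy mount attempt fails up front; a
hypothetical session on such a kernel (mount point and exact error
text illustrative only):

	$ sudo mount -t cpuset none /dev/cpuset
	mount: /dev/cpuset: unknown filesystem type 'cpuset'.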
Signed-off-by: Michal Koutný
Reviewed-by: Waiman Long
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c72e18ffbfd8..90e50d6d3cf3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2331,7 +2331,7 @@ static struct file_system_type cgroup2_fs_type = {
 .fs_flags = FS_USERNS_MOUNT,
 };

-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
 static const struct fs_context_operations cpuset_fs_context_ops = {
 .get_tree = cgroup1_get_tree,
 .free = cgroup_fs_context_free,
@@ -6236,7 +6236,7 @@ int __init cgroup_init(void)
 WARN_ON(register_filesystem(&cgroup_fs_type));
 WARN_ON(register_filesystem(&cgroup2_fs_type));
 WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
 WARN_ON(register_filesystem(&cpuset_fs_type));
 #endif

From 3c41382e920f1dd5c9f432948fe799c07af1cced Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Koutn=C3=BD?=
Date: Mon, 9 Sep 2024 18:32:22 +0200
Subject: [PATCH 33/34] cgroup: Disallow mounting v1 hierarchies without
 controller implementation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The configs that disable some v1 controllers would still allow mounting
them, just with no controller-specific files. (Making such hierarchies
equivalent to named v1 hierarchies.)

To achieve behavior consistent with actual out-compilation of a whole
controller, the mounts should treat the respective controllers as
non-existent.

Wrap the implementation into a helper function and leverage
legacy_files to detect compiled-out controllers. The effect is that
mounts on v1 would fail and produce a message like:

[ 1543.999081] cgroup: Unknown subsys name 'memory'

Signed-off-by: Michal Koutný
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cgroup-v1.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index b9dbf6bf2779..784337694a4b 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -46,6 +46,12 @@ bool cgroup1_ssid_disabled(int ssid)
 return cgroup_no_v1_mask & (1 << ssid);
 }

+static bool cgroup1_subsys_absent(struct cgroup_subsys *ss)
+{
+ /* Check also dfl_cftypes for file-less controllers, i.e.
perf_event */ + return ss->legacy_cftypes == NULL && ss->dfl_cftypes; +} + /** * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' * @from: attach to all cgroups of a given task @@ -932,7 +938,8 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) if (ret != -ENOPARAM) return ret; for_each_subsys(ss, i) { - if (strcmp(param->key, ss->legacy_name)) + if (strcmp(param->key, ss->legacy_name) || + cgroup1_subsys_absent(ss)) continue; if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i)) return invalfc(fc, "Disabled controller '%s'", @@ -1024,7 +1031,8 @@ static int check_cgroupfs_options(struct fs_context *fc) mask = ~((u16)1 << cpuset_cgrp_id); #endif for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) + if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) && + !cgroup1_subsys_absent(ss)) enabled |= 1 << i; ctx->subsys_mask &= enabled; From af000ce85293b8e608f696f0c6c280bc3a75887f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Mon, 9 Sep 2024 18:32:23 +0200 Subject: [PATCH 34/34] cgroup: Do not report unavailable v1 controllers in /proc/cgroups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a followup to CONFIG-urability of cpuset and memory controllers for v1 hierarchies. Make the output in /proc/cgroups reflect that !CONFIG_CPUSETS_V1 is like !CONFIG_CPUSETS and !CONFIG_MEMCG_V1 is like !CONFIG_MEMCG. The intended effect is that hiding the unavailable controllers will hint users not to try mounting them on v1. Signed-off-by: Michal Koutný Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 784337694a4b..e28d5f0d20ed 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -681,11 +681,14 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) * cgroup_mutex contention. */ - for_each_subsys(ss, i) + for_each_subsys(ss, i) { + if (cgroup1_subsys_absent(ss)) + continue; seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); + } return 0; }
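
To illustrate the intended effect: on a hypothetical kernel built with
CONFIG_CPUSETS_V1=n, cpuset simply disappears from the listing instead
of being shown with enabled=0 (header as printed by
proc_cgroupstats_show(); hierarchy and cgroup counts illustrative only):

	# cat /proc/cgroups
	#subsys_name	hierarchy	num_cgroups	enabled
	cpu	0	105	1
	memory	0	105	1
	pids	0	105	1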