2019-06-01 08:08:42 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2013-11-24 14:54:58 +00:00
|
|
|
/*
|
|
|
|
* fs/kernfs/mount.c - kernfs mount implementation
|
|
|
|
*
|
|
|
|
* Copyright (c) 2001-3 Patrick Mochel
|
|
|
|
* Copyright (c) 2007 SUSE Linux Products GmbH
|
|
|
|
* Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
|
|
|
|
*/
|
2013-11-28 19:54:44 +00:00
|
|
|
|
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/mount.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/magic.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/pagemap.h>
|
2016-01-29 08:54:08 +00:00
|
|
|
#include <linux/namei.h>
|
cgroup, kernfs: make mountinfo show properly scoped path for cgroup namespaces
Patch summary:
When showing a cgroupfs entry in mountinfo, show the path of the mount
root dentry relative to the reader's cgroup namespace root.
Short explanation (courtesy of mkerrisk):
If we create a new cgroup namespace, then we want both /proc/self/cgroup
and /proc/self/mountinfo to show cgroup paths that are correctly
virtualized with respect to the cgroup mount point. Previous to this
patch, /proc/self/cgroup shows the right info, but /proc/self/mountinfo
does not.
Long version:
When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
namespace, and then mounts a new instance of the freezer cgroup, the new
mount will be rooted at /a/b. The root dentry field of the mountinfo
entry will show '/a/b'.
cat > /tmp/do1 << EOF
mount -t cgroup -o freezer freezer /mnt
grep freezer /proc/self/mountinfo
EOF
unshare -Gm bash /tmp/do1
> 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
> 355 133 0:34 /a/b /mnt rw,relatime - cgroup freezer rw,freezer
The task's freezer cgroup entry in /proc/self/cgroup will simply show
'/':
grep freezer /proc/self/cgroup
9:freezer:/
If instead the same task simply bind mounts the /a/b cgroup directory,
the resulting mountinfo entry will again show /a/b for the dentry root.
However in this case the task will find its own cgroup at /mnt/a/b,
not at /mnt:
mount --bind /sys/fs/cgroup/freezer/a/b /mnt
130 25 0:34 /a/b /mnt rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,freezer
In other words, there is no way for the task to know, based on what is
in mountinfo, which cgroup directory is its own.
Example (by mkerrisk):
First, a little script to save some typing and verbiage:
echo -e "\t/proc/self/cgroup:\t$(cat /proc/self/cgroup | grep freezer)"
cat /proc/self/mountinfo | grep freezer |
awk '{print "\tmountinfo:\t\t" $4 "\t" $5}'
Create cgroup, place this shell into the cgroup, and look at the state
of the /proc files:
2653
2653 # Our shell
14254 # cat(1)
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
Create a shell in new cgroup and mount namespaces. The act of creating
a new cgroup namespace causes the process's current cgroups directories
to become its cgroup root directories. (Here, I'm using my own version
of the "unshare" utility, which takes the same options as the util-linux
version):
Look at the state of the /proc files:
/proc/self/cgroup: 10:freezer:/
mountinfo: / /sys/fs/cgroup/freezer
The third entry in /proc/self/cgroup (the pathname of the cgroup inside
the hierarchy) is correctly virtualized w.r.t. the cgroup namespace, which
is rooted at /a/b in the outer namespace.
However, the info in /proc/self/mountinfo is not for this cgroup
namespace, since we are seeing a duplicate of the mount from the
old mount namespace, and the info there does not correspond to the
new cgroup namespace. However, trying to create a new mount still
doesn't show us the right information in mountinfo:
# propagating to other mountns
/proc/self/cgroup: 7:freezer:/
mountinfo: /a/b /mnt/freezer
The act of creating a new cgroup namespace caused the process's
current freezer directory, "/a/b", to become its cgroup freezer root
directory. In other words, the pathname directory of the directory
within the newly mounted cgroup filesystem should be "/",
but mountinfo wrongly shows us "/a/b". The consequence of this is
that the process in the cgroup namespace cannot correctly construct
the pathname of its cgroup root directory from the information in
/proc/PID/mountinfo.
With this patch, the dentry root field in mountinfo is shown relative
to the reader's cgroup namespace. So the same steps as above:
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: /../.. /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: / /mnt/freezer
cgroup.clone_children freezer.parent_freezing freezer.state tasks
cgroup.procs freezer.self_freezing notify_on_release
3164
2653 # First shell that placed in this cgroup
3164 # Shell started by 'unshare'
14197 # cat(1)
Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2016-05-09 14:59:55 +00:00
|
|
|
#include <linux/seq_file.h>
|
2017-07-12 18:49:51 +00:00
|
|
|
#include <linux/exportfs.h>
|
2023-07-31 18:47:31 +00:00
|
|
|
#include <linux/uuid.h>
|
|
|
|
#include <linux/statfs.h>
|
2013-11-28 19:54:44 +00:00
|
|
|
|
|
|
|
#include "kernfs-internal.h"
|
|
|
|
|
2023-10-11 16:55:00 +00:00
|
|
|
struct kmem_cache *kernfs_node_cache __ro_after_init;
|
|
|
|
struct kmem_cache *kernfs_iattrs_cache __ro_after_init;
|
|
|
|
struct kernfs_global_locks *kernfs_locks __ro_after_init;
|
2013-11-28 19:54:44 +00:00
|
|
|
|
2014-02-03 19:09:10 +00:00
|
|
|
static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
|
|
|
|
{
|
2017-07-12 18:49:49 +00:00
|
|
|
struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry));
|
2014-02-03 19:09:10 +00:00
|
|
|
struct kernfs_syscall_ops *scops = root->syscall_ops;
|
|
|
|
|
|
|
|
if (scops && scops->show_options)
|
|
|
|
return scops->show_options(sf, root);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
cgroup, kernfs: make mountinfo show properly scoped path for cgroup namespaces
Patch summary:
When showing a cgroupfs entry in mountinfo, show the path of the mount
root dentry relative to the reader's cgroup namespace root.
Short explanation (courtesy of mkerrisk):
If we create a new cgroup namespace, then we want both /proc/self/cgroup
and /proc/self/mountinfo to show cgroup paths that are correctly
virtualized with respect to the cgroup mount point. Previous to this
patch, /proc/self/cgroup shows the right info, but /proc/self/mountinfo
does not.
Long version:
When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
namespace, and then mounts a new instance of the freezer cgroup, the new
mount will be rooted at /a/b. The root dentry field of the mountinfo
entry will show '/a/b'.
cat > /tmp/do1 << EOF
mount -t cgroup -o freezer freezer /mnt
grep freezer /proc/self/mountinfo
EOF
unshare -Gm bash /tmp/do1
> 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
> 355 133 0:34 /a/b /mnt rw,relatime - cgroup freezer rw,freezer
The task's freezer cgroup entry in /proc/self/cgroup will simply show
'/':
grep freezer /proc/self/cgroup
9:freezer:/
If instead the same task simply bind mounts the /a/b cgroup directory,
the resulting mountinfo entry will again show /a/b for the dentry root.
However in this case the task will find its own cgroup at /mnt/a/b,
not at /mnt:
mount --bind /sys/fs/cgroup/freezer/a/b /mnt
130 25 0:34 /a/b /mnt rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,freezer
In other words, there is no way for the task to know, based on what is
in mountinfo, which cgroup directory is its own.
Example (by mkerrisk):
First, a little script to save some typing and verbiage:
echo -e "\t/proc/self/cgroup:\t$(cat /proc/self/cgroup | grep freezer)"
cat /proc/self/mountinfo | grep freezer |
awk '{print "\tmountinfo:\t\t" $4 "\t" $5}'
Create cgroup, place this shell into the cgroup, and look at the state
of the /proc files:
2653
2653 # Our shell
14254 # cat(1)
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
Create a shell in new cgroup and mount namespaces. The act of creating
a new cgroup namespace causes the process's current cgroups directories
to become its cgroup root directories. (Here, I'm using my own version
of the "unshare" utility, which takes the same options as the util-linux
version):
Look at the state of the /proc files:
/proc/self/cgroup: 10:freezer:/
mountinfo: / /sys/fs/cgroup/freezer
The third entry in /proc/self/cgroup (the pathname of the cgroup inside
the hierarchy) is correctly virtualized w.r.t. the cgroup namespace, which
is rooted at /a/b in the outer namespace.
However, the info in /proc/self/mountinfo is not for this cgroup
namespace, since we are seeing a duplicate of the mount from the
old mount namespace, and the info there does not correspond to the
new cgroup namespace. However, trying to create a new mount still
doesn't show us the right information in mountinfo:
# propagating to other mountns
/proc/self/cgroup: 7:freezer:/
mountinfo: /a/b /mnt/freezer
The act of creating a new cgroup namespace caused the process's
current freezer directory, "/a/b", to become its cgroup freezer root
directory. In other words, the pathname directory of the directory
within the newly mounted cgroup filesystem should be "/",
but mountinfo wrongly shows us "/a/b". The consequence of this is
that the process in the cgroup namespace cannot correctly construct
the pathname of its cgroup root directory from the information in
/proc/PID/mountinfo.
With this patch, the dentry root field in mountinfo is shown relative
to the reader's cgroup namespace. So the same steps as above:
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: /../.. /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: / /mnt/freezer
cgroup.clone_children freezer.parent_freezing freezer.state tasks
cgroup.procs freezer.self_freezing notify_on_release
3164
2653 # First shell that placed in this cgroup
3164 # Shell started by 'unshare'
14197 # cat(1)
Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2016-05-09 14:59:55 +00:00
|
|
|
static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry)
|
|
|
|
{
|
2017-07-12 18:49:49 +00:00
|
|
|
struct kernfs_node *node = kernfs_dentry_node(dentry);
|
cgroup, kernfs: make mountinfo show properly scoped path for cgroup namespaces
Patch summary:
When showing a cgroupfs entry in mountinfo, show the path of the mount
root dentry relative to the reader's cgroup namespace root.
Short explanation (courtesy of mkerrisk):
If we create a new cgroup namespace, then we want both /proc/self/cgroup
and /proc/self/mountinfo to show cgroup paths that are correctly
virtualized with respect to the cgroup mount point. Previous to this
patch, /proc/self/cgroup shows the right info, but /proc/self/mountinfo
does not.
Long version:
When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
namespace, and then mounts a new instance of the freezer cgroup, the new
mount will be rooted at /a/b. The root dentry field of the mountinfo
entry will show '/a/b'.
cat > /tmp/do1 << EOF
mount -t cgroup -o freezer freezer /mnt
grep freezer /proc/self/mountinfo
EOF
unshare -Gm bash /tmp/do1
> 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
> 355 133 0:34 /a/b /mnt rw,relatime - cgroup freezer rw,freezer
The task's freezer cgroup entry in /proc/self/cgroup will simply show
'/':
grep freezer /proc/self/cgroup
9:freezer:/
If instead the same task simply bind mounts the /a/b cgroup directory,
the resulting mountinfo entry will again show /a/b for the dentry root.
However in this case the task will find its own cgroup at /mnt/a/b,
not at /mnt:
mount --bind /sys/fs/cgroup/freezer/a/b /mnt
130 25 0:34 /a/b /mnt rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,freezer
In other words, there is no way for the task to know, based on what is
in mountinfo, which cgroup directory is its own.
Example (by mkerrisk):
First, a little script to save some typing and verbiage:
echo -e "\t/proc/self/cgroup:\t$(cat /proc/self/cgroup | grep freezer)"
cat /proc/self/mountinfo | grep freezer |
awk '{print "\tmountinfo:\t\t" $4 "\t" $5}'
Create cgroup, place this shell into the cgroup, and look at the state
of the /proc files:
2653
2653 # Our shell
14254 # cat(1)
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
Create a shell in new cgroup and mount namespaces. The act of creating
a new cgroup namespace causes the process's current cgroups directories
to become its cgroup root directories. (Here, I'm using my own version
of the "unshare" utility, which takes the same options as the util-linux
version):
Look at the state of the /proc files:
/proc/self/cgroup: 10:freezer:/
mountinfo: / /sys/fs/cgroup/freezer
The third entry in /proc/self/cgroup (the pathname of the cgroup inside
the hierarchy) is correctly virtualized w.r.t. the cgroup namespace, which
is rooted at /a/b in the outer namespace.
However, the info in /proc/self/mountinfo is not for this cgroup
namespace, since we are seeing a duplicate of the mount from the
old mount namespace, and the info there does not correspond to the
new cgroup namespace. However, trying to create a new mount still
doesn't show us the right information in mountinfo:
# propagating to other mountns
/proc/self/cgroup: 7:freezer:/
mountinfo: /a/b /mnt/freezer
The act of creating a new cgroup namespace caused the process's
current freezer directory, "/a/b", to become its cgroup freezer root
directory. In other words, the pathname directory of the directory
within the newly mounted cgroup filesystem should be "/",
but mountinfo wrongly shows us "/a/b". The consequence of this is
that the process in the cgroup namespace cannot correctly construct
the pathname of its cgroup root directory from the information in
/proc/PID/mountinfo.
With this patch, the dentry root field in mountinfo is shown relative
to the reader's cgroup namespace. So the same steps as above:
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: /../.. /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: / /mnt/freezer
cgroup.clone_children freezer.parent_freezing freezer.state tasks
cgroup.procs freezer.self_freezing notify_on_release
3164
2653 # First shell that placed in this cgroup
3164 # Shell started by 'unshare'
14197 # cat(1)
Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2016-05-09 14:59:55 +00:00
|
|
|
struct kernfs_root *root = kernfs_root(node);
|
|
|
|
struct kernfs_syscall_ops *scops = root->syscall_ops;
|
|
|
|
|
|
|
|
if (scops && scops->show_path)
|
|
|
|
return scops->show_path(sf, node, root);
|
|
|
|
|
2016-05-12 05:29:45 +00:00
|
|
|
seq_dentry(sf, dentry, " \t\n\\");
|
|
|
|
return 0;
|
cgroup, kernfs: make mountinfo show properly scoped path for cgroup namespaces
Patch summary:
When showing a cgroupfs entry in mountinfo, show the path of the mount
root dentry relative to the reader's cgroup namespace root.
Short explanation (courtesy of mkerrisk):
If we create a new cgroup namespace, then we want both /proc/self/cgroup
and /proc/self/mountinfo to show cgroup paths that are correctly
virtualized with respect to the cgroup mount point. Previous to this
patch, /proc/self/cgroup shows the right info, but /proc/self/mountinfo
does not.
Long version:
When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
namespace, and then mounts a new instance of the freezer cgroup, the new
mount will be rooted at /a/b. The root dentry field of the mountinfo
entry will show '/a/b'.
cat > /tmp/do1 << EOF
mount -t cgroup -o freezer freezer /mnt
grep freezer /proc/self/mountinfo
EOF
unshare -Gm bash /tmp/do1
> 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
> 355 133 0:34 /a/b /mnt rw,relatime - cgroup freezer rw,freezer
The task's freezer cgroup entry in /proc/self/cgroup will simply show
'/':
grep freezer /proc/self/cgroup
9:freezer:/
If instead the same task simply bind mounts the /a/b cgroup directory,
the resulting mountinfo entry will again show /a/b for the dentry root.
However in this case the task will find its own cgroup at /mnt/a/b,
not at /mnt:
mount --bind /sys/fs/cgroup/freezer/a/b /mnt
130 25 0:34 /a/b /mnt rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,freezer
In other words, there is no way for the task to know, based on what is
in mountinfo, which cgroup directory is its own.
Example (by mkerrisk):
First, a little script to save some typing and verbiage:
echo -e "\t/proc/self/cgroup:\t$(cat /proc/self/cgroup | grep freezer)"
cat /proc/self/mountinfo | grep freezer |
awk '{print "\tmountinfo:\t\t" $4 "\t" $5}'
Create cgroup, place this shell into the cgroup, and look at the state
of the /proc files:
2653
2653 # Our shell
14254 # cat(1)
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
Create a shell in new cgroup and mount namespaces. The act of creating
a new cgroup namespace causes the process's current cgroups directories
to become its cgroup root directories. (Here, I'm using my own version
of the "unshare" utility, which takes the same options as the util-linux
version):
Look at the state of the /proc files:
/proc/self/cgroup: 10:freezer:/
mountinfo: / /sys/fs/cgroup/freezer
The third entry in /proc/self/cgroup (the pathname of the cgroup inside
the hierarchy) is correctly virtualized w.r.t. the cgroup namespace, which
is rooted at /a/b in the outer namespace.
However, the info in /proc/self/mountinfo is not for this cgroup
namespace, since we are seeing a duplicate of the mount from the
old mount namespace, and the info there does not correspond to the
new cgroup namespace. However, trying to create a new mount still
doesn't show us the right information in mountinfo:
# propagating to other mountns
/proc/self/cgroup: 7:freezer:/
mountinfo: /a/b /mnt/freezer
The act of creating a new cgroup namespace caused the process's
current freezer directory, "/a/b", to become its cgroup freezer root
directory. In other words, the pathname directory of the directory
within the newly mounted cgroup filesystem should be "/",
but mountinfo wrongly shows us "/a/b". The consequence of this is
that the process in the cgroup namespace cannot correctly construct
the pathname of its cgroup root directory from the information in
/proc/PID/mountinfo.
With this patch, the dentry root field in mountinfo is shown relative
to the reader's cgroup namespace. So the same steps as above:
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: /../.. /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: / /mnt/freezer
cgroup.clone_children freezer.parent_freezing freezer.state tasks
cgroup.procs freezer.self_freezing notify_on_release
3164
2653 # First shell that placed in this cgroup
3164 # Shell started by 'unshare'
14197 # cat(1)
Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2016-05-09 14:59:55 +00:00
|
|
|
}
|
|
|
|
|
2023-07-31 18:47:31 +00:00
|
|
|
static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf)
|
|
|
|
{
|
|
|
|
simple_statfs(dentry, buf);
|
|
|
|
buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-02-14 08:57:27 +00:00
|
|
|
const struct super_operations kernfs_sops = {
|
2023-07-31 18:47:31 +00:00
|
|
|
.statfs = kernfs_statfs,
|
2013-11-28 19:54:44 +00:00
|
|
|
.drop_inode = generic_delete_inode,
|
2013-12-11 19:11:58 +00:00
|
|
|
.evict_inode = kernfs_evict_inode,
|
2014-02-03 19:09:10 +00:00
|
|
|
|
|
|
|
.show_options = kernfs_sop_show_options,
|
cgroup, kernfs: make mountinfo show properly scoped path for cgroup namespaces
Patch summary:
When showing a cgroupfs entry in mountinfo, show the path of the mount
root dentry relative to the reader's cgroup namespace root.
Short explanation (courtesy of mkerrisk):
If we create a new cgroup namespace, then we want both /proc/self/cgroup
and /proc/self/mountinfo to show cgroup paths that are correctly
virtualized with respect to the cgroup mount point. Previous to this
patch, /proc/self/cgroup shows the right info, but /proc/self/mountinfo
does not.
Long version:
When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
namespace, and then mounts a new instance of the freezer cgroup, the new
mount will be rooted at /a/b. The root dentry field of the mountinfo
entry will show '/a/b'.
cat > /tmp/do1 << EOF
mount -t cgroup -o freezer freezer /mnt
grep freezer /proc/self/mountinfo
EOF
unshare -Gm bash /tmp/do1
> 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
> 355 133 0:34 /a/b /mnt rw,relatime - cgroup freezer rw,freezer
The task's freezer cgroup entry in /proc/self/cgroup will simply show
'/':
grep freezer /proc/self/cgroup
9:freezer:/
If instead the same task simply bind mounts the /a/b cgroup directory,
the resulting mountinfo entry will again show /a/b for the dentry root.
However in this case the task will find its own cgroup at /mnt/a/b,
not at /mnt:
mount --bind /sys/fs/cgroup/freezer/a/b /mnt
130 25 0:34 /a/b /mnt rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,freezer
In other words, there is no way for the task to know, based on what is
in mountinfo, which cgroup directory is its own.
Example (by mkerrisk):
First, a little script to save some typing and verbiage:
echo -e "\t/proc/self/cgroup:\t$(cat /proc/self/cgroup | grep freezer)"
cat /proc/self/mountinfo | grep freezer |
awk '{print "\tmountinfo:\t\t" $4 "\t" $5}'
Create cgroup, place this shell into the cgroup, and look at the state
of the /proc files:
2653
2653 # Our shell
14254 # cat(1)
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
Create a shell in new cgroup and mount namespaces. The act of creating
a new cgroup namespace causes the process's current cgroups directories
to become its cgroup root directories. (Here, I'm using my own version
of the "unshare" utility, which takes the same options as the util-linux
version):
Look at the state of the /proc files:
/proc/self/cgroup: 10:freezer:/
mountinfo: / /sys/fs/cgroup/freezer
The third entry in /proc/self/cgroup (the pathname of the cgroup inside
the hierarchy) is correctly virtualized w.r.t. the cgroup namespace, which
is rooted at /a/b in the outer namespace.
However, the info in /proc/self/mountinfo is not for this cgroup
namespace, since we are seeing a duplicate of the mount from the
old mount namespace, and the info there does not correspond to the
new cgroup namespace. However, trying to create a new mount still
doesn't show us the right information in mountinfo:
# propagating to other mountns
/proc/self/cgroup: 7:freezer:/
mountinfo: /a/b /mnt/freezer
The act of creating a new cgroup namespace caused the process's
current freezer directory, "/a/b", to become its cgroup freezer root
directory. In other words, the pathname directory of the directory
within the newly mounted cgroup filesystem should be "/",
but mountinfo wrongly shows us "/a/b". The consequence of this is
that the process in the cgroup namespace cannot correctly construct
the pathname of its cgroup root directory from the information in
/proc/PID/mountinfo.
With this patch, the dentry root field in mountinfo is shown relative
to the reader's cgroup namespace. So the same steps as above:
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: /../.. /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: / /mnt/freezer
cgroup.clone_children freezer.parent_freezing freezer.state tasks
cgroup.procs freezer.self_freezing notify_on_release
3164
2653 # First shell that placed in this cgroup
3164 # Shell started by 'unshare'
14197 # cat(1)
Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2016-05-09 14:59:55 +00:00
|
|
|
.show_path = kernfs_sop_show_path,
|
2013-11-28 19:54:44 +00:00
|
|
|
};
|
|
|
|
|
2019-11-04 23:54:30 +00:00
|
|
|
static int kernfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
|
|
|
|
struct inode *parent)
|
2017-07-12 18:49:51 +00:00
|
|
|
{
|
2019-11-04 23:54:30 +00:00
|
|
|
struct kernfs_node *kn = inode->i_private;
|
2017-07-12 18:49:51 +00:00
|
|
|
|
2019-11-04 23:54:30 +00:00
|
|
|
if (*max_len < 2) {
|
|
|
|
*max_len = 2;
|
|
|
|
return FILEID_INVALID;
|
|
|
|
}
|
2017-07-12 18:49:51 +00:00
|
|
|
|
2019-11-04 23:54:30 +00:00
|
|
|
*max_len = 2;
|
|
|
|
*(u64 *)fh = kn->id;
|
|
|
|
return FILEID_KERNFS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb,
|
|
|
|
struct fid *fid, int fh_len,
|
|
|
|
int fh_type, bool get_parent)
|
|
|
|
{
|
|
|
|
struct kernfs_super_info *info = kernfs_info(sb);
|
|
|
|
struct kernfs_node *kn;
|
|
|
|
struct inode *inode;
|
|
|
|
u64 id;
|
|
|
|
|
|
|
|
if (fh_len < 2)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
switch (fh_type) {
|
|
|
|
case FILEID_KERNFS:
|
|
|
|
id = *(u64 *)fid;
|
|
|
|
break;
|
|
|
|
case FILEID_INO32_GEN:
|
|
|
|
case FILEID_INO32_GEN_PARENT:
|
|
|
|
/*
|
2019-11-04 23:54:30 +00:00
|
|
|
* blk_log_action() exposes "LOW32,HIGH32" pair without
|
|
|
|
* type and userland can call us with generic fid
|
|
|
|
* constructed from them. Combine it back to ID. See
|
|
|
|
* blk_log_action().
|
2019-11-04 23:54:30 +00:00
|
|
|
*/
|
|
|
|
id = ((u64)fid->i32.gen << 32) | fid->i32.ino;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
kn = kernfs_find_and_get_node_by_id(info->root, id);
|
2017-07-12 18:49:51 +00:00
|
|
|
if (!kn)
|
|
|
|
return ERR_PTR(-ESTALE);
|
2019-11-04 23:54:30 +00:00
|
|
|
|
|
|
|
if (get_parent) {
|
|
|
|
struct kernfs_node *parent;
|
|
|
|
|
|
|
|
parent = kernfs_get_parent(kn);
|
|
|
|
kernfs_put(kn);
|
|
|
|
kn = parent;
|
|
|
|
if (!kn)
|
|
|
|
return ERR_PTR(-ESTALE);
|
|
|
|
}
|
|
|
|
|
2017-07-12 18:49:51 +00:00
|
|
|
inode = kernfs_get_inode(sb, kn);
|
|
|
|
kernfs_put(kn);
|
2019-11-04 23:54:30 +00:00
|
|
|
return d_obtain_alias(inode);
|
2017-07-12 18:49:51 +00:00
|
|
|
}
|
|
|
|
|
2019-11-04 23:54:30 +00:00
|
|
|
static struct dentry *kernfs_fh_to_dentry(struct super_block *sb,
|
|
|
|
struct fid *fid, int fh_len,
|
|
|
|
int fh_type)
|
2017-07-12 18:49:51 +00:00
|
|
|
{
|
2019-11-04 23:54:30 +00:00
|
|
|
return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, false);
|
2017-07-12 18:49:51 +00:00
|
|
|
}
|
|
|
|
|
2019-11-04 23:54:30 +00:00
|
|
|
static struct dentry *kernfs_fh_to_parent(struct super_block *sb,
|
|
|
|
struct fid *fid, int fh_len,
|
|
|
|
int fh_type)
|
2017-07-12 18:49:51 +00:00
|
|
|
{
|
2019-11-04 23:54:30 +00:00
|
|
|
return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, true);
|
2017-07-12 18:49:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct dentry *kernfs_get_parent_dentry(struct dentry *child)
|
|
|
|
{
|
|
|
|
struct kernfs_node *kn = kernfs_dentry_node(child);
|
|
|
|
|
|
|
|
return d_obtain_alias(kernfs_get_inode(child->d_sb, kn->parent));
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct export_operations kernfs_export_ops = {
|
2019-11-04 23:54:30 +00:00
|
|
|
.encode_fh = kernfs_encode_fh,
|
2017-07-12 18:49:51 +00:00
|
|
|
.fh_to_dentry = kernfs_fh_to_dentry,
|
|
|
|
.fh_to_parent = kernfs_fh_to_parent,
|
|
|
|
.get_parent = kernfs_get_parent_dentry,
|
|
|
|
};
|
|
|
|
|
2014-02-03 19:09:15 +00:00
|
|
|
/**
|
|
|
|
* kernfs_root_from_sb - determine kernfs_root associated with a super_block
|
|
|
|
* @sb: the super_block in question
|
|
|
|
*
|
2022-11-12 03:14:56 +00:00
|
|
|
* Return: the kernfs_root associated with @sb. If @sb is not a kernfs one,
|
2014-02-03 19:09:15 +00:00
|
|
|
* %NULL is returned.
|
|
|
|
*/
|
|
|
|
struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
|
|
|
|
{
|
|
|
|
if (sb->s_op == &kernfs_sops)
|
|
|
|
return kernfs_info(sb)->root;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2016-01-29 08:54:08 +00:00
|
|
|
/*
|
|
|
|
* find the next ancestor in the path down to @child, where @parent was the
|
|
|
|
* ancestor whose descendant we want to find.
|
|
|
|
*
|
2022-11-12 03:14:56 +00:00
|
|
|
* Say the path is /a/b/c/d. @child is d, @parent is %NULL. We return the root
|
2016-01-29 08:54:08 +00:00
|
|
|
* node. If @parent is b, then we return the node for c.
|
|
|
|
* Passing in d as @parent is not ok.
|
|
|
|
*/
|
|
|
|
static struct kernfs_node *find_next_ancestor(struct kernfs_node *child,
|
|
|
|
struct kernfs_node *parent)
|
|
|
|
{
|
|
|
|
if (child == parent) {
|
|
|
|
pr_crit_once("BUG in find_next_ancestor: called with parent == child");
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (child->parent != parent) {
|
|
|
|
if (!child->parent)
|
|
|
|
return NULL;
|
|
|
|
child = child->parent;
|
|
|
|
}
|
|
|
|
|
|
|
|
return child;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* kernfs_node_dentry - get a dentry for the given kernfs_node
|
|
|
|
* @kn: kernfs_node for which a dentry is needed
|
|
|
|
* @sb: the kernfs super_block
|
2022-11-12 03:14:56 +00:00
|
|
|
*
|
|
|
|
* Return: the dentry pointer
|
2016-01-29 08:54:08 +00:00
|
|
|
*/
|
|
|
|
struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
|
|
|
|
struct super_block *sb)
|
|
|
|
{
|
|
|
|
struct dentry *dentry;
|
2024-04-15 10:20:09 +00:00
|
|
|
struct kernfs_node *knparent;
|
2016-01-29 08:54:08 +00:00
|
|
|
|
|
|
|
BUG_ON(sb->s_op != &kernfs_sops);
|
|
|
|
|
|
|
|
dentry = dget(sb->s_root);
|
|
|
|
|
|
|
|
/* Check if this is the root kernfs_node */
|
|
|
|
if (!kn->parent)
|
|
|
|
return dentry;
|
|
|
|
|
|
|
|
knparent = find_next_ancestor(kn, NULL);
|
2019-01-06 16:41:29 +00:00
|
|
|
if (WARN_ON(!knparent)) {
|
|
|
|
dput(dentry);
|
2016-01-29 08:54:08 +00:00
|
|
|
return ERR_PTR(-EINVAL);
|
2019-01-06 16:41:29 +00:00
|
|
|
}
|
2016-01-29 08:54:08 +00:00
|
|
|
|
|
|
|
do {
|
|
|
|
struct dentry *dtmp;
|
|
|
|
struct kernfs_node *kntmp;
|
|
|
|
|
|
|
|
if (kn == knparent)
|
|
|
|
return dentry;
|
|
|
|
kntmp = find_next_ancestor(kn, knparent);
|
2019-01-06 16:41:29 +00:00
|
|
|
if (WARN_ON(!kntmp)) {
|
|
|
|
dput(dentry);
|
2016-01-29 08:54:08 +00:00
|
|
|
return ERR_PTR(-EINVAL);
|
2019-01-06 16:41:29 +00:00
|
|
|
}
|
2019-10-31 05:21:58 +00:00
|
|
|
dtmp = lookup_positive_unlocked(kntmp->name, dentry,
|
2016-04-11 12:42:55 +00:00
|
|
|
strlen(kntmp->name));
|
2016-01-29 08:54:08 +00:00
|
|
|
dput(dentry);
|
|
|
|
if (IS_ERR(dtmp))
|
|
|
|
return dtmp;
|
|
|
|
knparent = kntmp;
|
|
|
|
dentry = dtmp;
|
|
|
|
} while (true);
|
|
|
|
}
|
|
|
|
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc)
|
2013-11-28 19:54:44 +00:00
|
|
|
{
|
2013-12-11 19:11:55 +00:00
|
|
|
struct kernfs_super_info *info = kernfs_info(sb);
|
2021-11-18 23:00:08 +00:00
|
|
|
struct kernfs_root *kf_root = kfc->root;
|
2013-11-28 19:54:44 +00:00
|
|
|
struct inode *inode;
|
|
|
|
struct dentry *root;
|
|
|
|
|
2014-04-09 15:07:30 +00:00
|
|
|
info->sb = sb;
|
2016-06-09 20:34:02 +00:00
|
|
|
/* Userspace would break if executables or devices appear on sysfs */
|
|
|
|
sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
sb->s_blocksize = PAGE_SIZE;
|
|
|
|
sb->s_blocksize_bits = PAGE_SHIFT;
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
sb->s_magic = kfc->magic;
|
2013-12-11 19:11:57 +00:00
|
|
|
sb->s_op = &kernfs_sops;
|
2016-09-29 15:48:33 +00:00
|
|
|
sb->s_xattr = kernfs_xattr_handlers;
|
2017-07-12 18:49:51 +00:00
|
|
|
if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP)
|
|
|
|
sb->s_export_op = &kernfs_export_ops;
|
2013-11-28 19:54:44 +00:00
|
|
|
sb->s_time_gran = 1;
|
|
|
|
|
mm: zero-seek shrinkers
The page cache and most shrinkable slab caches hold data that has been
read from disk, but there are some caches that only cache CPU work, such
as the dentry and inode caches of procfs and sysfs, as well as the subset
of radix tree nodes that track non-resident page cache.
Currently, all these are shrunk at the same rate: using DEFAULT_SEEKS for
the shrinker's seeks setting tells the reclaim algorithm that for every
two page cache pages scanned it should scan one slab object.
This is a bogus setting. A virtual inode that required no IO to create is
not twice as valuable as a page cache page; shadow cache entries with
eviction distances beyond the size of memory aren't either.
In most cases, the behavior in practice is still fine. Such virtual
caches don't tend to grow and assert themselves aggressively, and usually
get picked up before they cause problems. But there are scenarios where
that's not true.
Our database workloads suffer from two of those. For one, their file
workingset is several times bigger than available memory, which has the
kernel aggressively create shadow page cache entries for the non-resident
parts of it. The workingset code does tell the VM that most of these are
expendable, but the VM ends up balancing them 2:1 to cache pages as per
the seeks setting. This is a huge waste of memory.
These workloads also deal with tens of thousands of open files and use
/proc for introspection, which ends up growing the proc_inode_cache to
absurdly large sizes - again at the cost of valuable cache space, which
isn't a reasonable trade-off, given that proc inodes can be re-created
without involving the disk.
This patch implements a "zero-seek" setting for shrinkers that results in
a target ratio of 0:1 between their objects and IO-backed caches. This
allows such virtual caches to grow when memory is available (they do
cache/avoid CPU work after all), but effectively disables them as soon as
IO-backed objects are under pressure.
It then switches the shrinkers for procfs and sysfs metadata, as well as
excess page cache shadow nodes, to the new zero-seek setting.
Link: http://lkml.kernel.org/r/20181009184732.762-5-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Domas Mituzas <dmituzas@fb.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Rik van Riel <riel@surriel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-10-26 22:06:42 +00:00
|
|
|
/* sysfs dentries and inodes don't require IO to create */
|
2023-09-11 09:44:37 +00:00
|
|
|
sb->s_shrink->seeks = 0;
|
mm: zero-seek shrinkers
The page cache and most shrinkable slab caches hold data that has been
read from disk, but there are some caches that only cache CPU work, such
as the dentry and inode caches of procfs and sysfs, as well as the subset
of radix tree nodes that track non-resident page cache.
Currently, all these are shrunk at the same rate: using DEFAULT_SEEKS for
the shrinker's seeks setting tells the reclaim algorithm that for every
two page cache pages scanned it should scan one slab object.
This is a bogus setting. A virtual inode that required no IO to create is
not twice as valuable as a page cache page; shadow cache entries with
eviction distances beyond the size of memory aren't either.
In most cases, the behavior in practice is still fine. Such virtual
caches don't tend to grow and assert themselves aggressively, and usually
get picked up before they cause problems. But there are scenarios where
that's not true.
Our database workloads suffer from two of those. For one, their file
workingset is several times bigger than available memory, which has the
kernel aggressively create shadow page cache entries for the non-resident
parts of it. The workingset code does tell the VM that most of these are
expendable, but the VM ends up balancing them 2:1 to cache pages as per
the seeks setting. This is a huge waste of memory.
These workloads also deal with tens of thousands of open files and use
/proc for introspection, which ends up growing the proc_inode_cache to
absurdly large sizes - again at the cost of valuable cache space, which
isn't a reasonable trade-off, given that proc inodes can be re-created
without involving the disk.
This patch implements a "zero-seek" setting for shrinkers that results in
a target ratio of 0:1 between their objects and IO-backed caches. This
allows such virtual caches to grow when memory is available (they do
cache/avoid CPU work after all), but effectively disables them as soon as
IO-backed objects are under pressure.
It then switches the shrinkers for procfs and sysfs metadata, as well as
excess page cache shadow nodes, to the new zero-seek setting.
Link: http://lkml.kernel.org/r/20181009184732.762-5-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Domas Mituzas <dmituzas@fb.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Rik van Riel <riel@surriel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-10-26 22:06:42 +00:00
|
|
|
|
2013-11-28 19:54:44 +00:00
|
|
|
/* get root inode, initialize and unlock it */
|
2021-11-18 23:00:08 +00:00
|
|
|
down_read(&kf_root->kernfs_rwsem);
|
2013-12-11 19:11:58 +00:00
|
|
|
inode = kernfs_get_inode(sb, info->root->kn);
|
2021-11-18 23:00:08 +00:00
|
|
|
up_read(&kf_root->kernfs_rwsem);
|
2013-11-28 19:54:44 +00:00
|
|
|
if (!inode) {
|
2013-12-11 19:11:58 +00:00
|
|
|
pr_debug("kernfs: could not get root inode\n");
|
2013-11-28 19:54:44 +00:00
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* instantiate and link root dentry */
|
|
|
|
root = d_make_root(inode);
|
|
|
|
if (!root) {
|
|
|
|
pr_debug("%s: could not get root dentry!\n", __func__);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
sb->s_root = root;
|
2013-12-11 19:11:57 +00:00
|
|
|
sb->s_d_op = &kernfs_dops;
|
2013-11-28 19:54:44 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
static int kernfs_test_super(struct super_block *sb, struct fs_context *fc)
|
2013-11-28 19:54:44 +00:00
|
|
|
{
|
2013-12-11 19:11:55 +00:00
|
|
|
struct kernfs_super_info *sb_info = kernfs_info(sb);
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
struct kernfs_super_info *info = fc->s_fs_info;
|
2013-11-28 19:54:44 +00:00
|
|
|
|
|
|
|
return sb_info->root == info->root && sb_info->ns == info->ns;
|
|
|
|
}
|
|
|
|
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
static int kernfs_set_super(struct super_block *sb, struct fs_context *fc)
|
2013-11-28 19:54:44 +00:00
|
|
|
{
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
struct kernfs_fs_context *kfc = fc->fs_private;
|
|
|
|
|
|
|
|
kfc->ns_tag = NULL;
|
|
|
|
return set_anon_super_fc(sb, fc);
|
2013-11-28 19:54:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* kernfs_super_ns - determine the namespace tag of a kernfs super_block
|
|
|
|
* @sb: super_block of interest
|
|
|
|
*
|
2022-11-12 03:14:56 +00:00
|
|
|
* Return: the namespace tag associated with kernfs super_block @sb.
|
2013-11-28 19:54:44 +00:00
|
|
|
*/
|
|
|
|
const void *kernfs_super_ns(struct super_block *sb)
|
|
|
|
{
|
2013-12-11 19:11:55 +00:00
|
|
|
struct kernfs_super_info *info = kernfs_info(sb);
|
2013-11-28 19:54:44 +00:00
|
|
|
|
|
|
|
return info->ns;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
* kernfs_get_tree - kernfs filesystem access/retrieval helper
|
|
|
|
* @fc: The filesystem context.
|
2013-11-28 19:54:44 +00:00
|
|
|
*
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
* This is to be called from each kernfs user's fs_context->ops->get_tree()
|
|
|
|
* implementation, which should set the specified ->@fs_type and ->@flags, and
|
|
|
|
* specify the hierarchy and namespace tag to mount via ->@root and ->@ns,
|
|
|
|
* respectively.
|
2022-11-12 03:14:56 +00:00
|
|
|
*
|
|
|
|
* Return: %0 on success, -errno on failure.
|
2013-11-28 19:54:44 +00:00
|
|
|
*/
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
int kernfs_get_tree(struct fs_context *fc)
|
2013-11-28 19:54:44 +00:00
|
|
|
{
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
struct kernfs_fs_context *kfc = fc->fs_private;
|
2013-11-28 19:54:44 +00:00
|
|
|
struct super_block *sb;
|
2013-12-11 19:11:55 +00:00
|
|
|
struct kernfs_super_info *info;
|
2013-11-28 19:54:44 +00:00
|
|
|
int error;
|
|
|
|
|
|
|
|
info = kzalloc(sizeof(*info), GFP_KERNEL);
|
|
|
|
if (!info)
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
return -ENOMEM;
|
2013-11-28 19:54:44 +00:00
|
|
|
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
info->root = kfc->root;
|
|
|
|
info->ns = kfc->ns_tag;
|
2018-04-03 04:22:29 +00:00
|
|
|
INIT_LIST_HEAD(&info->node);
|
2013-11-28 19:54:44 +00:00
|
|
|
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
fc->s_fs_info = info;
|
|
|
|
sb = sget_fc(fc, kernfs_test_super, kernfs_set_super);
|
2013-11-28 19:54:44 +00:00
|
|
|
if (IS_ERR(sb))
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
return PTR_ERR(sb);
|
2014-02-25 11:28:44 +00:00
|
|
|
|
2013-11-28 19:54:44 +00:00
|
|
|
if (!sb->s_root) {
|
2014-04-09 15:07:30 +00:00
|
|
|
struct kernfs_super_info *info = kernfs_info(sb);
|
2021-11-18 23:00:08 +00:00
|
|
|
struct kernfs_root *root = kfc->root;
|
2014-04-09 15:07:30 +00:00
|
|
|
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
kfc->new_sb_created = true;
|
|
|
|
|
|
|
|
error = kernfs_fill_super(sb, kfc);
|
2013-11-28 19:54:44 +00:00
|
|
|
if (error) {
|
|
|
|
deactivate_locked_super(sb);
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
return error;
|
2013-11-28 19:54:44 +00:00
|
|
|
}
|
2017-11-27 21:05:09 +00:00
|
|
|
sb->s_flags |= SB_ACTIVE;
|
2014-04-09 15:07:30 +00:00
|
|
|
|
2024-02-07 02:56:15 +00:00
|
|
|
uuid_t uuid;
|
|
|
|
uuid_gen(&uuid);
|
|
|
|
super_set_uuid(sb, uuid.b, sizeof(uuid));
|
2023-07-31 18:47:31 +00:00
|
|
|
|
2023-03-09 11:09:31 +00:00
|
|
|
down_write(&root->kernfs_supers_rwsem);
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
list_add(&info->node, &info->root->supers);
|
2023-03-09 11:09:31 +00:00
|
|
|
up_write(&root->kernfs_supers_rwsem);
|
2013-11-28 19:54:44 +00:00
|
|
|
}
|
|
|
|
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
fc->root = dget(sb->s_root);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void kernfs_free_fs_context(struct fs_context *fc)
|
|
|
|
{
|
|
|
|
/* Note that we don't deal with kfc->ns_tag here. */
|
|
|
|
kfree(fc->s_fs_info);
|
|
|
|
fc->s_fs_info = NULL;
|
2013-11-28 19:54:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* kernfs_kill_sb - kill_sb for kernfs
|
|
|
|
* @sb: super_block being killed
|
|
|
|
*
|
|
|
|
* This can be used directly for file_system_type->kill_sb(). If a kernfs
|
|
|
|
* user needs extra cleanup, it can implement its own kill_sb() and call
|
|
|
|
* this function at the end.
|
|
|
|
*/
|
|
|
|
void kernfs_kill_sb(struct super_block *sb)
|
|
|
|
{
|
2013-12-11 19:11:55 +00:00
|
|
|
struct kernfs_super_info *info = kernfs_info(sb);
|
2021-11-18 23:00:08 +00:00
|
|
|
struct kernfs_root *root = info->root;
|
2013-11-28 19:54:44 +00:00
|
|
|
|
2023-03-09 11:09:31 +00:00
|
|
|
down_write(&root->kernfs_supers_rwsem);
|
2014-04-09 15:07:30 +00:00
|
|
|
list_del(&info->node);
|
2023-03-09 11:09:31 +00:00
|
|
|
up_write(&root->kernfs_supers_rwsem);
|
2014-04-09 15:07:30 +00:00
|
|
|
|
2013-11-28 19:54:44 +00:00
|
|
|
/*
|
|
|
|
* Remove the superblock from fs_supers/s_instances
|
2013-12-11 19:11:55 +00:00
|
|
|
* so we can't find it, before freeing kernfs_super_info.
|
2013-11-28 19:54:44 +00:00
|
|
|
*/
|
|
|
|
kill_anon_super(sb);
|
|
|
|
kfree(info);
|
|
|
|
}
|
|
|
|
|
2022-06-15 02:10:59 +00:00
|
|
|
static void __init kernfs_mutex_init(void)
|
|
|
|
{
|
|
|
|
int count;
|
|
|
|
|
|
|
|
for (count = 0; count < NR_KERNFS_LOCKS; count++)
|
|
|
|
mutex_init(&kernfs_locks->open_file_mutex[count]);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __init kernfs_lock_init(void)
|
|
|
|
{
|
|
|
|
kernfs_locks = kmalloc(sizeof(struct kernfs_global_locks), GFP_KERNEL);
|
|
|
|
WARN_ON(!kernfs_locks);
|
|
|
|
|
|
|
|
kernfs_mutex_init();
|
|
|
|
}
|
|
|
|
|
2013-11-28 19:54:44 +00:00
|
|
|
void __init kernfs_init(void)
|
|
|
|
{
|
2013-12-11 19:11:57 +00:00
|
|
|
kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
|
2013-12-11 19:11:53 +00:00
|
|
|
sizeof(struct kernfs_node),
|
2019-11-04 23:54:29 +00:00
|
|
|
0, SLAB_PANIC, NULL);
|
2019-02-06 04:55:42 +00:00
|
|
|
|
|
|
|
/* Creates slab cache for kernfs inode attributes */
|
|
|
|
kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache",
|
|
|
|
sizeof(struct kernfs_iattrs),
|
|
|
|
0, SLAB_PANIC, NULL);
|
2022-06-15 02:10:59 +00:00
|
|
|
|
|
|
|
kernfs_lock_init();
|
2013-11-28 19:54:44 +00:00
|
|
|
}
|