vfs-6.12.mount

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZuQEmwAKCRCRxhvAZXjc
 otRsAQCUdlBS/ky2JiYn3ePURKYVBgRq/+PnmhRrBNDuv+ToZwD+NRLNlOM8FzQy
 c8BMSq0rkwO2C5Aax3kGxgTPMEuuCwc=
 =QLvm
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.12.mount' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs mount updates from Christian Brauner:
 "Recently, we added the ability to list mounts in other mount
  namespaces and the ability to retrieve namespace file descriptors
  without having to go through procfs by deriving them from pidfds.

  This extends nsfs in two ways:

   (1) Add the ability to retrieve information about a mount namespace
       via NS_MNT_GET_INFO.

       This will return the mount namespace id and the number of mounts
       currently in the mount namespace. The number of mounts can be
       used to size the buffer that needs to be used for listmount() and
       is in general useful without having to actually iterate through
       all the mounts.

      The structure is extensible.

   (2) Add the ability to iterate through all mount namespaces over
       which the caller holds privilege returning the file descriptor
       for the next or previous mount namespace.

       To retrieve a mount namespace the caller must be privileged wrt
       its owning user namespace. This means that PID 1 on the host
       can list all mounts in all mount namespaces or that a container
       can list all mounts of its nested containers.

       Optionally pass a structure for NS_MNT_GET_INFO with
       NS_MNT_GET_{PREV,NEXT} to retrieve information about the mount
       namespace in one go.

  (1) and (2) can be implemented for other namespace types easily.

  Together with recent api additions this means one can iterate through
  all mounts in all mount namespaces without ever touching procfs.

  The commit message in 49224a345c ('Merge patch series "nsfs: iterate
  through mount namespaces"') contains example code showing how to do this"

* tag 'vfs-6.12.mount' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  nsfs: iterate through mount namespaces
  file: add fput() cleanup helper
  fs: add put_mnt_ns() cleanup helper
  fs: allow mount namespace fd
This commit is contained in:
Linus Torvalds 2024-09-16 11:15:26 +02:00
commit 9020d0d844
6 changed files with 198 additions and 14 deletions

View File

@ -154,3 +154,16 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
}
bool has_locked_children(struct mount *mnt, struct dentry *dentry);
struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous);
/*
 * Forward-direction wrapper: calls __lookup_next_mnt_ns() with
 * previous == false and returns its result unchanged.
 */
static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns)
{
	const bool previous = false;

	return __lookup_next_mnt_ns(mntns, previous);
}
/*
 * Backward-direction wrapper: calls __lookup_next_mnt_ns() with
 * previous == true and returns its result unchanged.
 */
static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns)
{
	const bool previous = true;

	return __lookup_next_mnt_ns(mntns, previous);
}
/*
 * Map a struct ns_common pointer to the struct mnt_namespace that embeds it
 * as its 'ns' member.
 */
static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
	struct mnt_namespace *mnt_ns = container_of(ns, struct mnt_namespace, ns);

	return mnt_ns;
}

View File

@ -2060,16 +2060,43 @@ static bool is_mnt_ns_file(struct dentry *dentry)
dentry->d_fsdata == &mntns_operations;
}
/* Recover the mount namespace whose embedded 'ns' member is @ns. */
static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
	struct mnt_namespace *owner;

	owner = container_of(ns, struct mnt_namespace, ns);
	return owner;
}
/* Return a pointer to the struct ns_common embedded in @mnt as its 'ns' member. */
struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
{
	struct ns_common *common = &mnt->ns;

	return common;
}
/*
 * __lookup_next_mnt_ns - find a neighbouring mount namespace the caller may use
 * @mntns:    mount namespace to start from (never returned itself)
 * @previous: step with rb_prev() when true, with rb_next() when false
 *
 * Walks the tree linked through ->mnt_ns_tree_node while holding
 * mnt_ns_tree_lock for reading. A candidate is skipped when the caller lacks
 * CAP_SYS_ADMIN in the candidate's ->user_ns, or when its ->ns.count has
 * already dropped to zero.
 *
 * Return: the first acceptable mount namespace with ->ns.count incremented,
 * or ERR_PTR(-ENOENT) once there is no further node in the chosen direction.
 */
struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
{
	guard(read_lock)(&mnt_ns_tree_lock);

	for (;;) {
		struct rb_node *node;

		if (previous)
			node = rb_prev(&mntns->mnt_ns_tree_node);
		else
			node = rb_next(&mntns->mnt_ns_tree_node);
		if (!node)
			return ERR_PTR(-ENOENT);

		/*
		 * Advance the cursor. The next iteration derives its node from
		 * @mntns again, so nothing else needs to be remembered here.
		 */
		mntns = node_to_mnt_ns(node);

		if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
			continue;

		/*
		 * Holding mnt_ns_tree_lock prevents the mount namespace from
		 * being freed but it may well be on its deathbed. We want an
		 * active reference, not just a passive one here as we're
		 * persisting the mount namespace.
		 */
		if (!refcount_inc_not_zero(&mntns->ns.count))
			continue;

		return mntns;
	}
}
static bool mnt_ns_loop(struct dentry *dentry)
{
/* Could bind mounting the mount namespace inode cause a
@ -5251,12 +5278,37 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
* that, or if not simply grab a passive reference on our mount namespace and
* return that.
*/
/*
 * NOTE(review): the kreq-based body below reports failures as ERR_PTR()
 * values (-EINVAL, -EBADF), so callers need an IS_ERR() check in addition to
 * any NULL check - confirm every call site handles this.
 */
static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id)
static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq)
{
if (mnt_ns_id)
return lookup_mnt_ns(mnt_ns_id);
refcount_inc(&current->nsproxy->mnt_ns->passive);
return current->nsproxy->mnt_ns;
struct mnt_namespace *mnt_ns;
/* A namespace id and a value in ->spare must not both be supplied. */
if (kreq->mnt_ns_id && kreq->spare)
return ERR_PTR(-EINVAL);
/* An explicit id wins: return whatever lookup_mnt_ns() yields for it. */
if (kreq->mnt_ns_id)
return lookup_mnt_ns(kreq->mnt_ns_id);
if (kreq->spare) {
struct ns_common *ns;
/* ->spare is interpreted as a file descriptor here. */
CLASS(fd, f)(kreq->spare);
if (!f.file)
return ERR_PTR(-EBADF);
/* Reject files that fail proc_ns_file() or whose namespace type is not CLONE_NEWNS. */
if (!proc_ns_file(f.file))
return ERR_PTR(-EINVAL);
ns = get_proc_ns(file_inode(f.file));
if (ns->ops->type != CLONE_NEWNS)
return ERR_PTR(-EINVAL);
mnt_ns = to_mnt_ns(ns);
/*
 * NOTE(review): presumably the reference held through 'f' is released when
 * this scope ends, i.e. before the passive refcount is taken below - confirm
 * the namespace cannot go away in that window.
 */
} else {
/* Neither id nor fd given: use the calling task's own mount namespace. */
mnt_ns = current->nsproxy->mnt_ns;
}
/* The fd path and the fallback path both take a passive reference before returning. */
refcount_inc(&mnt_ns->passive);
return mnt_ns;
}
SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
@ -5277,7 +5329,7 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
if (ret)
return ret;
ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
ns = grab_requested_mnt_ns(&kreq);
if (!ns)
return -ENOENT;
@ -5404,7 +5456,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
if (!kmnt_ids)
return -ENOMEM;
ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
ns = grab_requested_mnt_ns(&kreq);
if (!ns)
return -ENOENT;

102
fs/nsfs.c
View File

@ -12,6 +12,7 @@
#include <linux/user_namespace.h>
#include <linux/nsfs.h>
#include <linux/uaccess.h>
#include <linux/mnt_namespace.h>
#include "mount.h"
#include "internal.h"
@ -128,6 +129,30 @@ int open_related_ns(struct ns_common *ns,
}
EXPORT_SYMBOL_GPL(open_related_ns);
/*
 * Fill @kinfo from @mnt_ns and copy it out to @uinfo.
 *
 * The number of bytes copied is min(@usize, sizeof(*@kinfo)), and that same
 * value is reported back to userspace in ->size:
 *  - identical struct size on both sides: the whole struct is copied;
 *  - older (smaller) userspace struct: only the part userspace knows about;
 *  - newer (larger) userspace struct: only the part the kernel knows about.
 *
 * Return: 0 on success, -EFAULT if copying to userspace fails.
 */
static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns,
				struct mnt_ns_info __user *uinfo, size_t usize,
				struct mnt_ns_info *kinfo)
{
	__u32 mounts = READ_ONCE(mnt_ns->nr_mounts);

	/* The root mount of the mount namespace is not reported. */
	kinfo->nr_mounts = mounts ? mounts - 1 : 0;
	kinfo->mnt_ns_id = mnt_ns->seq;
	kinfo->size = min(usize, sizeof(*kinfo));

	return copy_to_user(uinfo, kinfo, kinfo->size) ? -EFAULT : 0;
}
static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg)
{
@ -135,6 +160,8 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
struct pid_namespace *pid_ns;
struct task_struct *tsk;
struct ns_common *ns = get_proc_ns(file_inode(filp));
struct mnt_namespace *mnt_ns;
bool previous = false;
uid_t __user *argp;
uid_t uid;
int ret;
@ -156,7 +183,6 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
uid = from_kuid_munged(current_user_ns(), user_ns->owner);
return put_user(uid, argp);
case NS_GET_MNTNS_ID: {
struct mnt_namespace *mnt_ns;
__u64 __user *idp;
__u64 id;
@ -211,7 +237,79 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
if (!ret)
ret = -ESRCH;
break;
return ret;
}
}
/* extensible ioctls */
switch (_IOC_NR(ioctl)) {
case _IOC_NR(NS_MNT_GET_INFO): {
struct mnt_ns_info kinfo = {};
struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
size_t usize = _IOC_SIZE(ioctl);
if (ns->ops->type != CLONE_NEWNS)
return -EINVAL;
if (!uinfo)
return -EINVAL;
if (usize < MNT_NS_INFO_SIZE_VER0)
return -EINVAL;
return copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo);
}
case _IOC_NR(NS_MNT_GET_PREV):
previous = true;
fallthrough;
case _IOC_NR(NS_MNT_GET_NEXT): {
struct mnt_ns_info kinfo = {};
struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
struct path path __free(path_put) = {};
struct file *f __free(fput) = NULL;
size_t usize = _IOC_SIZE(ioctl);
if (ns->ops->type != CLONE_NEWNS)
return -EINVAL;
if (usize < MNT_NS_INFO_SIZE_VER0)
return -EINVAL;
if (previous)
mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns));
else
mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns));
if (IS_ERR(mnt_ns))
return PTR_ERR(mnt_ns);
ns = to_ns_common(mnt_ns);
/* Transfer ownership of @mnt_ns reference to @path. */
ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
if (ret)
return ret;
CLASS(get_unused_fd, fd)(O_CLOEXEC);
if (fd < 0)
return fd;
f = dentry_open(&path, O_RDONLY, current_cred());
if (IS_ERR(f))
return PTR_ERR(f);
if (uinfo) {
/*
* If @uinfo is passed return all information about the
* mount namespace as well.
*/
ret = copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo);
if (ret)
return ret;
}
/* Transfer reference of @f to caller's fdtable. */
fd_install(fd, no_free_ptr(f));
/* File descriptor is live so hand it off to the caller. */
return take_fd(fd);
}
default:
ret = -ENOTTY;

View File

@ -11,6 +11,7 @@
#include <linux/posix_types.h>
#include <linux/errno.h>
#include <linux/cleanup.h>
#include <linux/err.h>
struct file;
@ -96,6 +97,7 @@ extern void put_unused_fd(unsigned int fd);
/*
 * Scope-based class for an fd number: initialised from
 * get_unused_fd_flags(flags); the cleanup expression calls put_unused_fd()
 * on it if the value is still >= 0.
 */
DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T),
get_unused_fd_flags(flags), unsigned flags)
/*
 * Cleanup helper for 'struct file *' variables: calls fput() on the pointer
 * unless it is NULL or an error pointer.
 */
DEFINE_FREE(fput, struct file *, if (!IS_ERR_OR_NULL(_T)) fput(_T))
/*
* take_fd() will take care to set @fd to -EBADF ensuring that

View File

@ -3,6 +3,9 @@
#define _NAMESPACE_H_
#ifdef __KERNEL__
#include <linux/cleanup.h>
#include <linux/err.h>
struct mnt_namespace;
struct fs_struct;
struct user_namespace;
@ -11,6 +14,7 @@ struct ns_common;
extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
struct user_namespace *, struct fs_struct *);
extern void put_mnt_ns(struct mnt_namespace *ns);
/*
 * Cleanup helper for 'struct mnt_namespace *' variables: calls put_mnt_ns()
 * on the pointer unless it is NULL or an error pointer.
 */
DEFINE_FREE(put_mnt_ns, struct mnt_namespace *, if (!IS_ERR_OR_NULL(_T)) put_mnt_ns(_T))
extern struct ns_common *from_mnt_ns(struct mnt_namespace *);
extern const struct file_operations proc_mounts_operations;

View File

@ -27,4 +27,19 @@
/* Return thread-group leader id of pid in the target pid namespace. */
#define NS_GET_TGID_IN_PIDNS _IOR(NSIO, 0x9, int)
/*
 * Information about a mount namespace, used as the argument type of
 * NS_MNT_GET_INFO, NS_MNT_GET_NEXT and NS_MNT_GET_PREV.
 */
struct mnt_ns_info {
/* Number of bytes of this structure that the kernel filled in. */
__u32 size;
/* Number of mounts in the mount namespace, not counting its root mount. */
__u32 nr_mounts;
/* Id of the mount namespace. */
__u64 mnt_ns_id;
};
#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */
/* Get information about namespace. */
#define NS_MNT_GET_INFO _IOR(NSIO, 10, struct mnt_ns_info)
/* Get next namespace. */
#define NS_MNT_GET_NEXT _IOR(NSIO, 11, struct mnt_ns_info)
/* Get previous namespace. */
#define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info)
#endif /* __LINUX_NSFS_H */