forked from Minki/linux
0cbee99269
Pull user namespace updates from Eric Biederman: "Long ago and far away when user namespaces were young it was realized that allowing fresh mounts of proc and sysfs with only user namespace permissions could violate the basic rule that only root gets to decide if proc or sysfs should be mounted at all. Some hacks were put in place to reduce the worst of the damage that could be done, and the common sense rule was adopted that fresh mounts of proc and sysfs should allow no more than bind mounts of proc and sysfs. Unfortunately that rule has not been fully enforced. There are two kinds of gaps in that enforcement. Only filesystems mounted on empty directories of proc and sysfs should be ignored but the test for empty directories was insufficient. So in my tree directories on proc, sysctl and sysfs that will always be empty are created specially. Every other technique is imperfect as an ordinary directory can have entries added even after a readdir returns and shows that the directory is empty. Special creation of directories for mount points makes the code in the kernel a smidge clearer about its purpose. I asked container developers from the various container projects to help test this and no holes were found in the set of mount points on proc and sysfs that are created specially. This set of changes also starts enforcing that the mount flags of fresh mounts of proc and sysfs are consistent with the existing mount of proc and sysfs. I expected this to be the boring part of the work but unfortunately unprivileged userspace winds up mounting fresh copies of proc and sysfs with noexec and nosuid clear when root set those flags on the previous mount of proc and sysfs. So for now only the atime, read-only and nodev attributes which userspace happens to keep consistent are enforced. Dealing with the noexec and nosuid attributes remains for another time. This set of changes also addresses an issue with how open file descriptors from /proc/<pid>/ns/* are displayed.
Recently readlink of /proc/<pid>/fd has been triggering a WARN_ON that has not been meaningful since it was added (as all of the code in the kernel was converted) and is now actively wrong. There is also a short list of issues that have not been fixed yet that I will mention briefly. It is possible to rename a directory from below to above a bind mount. At which point any directory pointers below the renamed directory can be walked up to the root directory of the filesystem. With user namespaces enabled a bind mount of the bind mount can be created allowing the user to pick a directory whose children they can rename to outside of the bind mount. This is challenging to fix and doubly so because all obvious solutions must touch code that is in the performance part of pathname resolution. As mentioned above there is also a question of how to ensure that developers by accident or with purpose do not introduce executable files on sysfs and proc and in doing so introduce security regressions in the current userspace that will not be immediately obvious and as such are likely to require breaking userspace in painful ways once they are recognized" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: vfs: Remove incorrect debugging WARN in prepend_path mnt: Update fs_fully_visible to test for permanently empty directories sysfs: Create mountpoints with sysfs_create_mount_point sysfs: Add support for permanently empty directories to serve as mount points. kernfs: Add support for always empty directories. proc: Allow creating permanently empty directories that serve as mount points sysctl: Allow creating permanently empty directories that serve as mountpoints. fs: Add helper functions for permanently empty directories. vfs: Ignore unlocked mounts in fs_fully_visible mnt: Modify fs_fully_visible to deal with locked ro nodev and atime mnt: Refactor the logic for mounting sysfs and proc in a user namespace
176 lines
4.6 KiB
C
176 lines
4.6 KiB
C
/* -*- mode: c; c-basic-offset: 8; -*-
|
|
* vim: noexpandtab sw=8 ts=8 sts=0:
|
|
*
|
|
* mount.c - operations for initializing and mounting configfs.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2 of the License, or (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public
|
|
* License along with this program; if not, write to the
|
|
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
* Boston, MA 02111-1307, USA.
|
|
*
|
|
* Based on sysfs:
|
|
* sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
|
|
*
|
|
* configfs Copyright (C) 2005 Oracle. All rights reserved.
|
|
*/
|
|
|
|
#include <linux/fs.h>
|
|
#include <linux/module.h>
|
|
#include <linux/mount.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/init.h>
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/configfs.h>
|
|
#include "configfs_internal.h"
|
|
|
|
/*
 * Magic number stored in sb->s_magic by configfs_fill_super().
 * The value is arbitrary; its four bytes spell "beep" in ASCII.
 */
#define CONFIGFS_MAGIC 0x62656570
|
|
|
|
/*
 * Mount and pin count handed to simple_pin_fs()/simple_release_fs() by
 * configfs_pin_fs() and configfs_release_fs().  Objects with static
 * storage duration start out zeroed, so no explicit initializer is needed.
 */
static struct vfsmount *configfs_mount;
static int configfs_mnt_count;

/*
 * Slab cache sized for struct configfs_dirent; created in configfs_init()
 * and destroyed in configfs_exit().
 */
struct kmem_cache *configfs_dir_cachep;
|
|
|
|
static const struct super_operations configfs_ops = {
|
|
.statfs = simple_statfs,
|
|
.drop_inode = generic_delete_inode,
|
|
};
|
|
|
|
static struct config_group configfs_root_group = {
|
|
.cg_item = {
|
|
.ci_namebuf = "root",
|
|
.ci_name = configfs_root_group.cg_item.ci_namebuf,
|
|
},
|
|
};
|
|
|
|
int configfs_is_root(struct config_item *item)
|
|
{
|
|
return item == &configfs_root_group.cg_item;
|
|
}
|
|
|
|
static struct configfs_dirent configfs_root = {
|
|
.s_sibling = LIST_HEAD_INIT(configfs_root.s_sibling),
|
|
.s_children = LIST_HEAD_INIT(configfs_root.s_children),
|
|
.s_element = &configfs_root_group.cg_item,
|
|
.s_type = CONFIGFS_ROOT,
|
|
.s_iattr = NULL,
|
|
};
|
|
|
|
static int configfs_fill_super(struct super_block *sb, void *data, int silent)
|
|
{
|
|
struct inode *inode;
|
|
struct dentry *root;
|
|
|
|
sb->s_blocksize = PAGE_CACHE_SIZE;
|
|
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
|
|
sb->s_magic = CONFIGFS_MAGIC;
|
|
sb->s_op = &configfs_ops;
|
|
sb->s_time_gran = 1;
|
|
|
|
inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
|
|
&configfs_root, sb);
|
|
if (inode) {
|
|
inode->i_op = &configfs_root_inode_operations;
|
|
inode->i_fop = &configfs_dir_operations;
|
|
/* directory inodes start off with i_nlink == 2 (for "." entry) */
|
|
inc_nlink(inode);
|
|
} else {
|
|
pr_debug("could not get root inode\n");
|
|
return -ENOMEM;
|
|
}
|
|
|
|
root = d_make_root(inode);
|
|
if (!root) {
|
|
pr_debug("%s: could not get root dentry!\n",__func__);
|
|
return -ENOMEM;
|
|
}
|
|
config_group_init(&configfs_root_group);
|
|
configfs_root_group.cg_item.ci_dentry = root;
|
|
root->d_fsdata = &configfs_root;
|
|
sb->s_root = root;
|
|
sb->s_d_op = &configfs_dentry_ops; /* the rest get that */
|
|
return 0;
|
|
}
|
|
|
|
static struct dentry *configfs_do_mount(struct file_system_type *fs_type,
|
|
int flags, const char *dev_name, void *data)
|
|
{
|
|
return mount_single(fs_type, flags, data, configfs_fill_super);
|
|
}
|
|
|
|
static struct file_system_type configfs_fs_type = {
|
|
.owner = THIS_MODULE,
|
|
.name = "configfs",
|
|
.mount = configfs_do_mount,
|
|
.kill_sb = kill_litter_super,
|
|
};
|
|
MODULE_ALIAS_FS("configfs");
|
|
|
|
struct dentry *configfs_pin_fs(void)
|
|
{
|
|
int err = simple_pin_fs(&configfs_fs_type, &configfs_mount,
|
|
&configfs_mnt_count);
|
|
return err ? ERR_PTR(err) : configfs_mount->mnt_root;
|
|
}
|
|
|
|
void configfs_release_fs(void)
|
|
{
|
|
simple_release_fs(&configfs_mount, &configfs_mnt_count);
|
|
}
|
|
|
|
|
|
/*
 * configfs_init - set up configfs at boot/module load.
 *
 * Creates the dirent slab cache, creates a "config" mount point under
 * kernel_kobj, and registers the filesystem type.  Any step that fails
 * unwinds the steps already completed.
 *
 * Returns 0 on success, -ENOMEM if the cache cannot be created, or the
 * error from sysfs_create_mount_point()/register_filesystem().
 */
static int __init configfs_init(void)
{
	int err;

	configfs_dir_cachep = kmem_cache_create("configfs_dir_cache",
						sizeof(struct configfs_dirent),
						0, 0, NULL);
	if (!configfs_dir_cachep)
		return -ENOMEM;

	err = sysfs_create_mount_point(kernel_kobj, "config");
	if (err)
		goto out_destroy_cache;

	err = register_filesystem(&configfs_fs_type);
	if (err)
		goto out_remove_mount_point;

	return 0;

out_remove_mount_point:
	pr_err("Unable to register filesystem!\n");
	sysfs_remove_mount_point(kernel_kobj, "config");
out_destroy_cache:
	kmem_cache_destroy(configfs_dir_cachep);
	configfs_dir_cachep = NULL;
	return err;
}
|
|
|
|
/*
 * configfs_exit - undo configfs_init() in reverse order.
 *
 * Unregisters the filesystem type, removes the "config" mount point from
 * under kernel_kobj, then destroys the dirent cache and clears its pointer.
 */
static void __exit configfs_exit(void)
{
	unregister_filesystem(&configfs_fs_type);
	sysfs_remove_mount_point(kernel_kobj, "config");

	kmem_cache_destroy(configfs_dir_cachep);
	configfs_dir_cachep = NULL;
}
|
|
|
|
/* Module metadata. */
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
MODULE_VERSION("0.0.2");
MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration.");

/*
 * Initialization is hooked up with core_initcall() rather than
 * module_init().
 * NOTE(review): presumably so configfs is ready before the subsystems
 * that use it are initialized - confirm.
 */
core_initcall(configfs_init);
module_exit(configfs_exit);
|