linux/fs/proc/proc_sysctl.c
Linus Torvalds 14986a34e1 Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull namespace updates from Eric Biederman:
 "This set of changes is a number of smaller things that have been
  overlooked in other development cycles focused on more fundamental
  change. The devpts changes are small things that were a distraction
  until we managed to kill off DEVPTS_MULTIPLE_INSTANCES. There is a
  trivial regression fix to autofs for the unprivileged mount changes
  that went in last cycle. A pair of ioctls has been added by Andrey
  Vagin making it possible to discover the relationships between
  namespaces when referring to them through file descriptors.

  The big user visible change is starting to add simple resource limits
  to catch programs that misbehave. With namespaces in general and user
  namespaces in particular allowing users to use more kinds of
  resources, it has become important to have something to limit errant
  programs. Because the purpose of these limits is to catch errant
  programs the code needs to be inexpensive to use as it is always on, and
  the default limits need to be high enough that well behaved programs
  on well behaved systems don't encounter them.

  To this end, after some review I have implemented per user per user
  namespace limits, and use them to limit the number of namespaces. The
  limits being per user mean that one user can not exhaust the limits of
  another user. The limits being per user namespace allow contexts where
  the limit is 0 and security conscious folks can remove from their
  threat analysis the code used to manage namespaces (as they have
  historically done as it is root only). At the same time the limits being
  per user namespace allow other parts of the system to use namespaces.

  Namespaces are increasingly being used in application sand boxing
  scenarios so an all or nothing disable for the entire system for the
  security conscious folks makes increasing use of these sandboxes
  impossible.

  There is also added a limit on the maximum number of mounts present in
  a single mount namespace. It is nontrivial to guess what a reasonable
  system wide limit on the number of mount structure in the kernel would
  be, especially as it varies based on how a system is using
  containers. A limit on the number of mounts in a mount namespace
  however is much easier to understand and set. In most cases in
  practice only about 1000 mounts are used. Given that some autofs
  scenarios have the potential to be 30,000 to 50,000 mounts I have set
  the default limit for the number of mounts at 100,000 which is well
  above every known set of users but low enough that the mount hash
  tables don't degrade unreasonably.

  These limits are a start. I expect this establishes a pattern that
  other limits for resources that namespaces use will follow. There has
  been interest in making inotify event limits per user per user
  namespace as well as interest expressed in making details about what
  is going on in the kernel more visible"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (28 commits)
  autofs:  Fix automounts by using current_real_cred()->uid
  mnt: Add a per mount namespace limit on the number of mounts
  netns: move {inc,dec}_net_namespaces into #ifdef
  nsfs: Simplify __ns_get_path
  tools/testing: add a test to check nsfs ioctl-s
  nsfs: add ioctl to get a parent namespace
  nsfs: add ioctl to get an owning user namespace for ns file descriptor
  kernel: add a helper to get an owning user namespace for a namespace
  devpts: Change the owner of /dev/pts/ptmx to the mounter of /dev/pts
  devpts: Remove sync_filesystems
  devpts: Make devpts_kill_sb safe if fsi is NULL
  devpts: Simplify devpts_mount by using mount_nodev
  devpts: Move the creation of /dev/pts/ptmx into fill_super
  devpts: Move parse_mount_options into fill_super
  userns: When the per user per user namespace limit is reached return ENOSPC
  userns; Document per user per user namespace limits.
  mntns: Add a limit on the number of mount namespaces.
  netns: Add a limit on the number of net namespaces
  cgroupns: Add a limit on the number of cgroup namespaces
  ipcns: Add a  limit on the number of ipc namespaces
  ...
2016-10-06 09:52:23 -07:00

1626 lines
39 KiB
C

/*
* /proc/sys support
*/
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/printk.h>
#include <linux/security.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/mm.h>
#include <linux/module.h>
#include "internal.h"
/*
 * Forward declarations of the operation tables used by the /proc/sys
 * inode and dentry helpers.  The tables themselves are defined further
 * down in this file, after the functions they point to.
 */
static const struct dentry_operations proc_sys_dentry_operations;
static const struct file_operations proc_sys_file_operations;
static const struct inode_operations proc_sys_inode_operations;
static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
/*
 * Support for permanently empty directories.
 *
 * This table contains only the terminating empty entry.  Its address is
 * used as a marker: a directory is treated as permanently empty when the
 * .child field of its first table entry points at this array.
 */
struct ctl_table sysctl_mount_point[] = {
{ }
};
static bool is_empty_dir(struct ctl_table_header *head)
{
return head->ctl_table[0].child == sysctl_mount_point;
}
static void set_empty_dir(struct ctl_dir *dir)
{
dir->header.ctl_table[0].child = sysctl_mount_point;
}
static void clear_empty_dir(struct ctl_dir *dir)
{
dir->header.ctl_table[0].child = NULL;
}
/*
 * Signal a change on a pollable sysctl: bump the event counter and wake
 * anybody sleeping on the poll wait queue.  A NULL @poll is ignored.
 */
void proc_sys_poll_notify(struct ctl_table_poll *poll)
{
	if (poll) {
		atomic_inc(&poll->event);
		wake_up_interruptible(&poll->wait);
	}
}
/*
 * Table backing the root directory of the default sysctl tree: a single
 * directory entry with an empty name, readable and searchable by everyone,
 * followed by the terminating empty entry.
 */
static struct ctl_table root_table[] = {
{
.procname = "",
.mode = S_IFDIR|S_IRUGO|S_IXUGO,
},
{ }
};
/*
 * The default sysctl root.  The header of its default set's directory is
 * statically initialised: it starts with count and nreg of 1, uses
 * root_table as its table, and points back at this root and at its own
 * default set.
 */
static struct ctl_table_root sysctl_table_root = {
.default_set.dir.header = {
{{.count = 1,
.nreg = 1,
.ctl_table = root_table }},
.ctl_table_arg = root_table,
.root = &sysctl_table_root,
.set = &sysctl_table_root.default_set,
},
};
/*
 * Spinlock taken around the header counters (used/count/nreg) and the
 * per-directory rb-trees by the helpers in this file.
 */
static DEFINE_SPINLOCK(sysctl_lock);
/* Forward declarations for helpers that are used before they are defined. */
static void drop_sysctl_table(struct ctl_table_header *header);
static int sysctl_follow_link(struct ctl_table_header **phead,
struct ctl_table **pentry);
static int insert_links(struct ctl_table_header *head);
static void put_links(struct ctl_table_header *header);
/*
 * Print the path of @dir, root first, by continuing the current log line.
 * Each component is followed by a '/'.
 */
static void sysctl_print_dir(struct ctl_dir *dir)
{
	struct ctl_dir *up = dir->header.parent;

	/* Ancestors are printed first so the path reads top-down. */
	if (up)
		sysctl_print_dir(up);

	pr_cont("%s/", dir->header.ctl_table[0].procname);
}
/*
 * Compare two length-delimited names.
 *
 * The common prefix is compared bytewise; if it is identical the shorter
 * name sorts first.  Returns <0, 0 or >0 like memcmp().
 */
static int namecmp(const char *name1, int len1, const char *name2, int len2)
{
	int shorter = (len1 > len2) ? len2 : len1;
	int diff = memcmp(name1, name2, shorter);

	return diff ? diff : len1 - len2;
}
/*
 * Look up @name (of length @namelen) in the rb-tree of @dir.
 *
 * On a match the owning header is stored in *@phead and the table entry is
 * returned; otherwise NULL is returned and *@phead is left untouched.
 *
 * Called under sysctl_lock
 */
static struct ctl_table *find_entry(struct ctl_table_header **phead,
	struct ctl_dir *dir, const char *name, int namelen)
{
	struct rb_node *cur = dir->root.rb_node;

	while (cur) {
		struct ctl_node *cn = rb_entry(cur, struct ctl_node, node);
		struct ctl_table_header *owner = cn->header;
		/* The node array and the table array share the same index. */
		struct ctl_table *candidate = &owner->ctl_table[cn - owner->node];
		const char *candidate_name = candidate->procname;
		int order = namecmp(name, namelen, candidate_name,
				    strlen(candidate_name));

		if (order == 0) {
			*phead = owner;
			return candidate;
		}
		cur = (order < 0) ? cur->rb_left : cur->rb_right;
	}
	return NULL;
}
static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
{
struct rb_node *node = &head->node[entry - head->ctl_table].node;
struct rb_node **p = &head->parent->root.rb_node;
struct rb_node *parent = NULL;
const char *name = entry->procname;
int namelen = strlen(name);
while (*p) {
struct ctl_table_header *parent_head;
struct ctl_table *parent_entry;
struct ctl_node *parent_node;
const char *parent_name;
int cmp;
parent = *p;
parent_node = rb_entry(parent, struct ctl_node, node);
parent_head = parent_node->header;
parent_entry = &parent_head->ctl_table[parent_node - parent_head->node];
parent_name = parent_entry->procname;
cmp = namecmp(name, namelen, parent_name, strlen(parent_name));
if (cmp < 0)
p = &(*p)->rb_left;
else if (cmp > 0)
p = &(*p)->rb_right;
else {
pr_err("sysctl duplicate entry: ");
sysctl_print_dir(head->parent);
pr_cont("/%s\n", entry->procname);
return -EEXIST;
}
}
rb_link_node(node, parent, p);
rb_insert_color(node, &head->parent->root);
return 0;
}
static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
{
struct rb_node *node = &head->node[entry - head->ctl_table].node;
rb_erase(node, &head->parent->root);
}
/*
 * Initialise @head for @table.
 *
 * The header starts unused, with a reference count and registration count
 * of one, and with no parent.  When @node is non-NULL it is an array with
 * one slot per entry of @table; each slot is pointed back at @head.
 */
static void init_header(struct ctl_table_header *head,
	struct ctl_table_root *root, struct ctl_table_set *set,
	struct ctl_node *node, struct ctl_table *table)
{
	struct ctl_table *entry;

	head->ctl_table = table;
	head->ctl_table_arg = table;
	head->root = root;
	head->set = set;
	head->node = node;
	head->parent = NULL;
	head->unregistering = NULL;
	head->used = 0;
	head->count = 1;
	head->nreg = 1;

	if (!node)
		return;

	for (entry = table; entry->procname; entry++) {
		node->header = head;
		node++;
	}
}
/* Remove every entry of @head from the parent directory's rb-tree. */
static void erase_header(struct ctl_table_header *head)
{
	struct ctl_table *cur = head->ctl_table;

	while (cur->procname) {
		erase_entry(head, cur);
		cur++;
	}
}
/*
 * Attach @header to directory @dir and insert all of its entries.
 *
 * Returns 0 on success.  Returns -EROFS when @dir is permanently empty,
 * -EINVAL when @header would make a non-empty @dir permanently empty, or
 * the error from insert_links()/insert_entry().  On the two early errors
 * nothing has been modified; on later errors everything done here is undone.
 */
static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
{
struct ctl_table *entry;
int err;
/* Is this a permanently empty directory? */
if (is_empty_dir(&dir->header))
return -EROFS;
/* Am I creating a permanently empty directory? */
if (header->ctl_table == sysctl_mount_point) {
if (!RB_EMPTY_ROOT(&dir->root))
return -EINVAL;
set_empty_dir(dir);
}
/* The new child holds a registration reference on its parent directory. */
dir->header.nreg++;
header->parent = dir;
err = insert_links(header);
if (err)
goto fail_links;
for (entry = header->ctl_table; entry->procname; entry++) {
err = insert_entry(header, entry);
if (err)
goto fail;
}
return 0;
fail:
/*
 * NOTE(review): erase_header() walks every entry of @header, including
 * entries that insert_entry() never linked -- confirm rb_erase() is safe
 * on those nodes.
 */
erase_header(header);
put_links(header);
fail_links:
if (header->ctl_table == sysctl_mount_point)
clear_empty_dir(dir);
header->parent = NULL;
/* Give back the registration reference taken on @dir above. */
drop_sysctl_table(&dir->header);
return err;
}
/*
 * Take a use reference on @p unless it is being unregistered.
 * Returns 1 when the reference was taken, 0 otherwise.
 *
 * called under sysctl_lock
 */
static int use_table(struct ctl_table_header *p)
{
	int usable = !unlikely(p->unregistering);

	if (usable)
		p->used++;
	return usable;
}
/*
 * Drop a use reference on @p.  When the last user goes away while an
 * unregistration is pending, the waiter is completed.
 *
 * called under sysctl_lock
 */
static void unuse_table(struct ctl_table_header *p)
{
	p->used--;
	if (p->used)
		return;

	if (unlikely(p->unregistering))
		complete(p->unregistering);
}
/*
 * Begin unregistering @p: mark it so no new users can grab it, wait for
 * existing users to finish, then remove its entries from the parent's
 * rb-tree.
 *
 * called under sysctl_lock, will reacquire if has to wait
 */
static void start_unregistering(struct ctl_table_header *p)
{
/*
* if p->used is 0, nobody will ever touch that entry again;
* we'll eliminate all paths to it before dropping sysctl_lock
*/
if (unlikely(p->used)) {
/* On-stack completion; unuse_table() completes it when p->used hits 0. */
struct completion wait;
init_completion(&wait);
p->unregistering = &wait;
spin_unlock(&sysctl_lock);
wait_for_completion(&wait);
spin_lock(&sysctl_lock);
} else {
/* anything non-NULL; we'll never dereference it */
p->unregistering = ERR_PTR(-EINVAL);
}
/*
* do not remove from the list until nobody holds it; walking the
* list in do_sysctl() relies on that.
*/
erase_header(p);
}
/* Take a reference on @head by incrementing its count under sysctl_lock. */
static void sysctl_head_get(struct ctl_table_header *head)
{
	spin_lock(&sysctl_lock);
	head->count += 1;
	spin_unlock(&sysctl_lock);
}
/*
 * Drop a reference on @head; when the count reaches zero the header is
 * queued for freeing via kfree_rcu().
 */
void sysctl_head_put(struct ctl_table_header *head)
{
	spin_lock(&sysctl_lock);
	head->count--;
	if (head->count == 0)
		kfree_rcu(head, rcu);
	spin_unlock(&sysctl_lock);
}
/*
 * Try to take a use reference on @head.
 *
 * Returns @head on success, or ERR_PTR(-ENOENT) when the header is being
 * unregistered.  @head must not be NULL.
 */
static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
{
	struct ctl_table_header *result = head;

	BUG_ON(!head);

	spin_lock(&sysctl_lock);
	if (!use_table(head))
		result = ERR_PTR(-ENOENT);
	spin_unlock(&sysctl_lock);

	return result;
}
/* Release a use reference on @head; a NULL @head is a no-op. */
static void sysctl_head_finish(struct ctl_table_header *head)
{
	if (head) {
		spin_lock(&sysctl_lock);
		unuse_table(head);
		spin_unlock(&sysctl_lock);
	}
}
/*
 * Pick the table set to use for @root: whatever the root's lookup hook
 * returns when one is provided, the root's default set otherwise.
 */
static struct ctl_table_set *
lookup_header_set(struct ctl_table_root *root)
{
	if (!root->lookup)
		return &root->default_set;

	return root->lookup(root);
}
/*
 * Find @name in @dir and take a use reference on the header that owns it.
 *
 * On success the entry is returned and *@phead holds the referenced header.
 * NULL is returned, and *@phead is left untouched, when the name does not
 * exist or its header is being unregistered.
 */
static struct ctl_table *lookup_entry(struct ctl_table_header **phead,
				      struct ctl_dir *dir,
				      const char *name, int namelen)
{
	struct ctl_table_header *owner;
	struct ctl_table *found;

	spin_lock(&sysctl_lock);
	found = find_entry(&owner, dir, name, namelen);
	if (!found || !use_table(owner))
		found = NULL;
	else
		*phead = owner;
	spin_unlock(&sysctl_lock);

	return found;
}
/*
 * Starting at @node, walk the tree in order and return the first node whose
 * header could be grabbed with use_table().  Returns NULL when none is left.
 */
static struct ctl_node *first_usable_entry(struct rb_node *node)
{
	while (node) {
		struct ctl_node *candidate = rb_entry(node, struct ctl_node, node);

		if (use_table(candidate->header))
			return candidate;
		node = rb_next(node);
	}
	return NULL;
}
/*
 * Start iterating over @dir: store the first usable entry and its header
 * (with a use reference held) in *@pentry / *@phead, or NULL in both when
 * the directory has no usable entry.
 */
static void first_entry(struct ctl_dir *dir,
	struct ctl_table_header **phead, struct ctl_table **pentry)
{
	struct ctl_table_header *found_head = NULL;
	struct ctl_table *found_entry = NULL;
	struct ctl_node *cn;

	spin_lock(&sysctl_lock);
	cn = first_usable_entry(rb_first(&dir->root));
	spin_unlock(&sysctl_lock);

	if (cn) {
		found_head = cn->header;
		found_entry = &found_head->ctl_table[cn - found_head->node];
	}

	*pentry = found_entry;
	*phead = found_head;
}
/*
 * Advance the iteration started by first_entry(): release the use reference
 * on the current header and move to the next usable entry.  *@phead becomes
 * NULL when the end is reached; *@pentry then keeps its previous value.
 */
static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry)
{
	struct ctl_table_header *cur_head = *phead;
	struct ctl_table_header *next_head = NULL;
	struct ctl_table *next_tbl = *pentry;
	struct ctl_node *cn = &cur_head->node[next_tbl - cur_head->ctl_table];

	spin_lock(&sysctl_lock);
	unuse_table(cur_head);
	cn = first_usable_entry(rb_next(&cn->node));
	spin_unlock(&sysctl_lock);

	if (cn) {
		next_head = cn->header;
		next_tbl = &next_head->ctl_table[cn - next_head->node];
	}

	*phead = next_head;
	*pentry = next_tbl;
}
/* Currently a no-op: nothing is done with @root. */
void register_sysctl_root(struct ctl_table_root *root)
{
}
/*
 * sysctl_perm does NOT grant the superuser all rights automatically, because
 * some sysctl variables are readonly even to root.
 *
 * Check the requested @op bits against @mode.  The owner bits are used when
 * the caller's effective uid is the global root uid, the group bits when the
 * caller is in the global root group, and the "other" bits otherwise.
 * Returns 0 when access is allowed, -EACCES otherwise.
 */
static int test_perm(int mode, int op)
{
	const int rwx = MAY_READ|MAY_WRITE|MAY_EXEC;

	if (uid_eq(current_euid(), GLOBAL_ROOT_UID))
		mode >>= 6;
	else if (in_egroup_p(GLOBAL_ROOT_GID))
		mode >>= 3;

	return (op & ~mode & rwx) ? -EACCES : 0;
}
/*
 * Check whether @op is permitted on @table.  The mode comes from the root's
 * permissions hook when there is one, else from the table entry itself.
 */
static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op)
{
	struct ctl_table_root *root = head->root;
	int mode = root->permissions ? root->permissions(head, table)
				     : table->mode;

	return test_perm(mode, op);
}
/*
 * Allocate and set up an inode on @sb for the sysctl entry @table owned by
 * @head.  A reference is taken on @head and stored in the proc inode.
 * Returns the new inode, or NULL when allocation fails.
 */
static struct inode *proc_sys_make_inode(struct super_block *sb,
		struct ctl_table_header *head, struct ctl_table *table)
{
	struct ctl_table_root *root = head->root;
	struct proc_inode *ei;
	struct inode *inode = new_inode(sb);

	if (!inode)
		return inode;

	inode->i_ino = get_next_ino();

	sysctl_head_get(head);
	ei = PROC_I(inode);
	ei->sysctl = head;
	ei->sysctl_entry = table;

	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
	inode->i_mode = table->mode;

	if (S_ISDIR(table->mode)) {
		inode->i_mode |= S_IFDIR;
		inode->i_op = &proc_sys_dir_operations;
		inode->i_fop = &proc_sys_dir_file_operations;
		if (is_empty_dir(head))
			make_empty_dir_inode(inode);
	} else {
		inode->i_mode |= S_IFREG;
		inode->i_op = &proc_sys_inode_operations;
		inode->i_fop = &proc_sys_file_operations;
	}

	/* Let the root override the owner when it provides a hook for that. */
	if (root->set_ownership)
		root->set_ownership(head, table, &inode->i_uid, &inode->i_gid);

	return inode;
}
/*
 * Take a use reference on the sysctl header behind @inode.  Inodes without
 * a header are treated as the root directory of the default set.  Returns
 * the header or an ERR_PTR, as sysctl_head_grab() does.
 */
static struct ctl_table_header *grab_header(struct inode *inode)
{
	struct ctl_table_header *head = PROC_I(inode)->sysctl;

	return sysctl_head_grab(head ? head
				     : &sysctl_table_root.default_set.dir.header);
}
/*
 * ->lookup for /proc/sys directories.
 *
 * Finds the entry named by @dentry in the directory @dir, following a link
 * entry if needed, creates an inode for it and attaches it to @dentry.
 * Returns NULL on success or an ERR_PTR: -ENOENT when the name is not
 * found, the error from sysctl_follow_link(), or -ENOMEM when no inode
 * could be created.
 */
static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct ctl_table_header *head = grab_header(dir);
struct ctl_table_header *h = NULL;
const struct qstr *name = &dentry->d_name;
struct ctl_table *p;
struct inode *inode;
struct dentry *err = ERR_PTR(-ENOENT);
struct ctl_dir *ctl_dir;
int ret;
/* The directory itself is being unregistered. */
if (IS_ERR(head))
return ERR_CAST(head);
ctl_dir = container_of(head, struct ctl_dir, header);
/* On success h holds a use reference on the entry's header. */
p = lookup_entry(&h, ctl_dir, name->name, name->len);
if (!p)
goto out;
if (S_ISLNK(p->mode)) {
/* On success h and p are replaced by the link target. */
ret = sysctl_follow_link(&h, &p);
err = ERR_PTR(ret);
if (ret)
goto out;
}
err = ERR_PTR(-ENOMEM);
inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
if (!inode)
goto out;
err = NULL;
d_set_d_op(dentry, &proc_sys_dentry_operations);
d_add(dentry, inode);
out:
/* Drop the use references taken above; the inode holds its own count. */
if (h)
sysctl_head_finish(h);
sysctl_head_finish(head);
return err;
}
/*
 * Common implementation of read and write on a /proc/sys file.
 *
 * Returns the number of bytes the handler reports in its length argument
 * when the handler returns 0, the handler's own return value otherwise,
 * -EPERM when the permission check fails, -EINVAL when the entry has no
 * handler, or the error from grab_header() when the sysctl was unregistered.
 */
static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
		size_t count, loff_t *ppos, int write)
{
	struct inode *inode = file_inode(filp);
	struct ctl_table_header *head = grab_header(inode);
	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
	size_t len = count;
	ssize_t ret;

	if (IS_ERR(head))
		return PTR_ERR(head);

	/*
	 * At this point we know that the sysctl was not unregistered
	 * and won't be until we finish.
	 */
	if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ)) {
		ret = -EPERM;
	} else if (!table->proc_handler) {
		/* if that can happen at all, it should be -EINVAL, not -EISDIR */
		ret = -EINVAL;
	} else {
		/* careful: calling conventions are nasty here */
		ret = table->proc_handler(table, write, buf, &len, ppos);
		if (!ret)
			ret = len;
	}

	sysctl_head_finish(head);
	return ret;
}
/* ->read for /proc/sys files: forwards to the common handler with write == 0. */
static ssize_t proc_sys_read(struct file *filp, char __user *buf,
				size_t count, loff_t *ppos)
{
	void __user *ubuf = (void __user *)buf;

	return proc_sys_call_handler(filp, ubuf, count, ppos, 0);
}
/* ->write for /proc/sys files: forwards to the common handler with write == 1. */
static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
				size_t count, loff_t *ppos)
{
	void __user *ubuf = (void __user *)buf;

	return proc_sys_call_handler(filp, ubuf, count, ppos, 1);
}
/*
 * ->open for /proc/sys files.  For pollable entries the current poll event
 * value is remembered in the file's private data.  Fails with the error
 * from grab_header() when the sysctl was unregistered.
 */
static int proc_sys_open(struct inode *inode, struct file *filp)
{
	struct ctl_table_header *head = grab_header(inode);
	struct ctl_table *table;

	/* sysctl was unregistered */
	if (IS_ERR(head))
		return PTR_ERR(head);

	table = PROC_I(inode)->sysctl_entry;
	if (table->poll)
		filp->private_data = proc_sys_poll_event(table->poll);

	sysctl_head_finish(head);
	return 0;
}
/*
 * ->poll for /proc/sys files.
 *
 * Entries without a handler or without poll support report the default
 * mask.  Otherwise the caller is registered on the entry's wait queue and,
 * if the event counter moved since it was last recorded in the file, the
 * new value is stored and the file is reported as readable with an
 * exceptional condition.
 */
static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
{
	struct inode *inode = file_inode(filp);
	struct ctl_table_header *head = grab_header(inode);
	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
	unsigned int mask = DEFAULT_POLLMASK;

	/* sysctl was unregistered */
	if (IS_ERR(head))
		return POLLERR | POLLHUP;

	if (table->proc_handler && table->poll) {
		unsigned long seen = (unsigned long)filp->private_data;

		poll_wait(filp, &table->poll->wait, wait);

		if (seen != atomic_read(&table->poll->event)) {
			filp->private_data = proc_sys_poll_event(table->poll);
			mask = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
		}
	}

	sysctl_head_finish(head);
	return mask;
}
/*
 * Emit one directory entry for @table during readdir, instantiating a
 * dentry and inode for it if the dcache does not have one yet.
 *
 * Returns false when the dentry or inode could not be allocated, otherwise
 * the result of dir_emit().
 */
static bool proc_sys_fill_cache(struct file *file,
struct dir_context *ctx,
struct ctl_table_header *head,
struct ctl_table *table)
{
struct dentry *child, *dir = file->f_path.dentry;
struct inode *inode;
struct qstr qname;
ino_t ino = 0;
unsigned type = DT_UNKNOWN;
qname.name = table->procname;
qname.len = strlen(table->procname);
qname.hash = full_name_hash(dir, qname.name, qname.len);
child = d_lookup(dir, &qname);
if (!child) {
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
child = d_alloc_parallel(dir, &qname, &wq);
if (IS_ERR(child))
return false;
/* Only set the dentry up if it is still marked as in-lookup. */
if (d_in_lookup(child)) {
inode = proc_sys_make_inode(dir->d_sb, head, table);
if (!inode) {
d_lookup_done(child);
dput(child);
return false;
}
d_set_d_op(child, &proc_sys_dentry_operations);
d_add(child, inode);
}
}
inode = d_inode(child);
ino = inode->i_ino;
/* The file-type bits of i_mode, shifted down to a d_type value. */
type = inode->i_mode >> 12;
dput(child);
return dir_emit(ctx, qname.name, qname.len, ino, type);
}
static bool proc_sys_link_fill_cache(struct file *file,
struct dir_context *ctx,
struct ctl_table_header *head,
struct ctl_table *table)
{
bool ret = true;
head = sysctl_head_grab(head);
if (S_ISLNK(table->mode)) {
/* It is not an error if we can not follow the link ignore it */
int err = sysctl_follow_link(&head, &table);
if (err)
goto out;
}
ret = proc_sys_fill_cache(file, ctx, head, table);
out:
sysctl_head_finish(head);
return ret;
}
/*
 * Handle one entry of a readdir walk.
 *
 * *@pos counts the entries seen so far and is always advanced.  Entries
 * before the position recorded in @ctx are skipped (reported as success).
 * Otherwise the entry is emitted and, on success, @ctx->pos is moved past
 * it.  Returns false when emitting failed and the walk should stop.
 */
static int scan(struct ctl_table_header *head, struct ctl_table *table,
		unsigned long *pos, struct file *file,
		struct dir_context *ctx)
{
	unsigned long index = (*pos)++;
	bool emitted;

	if (index < ctx->pos)
		return true;

	emitted = unlikely(S_ISLNK(table->mode))
			? proc_sys_link_fill_cache(file, ctx, head, table)
			: proc_sys_fill_cache(file, ctx, head, table);

	if (emitted)
		ctx->pos = *pos;

	return emitted;
}
/*
 * ->iterate_shared for /proc/sys directories.
 *
 * Emits "." and "..", then walks the entries of the directory in tree
 * order, emitting each one through scan().  Returns 0, or the error from
 * grab_header() when the directory was unregistered.
 *
 * The use reference taken by grab_header() is released on every exit path
 * after the grab succeeded, including the one where the dot entries could
 * not be emitted.
 */
static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
{
	struct ctl_table_header *head = grab_header(file_inode(file));
	struct ctl_table_header *h = NULL;
	struct ctl_table *entry;
	struct ctl_dir *ctl_dir;
	unsigned long pos;

	if (IS_ERR(head))
		return PTR_ERR(head);

	ctl_dir = container_of(head, struct ctl_dir, header);

	/* Must not return directly here: @head still holds a use reference. */
	if (!dir_emit_dots(file, ctx))
		goto out;

	/* Positions 0 and 1 are "." and "..". */
	pos = 2;

	for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) {
		if (!scan(h, entry, &pos, file, ctx)) {
			/* Leaving the loop early: drop the iterator's reference. */
			sysctl_head_finish(h);
			break;
		}
	}
out:
	sysctl_head_finish(head);
	return 0;
}
/*
 * ->permission for /proc/sys inodes.
 *
 * sysctl entries that are not writeable,
 * are _NOT_ writeable, capabilities or not.
 */
static int proc_sys_permission(struct inode *inode, int mask)
{
	struct ctl_table_header *head;
	struct ctl_table *table;
	int error;

	/* Executable files are not allowed under /proc/sys/ */
	if (S_ISREG(inode->i_mode) && (mask & MAY_EXEC))
		return -EACCES;

	head = grab_header(inode);
	if (IS_ERR(head))
		return PTR_ERR(head);

	table = PROC_I(inode)->sysctl_entry;
	if (table) {
		/* Use the permissions on the sysctl table entry */
		error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK);
	} else if (mask & MAY_WRITE) {
		/* global root - r-xr-xr-x */
		error = -EACCES;
	} else {
		error = 0;
	}

	sysctl_head_finish(head);
	return error;
}
/*
 * ->setattr for /proc/sys inodes.  Changing mode, uid or gid is refused
 * with -EPERM; other attribute changes are validated and then copied into
 * the inode.
 */
static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
{
	const unsigned int forbidden = ATTR_MODE | ATTR_UID | ATTR_GID;
	struct inode *inode = d_inode(dentry);
	int error;

	if (attr->ia_valid & forbidden)
		return -EPERM;

	error = inode_change_ok(inode, attr);
	if (!error) {
		setattr_copy(inode, attr);
		mark_inode_dirty(inode);
	}
	return error;
}
/*
 * ->getattr for /proc/sys inodes.  Fills @stat from the inode and, when the
 * inode has a table entry, replaces the permission bits with the entry's
 * mode while keeping the file-type bits.
 */
static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
	struct inode *inode = d_inode(dentry);
	struct ctl_table_header *head = grab_header(inode);
	struct ctl_table *table;

	if (IS_ERR(head))
		return PTR_ERR(head);

	table = PROC_I(inode)->sysctl_entry;

	generic_fillattr(inode, stat);
	if (table)
		stat->mode = table->mode | (stat->mode & S_IFMT);

	sysctl_head_finish(head);
	return 0;
}
/* File operations installed on regular (non-directory) /proc/sys inodes. */
static const struct file_operations proc_sys_file_operations = {
.open = proc_sys_open,
.poll = proc_sys_poll,
.read = proc_sys_read,
.write = proc_sys_write,
.llseek = default_llseek,
};
/* File operations installed on /proc/sys directory inodes. */
static const struct file_operations proc_sys_dir_file_operations = {
.read = generic_read_dir,
.iterate_shared = proc_sys_readdir,
.llseek = generic_file_llseek,
};
/* Inode operations installed on regular (non-directory) /proc/sys inodes. */
static const struct inode_operations proc_sys_inode_operations = {
.permission = proc_sys_permission,
.setattr = proc_sys_setattr,
.getattr = proc_sys_getattr,
};
/*
 * Inode operations installed on /proc/sys directory inodes: the same as
 * for files, plus ->lookup.
 */
static const struct inode_operations proc_sys_dir_operations = {
.lookup = proc_sys_lookup,
.permission = proc_sys_permission,
.setattr = proc_sys_setattr,
.getattr = proc_sys_getattr,
};
static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags)
{
if (flags & LOOKUP_RCU)
return -ECHILD;
return !PROC_I(d_inode(dentry))->sysctl->unregistering;
}
static int proc_sys_delete(const struct dentry *dentry)
{
return !!PROC_I(d_inode(dentry))->sysctl->unregistering;
}
/*
 * Decide whether header @p is visible.  A header being unregistered is
 * never visible; otherwise the set's is_seen hook decides, and a set
 * without that hook is always visible.
 */
static int sysctl_is_seen(struct ctl_table_header *p)
{
	struct ctl_table_set *set = p->set;
	int visible = 0;

	spin_lock(&sysctl_lock);
	if (!p->unregistering)
		visible = set->is_seen ? set->is_seen(set) : 1;
	spin_unlock(&sysctl_lock);

	return visible;
}
/*
 * ->d_compare: returns 0 when @dentry matches @name and its sysctl header
 * is visible according to sysctl_is_seen(), 1 otherwise.
 */
static int proc_sys_compare(const struct dentry *dentry,
		unsigned int len, const char *str, const struct qstr *name)
{
	struct ctl_table_header *head;
	struct inode *inode;

	/* Although proc doesn't have negative dentries, rcu-walk means
	 * that inode here can be NULL */
	/* AV: can it, indeed? */
	inode = d_inode_rcu(dentry);
	if (!inode || name->len != len || memcmp(name->name, str, len))
		return 1;

	head = rcu_dereference(PROC_I(inode)->sysctl);
	return !(head && sysctl_is_seen(head));
}
/* Dentry operations installed on every /proc/sys dentry created in this file. */
static const struct dentry_operations proc_sys_dentry_operations = {
.d_revalidate = proc_sys_revalidate,
.d_delete = proc_sys_delete,
.d_compare = proc_sys_compare,
};
/*
 * Look up the subdirectory @name of @dir.  Returns the directory, or
 * ERR_PTR(-ENOENT) when no such entry exists, or ERR_PTR(-ENOTDIR) when
 * the entry is not a directory.
 */
static struct ctl_dir *find_subdir(struct ctl_dir *dir,
				   const char *name, int namelen)
{
	struct ctl_table_header *head;
	struct ctl_table *entry = find_entry(&head, dir, name, namelen);

	if (entry && S_ISDIR(entry->mode))
		return container_of(head, struct ctl_dir, header);

	return ERR_PTR(entry ? -ENOTDIR : -ENOENT);
}
/*
 * Allocate a directory named @name for @set.
 *
 * A single zeroed allocation holds, in order: the ctl_dir, one ctl_node,
 * a two-entry table (the directory entry plus the empty terminator) and a
 * NUL-terminated copy of the name.  Returns NULL when allocation fails.
 */
static struct ctl_dir *new_dir(struct ctl_table_set *set,
			       const char *name, int namelen)
{
	struct ctl_dir *new;
	struct ctl_node *node;
	struct ctl_table *table;
	char *name_copy;

	new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) +
		      sizeof(struct ctl_table)*2 + namelen + 1,
		      GFP_KERNEL);
	if (!new)
		return NULL;

	/* Carve the trailing pieces out of the allocation. */
	node = (struct ctl_node *)(new + 1);
	table = (struct ctl_table *)(node + 1);
	name_copy = (char *)(table + 2);

	memcpy(name_copy, name, namelen);
	name_copy[namelen] = '\0';

	table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
	table[0].procname = name_copy;

	init_header(&new->header, set->dir.header.root, set, node, table);
	return new;
}
/**
 * get_subdir - find or create a subdir with the specified name.
 * @dir: Directory to create the subdirectory in
 * @name: The name of the subdirectory to find or create
 * @namelen: The length of name
 *
 * Takes a directory with an elevated reference count so we know that
 * if we drop the lock the directory will not go away.  Upon success
 * the reference is moved from @dir to the returned subdirectory.
 * Upon error an error code is returned and the reference on @dir is
 * simply dropped.
 */
static struct ctl_dir *get_subdir(struct ctl_dir *dir,
const char *name, int namelen)
{
struct ctl_table_set *set = dir->header.set;
struct ctl_dir *subdir, *new = NULL;
int err;
spin_lock(&sysctl_lock);
subdir = find_subdir(dir, name, namelen);
if (!IS_ERR(subdir))
goto found;
if (PTR_ERR(subdir) != -ENOENT)
goto failed;
/* new_dir() allocates with GFP_KERNEL, so the spinlock is dropped around it. */
spin_unlock(&sysctl_lock);
new = new_dir(set, name, namelen);
spin_lock(&sysctl_lock);
subdir = ERR_PTR(-ENOMEM);
if (!new)
goto failed;
/* Was the subdir added while we dropped the lock? */
subdir = find_subdir(dir, name, namelen);
if (!IS_ERR(subdir))
goto found;
if (PTR_ERR(subdir) != -ENOENT)
goto failed;
/* Nope.  Use our freshly made directory entry. */
err = insert_header(dir, &new->header);
subdir = ERR_PTR(err);
if (err)
goto failed;
subdir = new;
found:
/* This is the reference handed to the caller. */
subdir->header.nreg++;
failed:
if (IS_ERR(subdir)) {
pr_err("sysctl could not get directory: ");
sysctl_print_dir(dir);
pr_cont("/%*.*s %ld\n",
namelen, namelen, name, PTR_ERR(subdir));
}
/* Drop the caller's reference on @dir in every case. */
drop_sysctl_table(&dir->header);
/* Drop the initial reference on the directory allocated here, if any. */
if (new)
drop_sysctl_table(&new->header);
spin_unlock(&sysctl_lock);
return subdir;
}
/*
 * Translate @dir into the directory with the same path inside @set.
 * The root maps to the set's own directory; every other level is resolved
 * by name through find_subdir().  Returns the directory or an ERR_PTR.
 */
static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
{
	struct ctl_dir *above = dir->header.parent;
	struct ctl_dir *xparent;
	const char *procname;

	if (!above)
		return &set->dir;

	xparent = xlate_dir(set, above);
	if (IS_ERR(xparent))
		return xparent;

	procname = dir->header.ctl_table[0].procname;
	return find_subdir(xparent, procname, strlen(procname));
}
/*
 * Resolve a link entry to the entry it stands for.
 *
 * The link's data field holds the root whose table set is searched.  The
 * target is looked up under the same path in that set.  On success the use
 * reference on *@phead is moved to the target's header, *@phead and
 * *@pentry are updated, and 0 is returned.  On failure both are left
 * untouched and a negative error is returned (-ENOENT when the target does
 * not exist or is being unregistered).
 */
static int sysctl_follow_link(struct ctl_table_header **phead,
	struct ctl_table **pentry)
{
	struct ctl_table_header *target_head = NULL;
	struct ctl_table *target;
	struct ctl_table_root *root;
	struct ctl_table_set *set;
	struct ctl_dir *dir;
	const char *procname;
	int ret;

	spin_lock(&sysctl_lock);

	root = (*pentry)->data;
	set = lookup_header_set(root);
	dir = xlate_dir(set, (*phead)->parent);
	if (IS_ERR(dir)) {
		ret = PTR_ERR(dir);
		goto unlock;
	}

	procname = (*pentry)->procname;
	target = find_entry(&target_head, dir, procname, strlen(procname));
	if (!target || !use_table(target_head)) {
		ret = -ENOENT;
		goto unlock;
	}

	/* Swap the caller's reference over to the target. */
	unuse_table(*phead);
	*phead = target_head;
	*pentry = target;
	ret = 0;

unlock:
	spin_unlock(&sysctl_lock);
	return ret;
}
/*
 * Log a table-check failure for @table under @path, with a printf-style
 * explanation.  Always returns -EINVAL so callers can assign the result.
 */
static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
{
	va_list args;
	struct va_format vaf;

	va_start(args, fmt);
	vaf.va = &args;
	vaf.fmt = fmt;

	pr_err("sysctl table check failed: %s/%s %pV\n",
	       path, table->procname, &vaf);

	va_end(args);
	return -EINVAL;
}
/*
 * Sanity-check every entry of @table before registration.
 *
 * Each problem is logged through sysctl_err(); checking continues so that
 * all problems are reported.  Returns 0 when the table is clean, -EINVAL
 * when at least one check failed.
 */
static int sysctl_check_table(const char *path, struct ctl_table *table)
{
	struct ctl_table *t;
	int err = 0;

	for (t = table; t->procname; t++) {
		/* The stock handlers all operate on ->data / ->maxlen. */
		int needs_data =
			(t->proc_handler == proc_dostring) ||
			(t->proc_handler == proc_dointvec) ||
			(t->proc_handler == proc_dointvec_minmax) ||
			(t->proc_handler == proc_dointvec_jiffies) ||
			(t->proc_handler == proc_dointvec_userhz_jiffies) ||
			(t->proc_handler == proc_dointvec_ms_jiffies) ||
			(t->proc_handler == proc_doulongvec_minmax) ||
			(t->proc_handler == proc_doulongvec_ms_jiffies_minmax);

		if (t->child)
			err = sysctl_err(path, t, "Not a file");

		if (needs_data) {
			if (!t->data)
				err = sysctl_err(path, t, "No data");
			if (!t->maxlen)
				err = sysctl_err(path, t, "No maxlen");
		}

		if (!t->proc_handler)
			err = sysctl_err(path, t, "No proc_handler");

		/* Only read/write permission bits are allowed in .mode. */
		if ((t->mode & (S_IRUGO|S_IWUGO)) != t->mode)
			err = sysctl_err(path, t, "bogus .mode 0%o",
				t->mode);
	}
	return err;
}
/*
 * Build a header of link entries mirroring @table, to be placed in @dir.
 *
 * One zeroed allocation holds the header, one ctl_node per entry, the link
 * table (with its empty terminator) and copies of all names.  Every link
 * is a symlink-mode entry whose data points at @link_root.  The header's
 * nreg is set to the number of entries.  Returns NULL on allocation failure.
 */
static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table,
	struct ctl_table_root *link_root)
{
	struct ctl_table *link_table, *entry, *link;
	struct ctl_table_header *links;
	struct ctl_node *node;
	char *link_name;
	int nr_entries = 0;
	int name_bytes = 0;

	/* First pass: size the allocation. */
	for (entry = table; entry->procname; entry++) {
		name_bytes += strlen(entry->procname) + 1;
		nr_entries++;
	}

	links = kzalloc(sizeof(struct ctl_table_header) +
			sizeof(struct ctl_node)*nr_entries +
			sizeof(struct ctl_table)*(nr_entries + 1) +
			name_bytes,
			GFP_KERNEL);
	if (!links)
		return NULL;

	node = (struct ctl_node *)(links + 1);
	link_table = (struct ctl_table *)(node + nr_entries);
	link_name = (char *)&link_table[nr_entries + 1];

	/* Second pass: fill in one link per source entry. */
	link = link_table;
	for (entry = table; entry->procname; entry++) {
		int len = strlen(entry->procname) + 1;

		memcpy(link_name, entry->procname, len);
		link->procname = link_name;
		link->mode = S_IFLNK|S_IRWXUGO;
		link->data = link_root;

		link_name += len;
		link++;
	}

	init_header(links, dir->header.root, dir->header.set, node, link_table);
	links->nreg = nr_entries;

	return links;
}
/*
 * Check whether @dir already provides a suitable counterpart for every
 * entry of @table: either both are directories, or the counterpart is a
 * link whose data is @link_root.  If so, take one registration reference
 * on the header of each counterpart and return true; otherwise return
 * false without changing anything.
 */
static bool get_links(struct ctl_dir *dir,
	struct ctl_table *table, struct ctl_table_root *link_root)
{
	struct ctl_table_header *head;
	struct ctl_table *entry, *link;

	/* Are there links available for every entry in table? */
	for (entry = table; entry->procname; entry++) {
		const char *procname = entry->procname;
		int both_dirs, our_link;

		link = find_entry(&head, dir, procname, strlen(procname));
		if (!link)
			return false;

		both_dirs = S_ISDIR(link->mode) && S_ISDIR(entry->mode);
		our_link = S_ISLNK(link->mode) && (link->data == link_root);
		if (!both_dirs && !our_link)
			return false;
	}

	/* The checks passed.  Increase the registration count on the links */
	for (entry = table; entry->procname; entry++) {
		const char *procname = entry->procname;

		link = find_entry(&head, dir, procname, strlen(procname));
		head->nreg++;
	}
	return true;
}
/*
 * Make the entries of @head reachable from the default (root) set by
 * adding link entries under the corresponding directory there.
 *
 * Nothing is done when @head already belongs to the root set, when the
 * corresponding directory does not exist in the root set, or when suitable
 * links already exist.  Returns 0 in those cases and on success, -ENOMEM
 * when the links cannot be allocated, or the error from insert_header().
 *
 * The caller holds sysctl_lock; it is dropped and retaken around the
 * allocation.
 */
static int insert_links(struct ctl_table_header *head)
{
struct ctl_table_set *root_set = &sysctl_table_root.default_set;
struct ctl_dir *core_parent = NULL;
struct ctl_table_header *links;
int err;
if (head->set == root_set)
return 0;
core_parent = xlate_dir(root_set, head->parent);
if (IS_ERR(core_parent))
return 0;
if (get_links(core_parent, head->ctl_table, head->root))
return 0;
/* Pin core_parent while the lock is dropped for the allocation. */
core_parent->header.nreg++;
spin_unlock(&sysctl_lock);
links = new_links(core_parent, head->ctl_table, head->root);
spin_lock(&sysctl_lock);
err = -ENOMEM;
if (!links)
goto out;
err = 0;
/* Recheck: the links may have been added while the lock was dropped. */
if (get_links(core_parent, head->ctl_table, head->root)) {
kfree(links);
goto out;
}
err = insert_header(core_parent, links);
if (err)
kfree(links);
out:
/* Release the pin taken above. */
drop_sysctl_table(&core_parent->header);
return err;
}
/**
 * __register_sysctl_table - register a leaf sysctl table
 * @set: Sysctl tree to register on
 * @path: The path to the directory the sysctl table is in.
 * @table: the top-level table structure
 *
 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
 * array. A completely 0 filled entry terminates the table.
 *
 * The members of the &struct ctl_table structure are used as follows:
 *
 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
 *            enter a sysctl file
 *
 * data - a pointer to data for use by proc_handler
 *
 * maxlen - the maximum size in bytes of the data
 *
 * mode - the file permissions for the /proc/sys file
 *
 * child - must be %NULL.
 *
 * proc_handler - the text handler routine (described below)
 *
 * extra1, extra2 - extra pointers usable by the proc handler routines
 *
 * Leaf nodes in the sysctl tree will be represented by a single file
 * under /proc; non-leaf nodes will be represented by directories.
 *
 * There must be a proc_handler routine for any terminal nodes.
 * Several default handlers are available to cover common cases -
 *
 * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
 * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
 * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
 *
 * It is the handler's job to read the input buffer from user memory
 * and process it. The handler should return 0 on success.
 *
 * This routine returns %NULL on a failure to register, and a pointer
 * to the table header on success.
 */
struct ctl_table_header *__register_sysctl_table(
struct ctl_table_set *set,
const char *path, struct ctl_table *table)
{
struct ctl_table_root *root = set->dir.header.root;
struct ctl_table_header *header;
const char *name, *nextname;
struct ctl_dir *dir;
struct ctl_table *entry;
struct ctl_node *node;
int nr_entries = 0;
for (entry = table; entry->procname; entry++)
nr_entries++;
/* The header is followed in memory by one ctl_node per table entry. */
header = kzalloc(sizeof(struct ctl_table_header) +
sizeof(struct ctl_node)*nr_entries, GFP_KERNEL);
if (!header)
return NULL;
node = (struct ctl_node *)(header + 1);
init_header(header, root, set, node, table);
if (sysctl_check_table(path, table))
goto fail;
spin_lock(&sysctl_lock);
dir = &set->dir;
/* Reference moved down the directory tree by get_subdir */
dir->header.nreg++;
spin_unlock(&sysctl_lock);
/* Find the directory for the ctl_table */
for (name = path; name; name = nextname) {
int namelen;
nextname = strchr(name, '/');
if (nextname) {
namelen = nextname - name;
nextname++;
} else {
namelen = strlen(name);
}
/* Skip empty components such as leading, trailing or doubled '/'. */
if (namelen == 0)
continue;
/* On failure get_subdir() has already dropped the reference on dir. */
dir = get_subdir(dir, name, namelen);
if (IS_ERR(dir))
goto fail;
}
spin_lock(&sysctl_lock);
if (insert_header(dir, header))
goto fail_put_dir_locked;
/* Drop the reference that was carried down the path. */
drop_sysctl_table(&dir->header);
spin_unlock(&sysctl_lock);
return header;
fail_put_dir_locked:
drop_sysctl_table(&dir->header);
spin_unlock(&sysctl_lock);
fail:
kfree(header);
/* Show who attempted the failed registration. */
dump_stack();
return NULL;
}
/**
* register_sysctl - register a sysctl table
* @path: The path to the directory the sysctl table is in.
* @table: the table structure
*
* Register a sysctl table. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
*
* See __register_sysctl_table for more details.
*/
struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table)
{
return __register_sysctl_table(&sysctl_table_root.default_set,
path, table);
}
EXPORT_SYMBOL(register_sysctl);
/*
 * Append "@name/" at @pos inside the buffer that starts at @path.
 *
 * @pos must point into the @path buffer; the buffer is treated as being
 * PATH_MAX bytes long.  On success the appended text is NUL terminated and
 * a pointer to that terminating NUL (the next append position) is returned.
 * If the result would not fit, nothing is written and NULL is returned.
 */
static char *append_path(const char *path, char *pos, const char *name)
{
	int len = strlen(name);
	char *tail;

	/* Space is needed for the name, the trailing '/' and the NUL. */
	if ((pos - path) + len + 2 >= PATH_MAX)
		return NULL;

	tail = pos + len;
	memcpy(pos, name, len);
	tail[0] = '/';
	tail[1] = '\0';

	return tail + 1;
}
/*
 * Count how many headers a table hierarchy needs: one for every
 * (sub)table that directly contains at least one file entry, and one for
 * every table that is NULL or empty.
 */
static int count_subheaders(struct ctl_table *table)
{
	struct ctl_table *entry;
	int from_children = 0;
	int has_files = 0;

	/* special case: no directory and empty directory */
	if (!table || !table->procname)
		return 1;

	for (entry = table; entry->procname; entry++) {
		if (!entry->child) {
			/* Any number of files in this table share one header. */
			has_files = 1;
			continue;
		}
		from_children += count_subheaders(entry->child);
	}

	return from_children + has_files;
}
/*
 * Register @table and, recursively, every table reachable through its
 * ->child pointers, one leaf table per directory.
 *
 * @path:      start of the path buffer; on entry it holds the directory
 *             that @table lives in
 * @pos:       current end (terminating NUL) of the string in @path; child
 *             directory names are appended here while recursing and the
 *             buffer is truncated back to @pos afterwards
 * @subheader: cursor into the caller's array of header pointers; each
 *             header registered here is stored through it and the cursor
 *             is advanced
 * @set:       sysctl tree to register on
 * @table:     table to register
 *
 * Returns 0 on success, -ENOMEM if an allocation or a registration fails,
 * -ENAMETOOLONG if a child path does not fit in the path buffer, or the
 * error from a recursive call.  Headers registered before a failure are
 * left in place; the caller unregisters them.
 */
static int register_leaf_sysctl_tables(const char *path, char *pos,
	struct ctl_table_header ***subheader, struct ctl_table_set *set,
	struct ctl_table *table)
{
	struct ctl_table *ctl_table_arg = NULL;
	struct ctl_table *entry, *files;
	int nr_files = 0;
	int nr_dirs = 0;
	int err = -ENOMEM;

	for (entry = table; entry->procname; entry++) {
		if (entry->child)
			nr_dirs++;
		else
			nr_files++;
	}

	files = table;
	/* If there are mixed files and directories we need a new table */
	if (nr_dirs && nr_files) {
		struct ctl_table *new;

		/*
		 * One extra zeroed slot terminates the copy.  kcalloc() is
		 * used instead of an open-coded multiplication so the size
		 * computation is overflow checked.
		 */
		files = kcalloc(nr_files + 1, sizeof(struct ctl_table),
				GFP_KERNEL);
		if (!files)
			goto out;

		ctl_table_arg = files;
		for (new = files, entry = table; entry->procname; entry++) {
			if (entry->child)
				continue;
			*new = *entry;
			new++;
		}
	}

	/* Register everything except a directory full of subdirectories */
	if (nr_files || !nr_dirs) {
		struct ctl_table_header *header;

		header = __register_sysctl_table(set, path, files);
		if (!header) {
			/* kfree(NULL) is fine when no copy was made. */
			kfree(ctl_table_arg);
			goto out;
		}

		/* Remember if we need to free the file table */
		header->ctl_table_arg = ctl_table_arg;
		**subheader = header;
		(*subheader)++;
	}

	/* Recurse into the subdirectories. */
	for (entry = table; entry->procname; entry++) {
		char *child_pos;

		if (!entry->child)
			continue;

		err = -ENAMETOOLONG;
		child_pos = append_path(path, pos, entry->procname);
		if (!child_pos)
			goto out;

		err = register_leaf_sysctl_tables(path, child_pos, subheader,
						  set, entry->child);
		/* Cut the child's name off again before the next sibling. */
		pos[0] = '\0';
		if (err)
			goto out;
	}
	err = 0;
out:
	/* On failure our caller will unregister all registered subheaders */
	return err;
}
/**
 * __register_sysctl_paths - register a sysctl table hierarchy
 * @set: Sysctl tree to register on
 * @path: The path to the directory the sysctl table is in; an array of
 *        components terminated by an entry whose procname is %NULL.
 * @table: the top-level table structure
 *
 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
 * array. A completely 0 filled entry terminates the table.
 *
 * Returns %NULL on failure. When the hierarchy needs more than one leaf
 * table, the returned header is a container that is followed in memory by
 * an array of pointers to the per-leaf headers.
 *
 * See __register_sysctl_table for more details.
 */
struct ctl_table_header *__register_sysctl_paths(
	struct ctl_table_set *set,
	const struct ctl_path *path, struct ctl_table *table)
{
	struct ctl_table *ctl_table_arg = table;
	int nr_subheaders = count_subheaders(table);
	struct ctl_table_header *header = NULL;
	struct ctl_table_header **subheaders, **subheader;
	const struct ctl_path *component;
	char *new_path, *pos;

	new_path = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!new_path)
		return NULL;
	new_path[0] = '\0';
	pos = new_path;

	/* Flatten the ctl_path array into a "a/b/c/" style string. */
	for (component = path; component->procname; component++) {
		pos = append_path(new_path, pos, component->procname);
		if (!pos)
			goto out;
	}

	/*
	 * While the table consists of exactly one directory entry, fold that
	 * directory into the path and descend into its child table.
	 */
	for (; table->procname && table->child && !table[1].procname;
	     table = table->child) {
		pos = append_path(new_path, pos, table->procname);
		if (!pos)
			goto out;
	}

	if (nr_subheaders == 1) {
		header = __register_sysctl_table(set, new_path, table);
		if (header)
			header->ctl_table_arg = ctl_table_arg;
		goto out;
	}

	/* Container header plus one pointer slot per leaf table. */
	header = kzalloc(sizeof(*header) +
			 sizeof(*subheaders)*nr_subheaders, GFP_KERNEL);
	if (!header)
		goto out;

	subheaders = (struct ctl_table_header **) (header + 1);
	subheader = subheaders;
	header->ctl_table_arg = ctl_table_arg;

	if (!register_leaf_sysctl_tables(new_path, pos, &subheader,
					 set, table))
		goto out;

	/* Registration failed part way: undo the leaves in reverse order. */
	while (subheader > subheaders) {
		struct ctl_table_header *subh = *(--subheader);
		/* Read before unregistering; unregistering may free subh. */
		struct ctl_table *leaf_table = subh->ctl_table_arg;

		unregister_sysctl_table(subh);
		kfree(leaf_table);
	}
	kfree(header);
	header = NULL;

out:
	kfree(new_path);
	return header;
}
/**
* register_sysctl_table_path - register a sysctl table hierarchy
* @path: The path to the directory the sysctl table is in.
* @table: the top-level table structure
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
*
* See __register_sysctl_paths for more details.
*/
struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
struct ctl_table *table)
{
return __register_sysctl_paths(&sysctl_table_root.default_set,
path, table);
}
EXPORT_SYMBOL(register_sysctl_paths);
/**
* register_sysctl_table - register a sysctl table hierarchy
* @table: the top-level table structure
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
*
* See register_sysctl_paths for more details.
*/
struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
{
static const struct ctl_path null_path[] = { {} };
return register_sysctl_paths(null_path, table);
}
EXPORT_SYMBOL(register_sysctl_table);
/*
 * For a header that belongs to a set other than the default set of
 * sysctl_table_root, drop one reference on the header of each matching
 * entry found in the default set's counterpart directory (as returned by
 * xlate_dir()).  An entry matches when both it and the header's entry are
 * directories, or when it is a symlink whose data points at this header's
 * root.  A missing or non-matching entry is reported with pr_err().
 */
static void put_links(struct ctl_table_header *header)
{
	struct ctl_table_set *root_set = &sysctl_table_root.default_set;
	struct ctl_table_root *root = header->root;
	struct ctl_dir *parent = header->parent;
	struct ctl_dir *core_parent;
	struct ctl_table *entry;

	/* Headers in the default set have nothing to undo here. */
	if (header->set == root_set)
		return;

	core_parent = xlate_dir(root_set, parent);
	if (IS_ERR(core_parent))
		return;

	for (entry = header->ctl_table; entry->procname; entry++) {
		const char *name = entry->procname;
		struct ctl_table_header *link_head;
		struct ctl_table *link;

		link = find_entry(&link_head, core_parent, name, strlen(name));

		/* A directory on both sides. */
		if (link && S_ISDIR(link->mode) && S_ISDIR(entry->mode)) {
			drop_sysctl_table(link_head);
			continue;
		}

		/* A symlink that refers to this header's root. */
		if (link && S_ISLNK(link->mode) && link->data == root) {
			drop_sysctl_table(link_head);
			continue;
		}

		pr_err("sysctl link missing during unregister: ");
		sysctl_print_dir(parent);
		pr_cont("/%s\n", name);
	}
}
/*
 * Drop one registration reference (nreg) on @header.
 *
 * When the last registration reference goes away the header is unlinked
 * (put_links() and start_unregistering()), its use count is dropped and
 * the header is freed via RCU once that count reaches zero.  The
 * reference on the parent directory is then dropped as well, so the
 * teardown can cascade up the tree.
 *
 * The callers visible in this file invoke this with sysctl_lock held.
 */
static void drop_sysctl_table(struct ctl_table_header *header)
{
	/* Sample the parent now: @header may be freed further down. */
	struct ctl_dir *parent = header->parent;

	if (--header->nreg)
		return;

	/*
	 * A header without a parent is not linked under any directory, so
	 * there is nothing to unlink.  put_links() hands the parent straight
	 * to xlate_dir(), which would be a NULL pointer dereference for a
	 * header outside the default set, so skip both steps in that case.
	 */
	if (parent) {
		put_links(header);
		start_unregistering(header);
	}

	if (!--header->count)
		kfree_rcu(header, rcu);

	if (parent)
		drop_sysctl_table(&parent->header);
}
/**
 * unregister_sysctl_table - unregister a sysctl table hierarchy
 * @header: the header returned from register_sysctl_table
 *
 * Unregisters the sysctl table and all children. proc entries may not
 * actually be removed until they are no longer used by anyone.
 *
 * A %NULL @header is ignored. May sleep.
 */
void unregister_sysctl_table(struct ctl_table_header * header)
{
	int nr_subheaders;

	might_sleep();

	if (!header)
		return;

	nr_subheaders = count_subheaders(header->ctl_table_arg);
	if (unlikely(nr_subheaders > 1)) {
		/*
		 * Container header: the per-leaf header pointers are stored
		 * directly behind it.  Tear them down last to first.
		 */
		struct ctl_table_header **subheaders =
			(struct ctl_table_header **)(header + 1);

		while (nr_subheaders-- > 0) {
			struct ctl_table_header *subh = subheaders[nr_subheaders];
			/* Read before unregistering; that may free subh. */
			struct ctl_table *leaf_table = subh->ctl_table_arg;

			unregister_sysctl_table(subh);
			kfree(leaf_table);
		}
		kfree(header);
		return;
	}

	/* Plain single-table header. */
	spin_lock(&sysctl_lock);
	drop_sysctl_table(header);
	spin_unlock(&sysctl_lock);
}
EXPORT_SYMBOL(unregister_sysctl_table);
/*
 * Initialise @set from scratch: zero it, record the @is_seen callback and
 * initialise the header of its top-level directory for @root using
 * root_table.
 */
void setup_sysctl_set(struct ctl_table_set *set,
	struct ctl_table_root *root,
	int (*is_seen)(struct ctl_table_set *))
{
	struct ctl_table_header *dir_header = &set->dir.header;

	memset(set, 0, sizeof(*set));
	set->is_seen = is_seen;
	init_header(dir_header, root, set, NULL, root_table);
}
/*
 * Called when @set is being retired.  Nothing is freed here; this only
 * warns if the rbtree of the set's top-level directory is not empty.
 */
void retire_sysctl_set(struct ctl_table_set *set)
{
	struct rb_root *entries = &set->dir.root;

	WARN_ON(!RB_EMPTY_ROOT(entries));
}
/*
 * Create the "sys" directory at the top of procfs, hook up the sysctl
 * inode and file operations for it, and then run sysctl_init().
 *
 * Returns -ENOMEM if the directory entry cannot be created, otherwise the
 * result of sysctl_init().
 */
int __init proc_sys_init(void)
{
	struct proc_dir_entry *proc_sys_root;

	proc_sys_root = proc_mkdir("sys", NULL);
	/* proc_mkdir() returns NULL on failure; do not dereference it. */
	if (!proc_sys_root)
		return -ENOMEM;

	proc_sys_root->proc_iops = &proc_sys_dir_operations;
	proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
	proc_sys_root->nlink = 0;

	return sysctl_init();
}