Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull user namespace changes from Eric Biederman: "While small, this set of changes is very significant with respect to containers in general and user namespaces in particular. The user space interface is now complete. This set of changes adds support for unprivileged users to create user namespaces and, as a user namespace root, to create other namespaces. The tyranny of supporting suid root preventing unprivileged users from using cool new kernel features is broken. This set of changes completes the work on setns, adding support for the pid, user, and mount namespaces. This set of changes includes a bunch of basic pid namespace cleanups/simplifications. Of particular significance is the rework of the pid namespace cleanup so it no longer requires sending out tendrils into all kinds of unexpected cleanup paths for operation. At least one case of broken error handling is fixed by this cleanup. The files under /proc/<pid>/ns/ have been converted from regular files to magic symlinks, which prevents incorrect caching by the VFS, ensuring the files always refer to the namespace the process is currently using and ensuring that the ptrace_may_access permission checks are always applied. The files under /proc/<pid>/ns/ have been given stable inode numbers so it is now possible to see if different processes share the same namespaces. Through David Miller's net tree are changes to relax many of the permission checks in the networking stack, allowing the user namespace root to usefully use the networking stack. Similar changes for the mount namespace and the pid namespace are coming through my tree. Two small changes to add user namespace support were committed here and in David Miller's -net tree so that I could complete the work on the /proc/<pid>/ns/ files in this tree.
Work remains to make it safe to build user namespaces and 9p, afs, ceph, cifs, coda, gfs2, ncpfs, nfs, nfsd, ocfs2, and xfs, so the Kconfig guard remains in place, preventing user namespaces from being built when any of those filesystems are enabled. Future design work remains to allow root users outside of the initial user namespace to mount more than just /proc and /sys." * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (38 commits) proc: Usable inode numbers for the namespace file descriptors. proc: Fix the namespace inode permission checks. proc: Generalize proc inode allocation userns: Allow unprivilged mounts of proc and sysfs userns: For /proc/self/{uid,gid}_map derive the lower userns from the struct file procfs: Print task uids and gids in the userns that opened the proc file userns: Implement unshare of the user namespace userns: Implent proc namespace operations userns: Kill task_user_ns userns: Make create_new_namespaces take a user_ns parameter userns: Allow unprivileged use of setns. userns: Allow unprivileged users to create new namespaces userns: Allow setting a userns mapping to your current uid. userns: Allow chown and setgid preservation userns: Allow unprivileged users to create user namespaces. userns: Ignore suid and sgid on binaries if the uid or gid can not be mapped userns: fix return value on mntns_install() failure vfs: Allow unprivileged manipulation of the mount namespace. vfs: Only support slave subtrees across different user namespaces vfs: Add a user namespace reference from struct mnt_namespace ...
2012-12-17 15:44:47 -08:00 · 2012-12-17 15:44:47 -08:00 · 6a2b60b17b
commit 6a2b60b17b
parent 9228ff9038 98f842e675
59 changed files with 996 additions and 451 deletions
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@ -1094,7 +1094,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private)
 		LOAD_INT(c), LOAD_FRAC(c),
 		count_active_contexts(),
 		atomic_read(&nr_spu_contexts),
-		current->nsproxy->pid_ns->last_pid);
+		task_active_pid_ns(current)->last_pid);
 	return 0;
 }

--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@ -123,7 +123,7 @@ void mconsole_log(struct mc_request *req)

 void mconsole_proc(struct mc_request *req)
 {
-	struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt;
+	struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt;
 	char *buf;
 	int len;
 	struct file *file;
--- a/drivers/staging/android/binder.c
+++ b/drivers/staging/android/binder.c
@ -35,6 +35,7 @@
 #include <linux/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
+#include <linux/pid_namespace.h>

 #include "binder.h"
 #include "binder_trace.h"
@ -2320,7 +2321,7 @@ retry:
 		if (t->from) {
 			struct task_struct *sender = t->from->proc->tsk;
 			tr.sender_pid = task_tgid_nr_ns(sender,
-							current->nsproxy->pid_ns);
+							task_active_pid_ns(current));
 		} else {
 			tr.sender_pid = 0;
 		}
--- a/fs/attr.c
+++ b/fs/attr.c
@ -49,14 +49,15 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
 	/* Make sure a caller can chown. */
 	if ((ia_valid & ATTR_UID) &&
 	    (!uid_eq(current_fsuid(), inode->i_uid) ||
-	     !uid_eq(attr->ia_uid, inode->i_uid)) && !capable(CAP_CHOWN))
+	     !uid_eq(attr->ia_uid, inode->i_uid)) &&
+	    !inode_capable(inode, CAP_CHOWN))
 		return -EPERM;

 	/* Make sure caller can chgrp. */
 	if ((ia_valid & ATTR_GID) &&
 	    (!uid_eq(current_fsuid(), inode->i_uid) ||
 	    (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) &&
-	    !capable(CAP_CHOWN))
+	    !inode_capable(inode, CAP_CHOWN))
 		return -EPERM;

 	/* Make sure a caller can chmod. */
@ -65,7 +66,8 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
 			return -EPERM;
 		/* Also check the setgid bit! */
 		if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
-				inode->i_gid) && !capable(CAP_FSETID))
+				inode->i_gid) &&
+		    !inode_capable(inode, CAP_FSETID))
 			attr->ia_mode &= ~S_ISGID;
 	}

@ -157,7 +159,8 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
 	if (ia_valid & ATTR_MODE) {
 		umode_t mode = attr->ia_mode;

-		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+		if (!in_group_p(inode->i_gid) &&
+		    !inode_capable(inode, CAP_FSETID))
 			mode &= ~S_ISGID;
 		inode->i_mode = mode;
 	}
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@ -74,8 +74,8 @@ struct autofs_info {
 	unsigned long last_used;
 	atomic_t count;

-	uid_t uid;
-	gid_t gid;
+	kuid_t uid;
+	kgid_t gid;
 };

 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
@ -89,8 +89,8 @@ struct autofs_wait_queue {
 	struct qstr name;
 	u32 dev;
 	u64 ino;
-	uid_t uid;
-	gid_t gid;
+	kuid_t uid;
+	kgid_t gid;
 	pid_t pid;
 	pid_t tgid;
 	/* This is for status reporting upon return */
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@ -437,8 +437,8 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 		err = 0;
 		autofs4_expire_wait(path.dentry);
 		spin_lock(&sbi->fs_lock);
-		param->requester.uid = ino->uid;
-		param->requester.gid = ino->gid;
+		param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
+		param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
 		spin_unlock(&sbi->fs_lock);
 	}
 	path_put(&path);
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@ -36,8 +36,8 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)

 void autofs4_clean_ino(struct autofs_info *ino)
 {
-	ino->uid = 0;
-	ino->gid = 0;
+	ino->uid = GLOBAL_ROOT_UID;
+	ino->gid = GLOBAL_ROOT_GID;
 	ino->last_used = jiffies;
 }

@ -79,10 +79,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
 		return 0;

 	seq_printf(m, ",fd=%d", sbi->pipefd);
-	if (root_inode->i_uid != 0)
-		seq_printf(m, ",uid=%u", root_inode->i_uid);
-	if (root_inode->i_gid != 0)
-		seq_printf(m, ",gid=%u", root_inode->i_gid);
+	if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID))
+		seq_printf(m, ",uid=%u",
+			from_kuid_munged(&init_user_ns, root_inode->i_uid));
+	if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID))
+		seq_printf(m, ",gid=%u",
+			from_kgid_munged(&init_user_ns, root_inode->i_gid));
 	seq_printf(m, ",pgrp=%d", sbi->oz_pgrp);
 	seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ);
 	seq_printf(m, ",minproto=%d", sbi->min_proto);
@ -126,7 +128,7 @@ static const match_table_t tokens = {
 	{Opt_err, NULL}
 };

-static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
+static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
 		pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto)
 {
 	char *p;
@ -159,12 +161,16 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
 		case Opt_uid:
 			if (match_int(args, &option))
 				return 1;
-			*uid = option;
+			*uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(*uid))
+				return 1;
 			break;
 		case Opt_gid:
 			if (match_int(args, &option))
 				return 1;
-			*gid = option;
+			*gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(*gid))
+				return 1;
 			break;
 		case Opt_pgrp:
 			if (match_int(args, &option))
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@ -154,6 +154,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 	case autofs_ptype_expire_direct:
 	{
 		struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
+		struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns;

 		pktsz = sizeof(*packet);

@ -163,8 +164,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 		packet->name[wq->name.len] = '\0';
 		packet->dev = wq->dev;
 		packet->ino = wq->ino;
-		packet->uid = wq->uid;
-		packet->gid = wq->gid;
+		packet->uid = from_kuid_munged(user_ns, wq->uid);
+		packet->gid = from_kgid_munged(user_ns, wq->gid);
 		packet->pid = wq->pid;
 		packet->tgid = wq->tgid;
 		break;
--- a/fs/exec.c
+++ b/fs/exec.c
@ -1266,14 +1266,13 @@ int prepare_binprm(struct linux_binprm *bprm)
 	bprm->cred->egid = current_egid();

 	if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
-	    !current->no_new_privs) {
+	    !current->no_new_privs &&
+	    kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) &&
+	    kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) {
 		/* Set-uid? */
 		if (mode & S_ISUID) {
-			if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid))
-				return -EPERM;
 			bprm->per_clear |= PER_CLEAR_ON_SETID;
 			bprm->cred->euid = inode->i_uid;
-
 		}

 		/* Set-gid? */
@ -1283,8 +1282,6 @@ int prepare_binprm(struct linux_binprm *bprm)
 		 * executable.
 		 */
 		if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
-			if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid))
-				return -EPERM;
 			bprm->per_clear |= PER_CLEAR_ON_SETID;
 			bprm->cred->egid = inode->i_gid;
 		}
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@ -92,8 +92,8 @@ static void __fuse_put_request(struct fuse_req *req)

 static void fuse_req_init_context(struct fuse_req *req)
 {
-	req->in.h.uid = current_fsuid();
-	req->in.h.gid = current_fsgid();
+	req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
+	req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
 	req->in.h.pid = current->pid;
 }

--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@ -818,8 +818,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
 	stat->ino = attr->ino;
 	stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
 	stat->nlink = attr->nlink;
-	stat->uid = attr->uid;
-	stat->gid = attr->gid;
+	stat->uid = make_kuid(&init_user_ns, attr->uid);
+	stat->gid = make_kgid(&init_user_ns, attr->gid);
 	stat->rdev = inode->i_rdev;
 	stat->atime.tv_sec = attr->atime;
 	stat->atime.tv_nsec = attr->atimensec;
@ -1007,12 +1007,12 @@ int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
 	rcu_read_lock();
 	ret = 0;
 	cred = __task_cred(task);
-	if (cred->euid == fc->user_id &&
-	    cred->suid == fc->user_id &&
-	    cred->uid  == fc->user_id &&
-	    cred->egid == fc->group_id &&
-	    cred->sgid == fc->group_id &&
-	    cred->gid  == fc->group_id)
+	if (uid_eq(cred->euid, fc->user_id) &&
+	    uid_eq(cred->suid, fc->user_id) &&
+	    uid_eq(cred->uid,  fc->user_id) &&
+	    gid_eq(cred->egid, fc->group_id) &&
+	    gid_eq(cred->sgid, fc->group_id) &&
+	    gid_eq(cred->gid,  fc->group_id))
 		ret = 1;
 	rcu_read_unlock();

@ -1306,9 +1306,9 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
 	if (ivalid & ATTR_MODE)
 		arg->valid |= FATTR_MODE,   arg->mode = iattr->ia_mode;
 	if (ivalid & ATTR_UID)
-		arg->valid |= FATTR_UID,    arg->uid = iattr->ia_uid;
+		arg->valid |= FATTR_UID,    arg->uid = from_kuid(&init_user_ns, iattr->ia_uid);
 	if (ivalid & ATTR_GID)
-		arg->valid |= FATTR_GID,    arg->gid = iattr->ia_gid;
+		arg->valid |= FATTR_GID,    arg->gid = from_kgid(&init_user_ns, iattr->ia_gid);
 	if (ivalid & ATTR_SIZE)
 		arg->valid |= FATTR_SIZE,   arg->size = iattr->ia_size;
 	if (ivalid & ATTR_ATIME) {
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@ -333,10 +333,10 @@ struct fuse_conn {
 	atomic_t count;

 	/** The user id for this mount */
-	uid_t user_id;
+	kuid_t user_id;

 	/** The group id for this mount */
-	gid_t group_id;
+	kgid_t group_id;

 	/** The fuse mount flags for this mount */
 	unsigned flags;
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@ -60,8 +60,8 @@ MODULE_PARM_DESC(max_user_congthresh,
 struct fuse_mount_data {
 	int fd;
 	unsigned rootmode;
-	unsigned user_id;
-	unsigned group_id;
+	kuid_t user_id;
+	kgid_t group_id;
 	unsigned fd_present:1;
 	unsigned rootmode_present:1;
 	unsigned user_id_present:1;
@ -164,8 +164,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
 	inode->i_ino     = fuse_squash_ino(attr->ino);
 	inode->i_mode    = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
 	set_nlink(inode, attr->nlink);
-	inode->i_uid     = attr->uid;
-	inode->i_gid     = attr->gid;
+	inode->i_uid     = make_kuid(&init_user_ns, attr->uid);
+	inode->i_gid     = make_kgid(&init_user_ns, attr->gid);
 	inode->i_blocks  = attr->blocks;
 	inode->i_atime.tv_sec   = attr->atime;
 	inode->i_atime.tv_nsec  = attr->atimensec;
@ -492,14 +492,18 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
 		case OPT_USER_ID:
 			if (match_int(&args[0], &value))
 				return 0;
-			d->user_id = value;
+			d->user_id = make_kuid(current_user_ns(), value);
+			if (!uid_valid(d->user_id))
+				return 0;
 			d->user_id_present = 1;
 			break;

 		case OPT_GROUP_ID:
 			if (match_int(&args[0], &value))
 				return 0;
-			d->group_id = value;
+			d->group_id = make_kgid(current_user_ns(), value);
+			if (!gid_valid(d->group_id))
+				return 0;
 			d->group_id_present = 1;
 			break;

@ -540,8 +544,8 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
 	struct super_block *sb = root->d_sb;
 	struct fuse_conn *fc = get_fuse_conn_super(sb);

-	seq_printf(m, ",user_id=%u", fc->user_id);
-	seq_printf(m, ",group_id=%u", fc->group_id);
+	seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id));
+	seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));
 	if (fc->flags & FUSE_DEFAULT_PERMISSIONS)
 		seq_puts(m, ",default_permissions");
 	if (fc->flags & FUSE_ALLOW_OTHER)
@ -989,7 +993,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (!file)
 		goto err;

-	if (file->f_op != &fuse_dev_operations)
+	if ((file->f_op != &fuse_dev_operations) ||
+	    (file->f_cred->user_ns != &init_user_ns))
 		goto err_fput;

 	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@ -710,7 +710,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
 	struct vfsmount *proc_mnt;
 	int err = -ENOENT;

-	proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt);
+	proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt);
 	if (IS_ERR(proc_mnt))
 		goto out;

--- a/fs/mount.h
+++ b/fs/mount.h
@ -4,8 +4,11 @@

 struct mnt_namespace {
 	atomic_t		count;
+	unsigned int		proc_inum;
 	struct mount *	root;
 	struct list_head	list;
+	struct user_namespace	*user_ns;
+	u64			seq;	/* Sequence number to prevent loops */
 	wait_queue_head_t poll;
 	int event;
 };
--- a/fs/namespace.c
+++ b/fs/namespace.c
@ -12,6 +12,7 @@
 #include <linux/export.h>
 #include <linux/capability.h>
 #include <linux/mnt_namespace.h>
+#include <linux/user_namespace.h>
 #include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/idr.h>
@ -20,6 +21,7 @@
 #include <linux/fs_struct.h>	/* get_fs_root et.al. */
 #include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
 #include <linux/uaccess.h>
+#include <linux/proc_fs.h>
 #include "pnode.h"
 #include "internal.h"

@ -784,7 +786,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	if (!mnt)
 		return ERR_PTR(-ENOMEM);

-	if (flag & (CL_SLAVE | CL_PRIVATE))
+	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
 		mnt->mnt_group_id = 0; /* not a peer of original */
 	else
 		mnt->mnt_group_id = old->mnt_group_id;
@ -805,7 +807,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
 	br_write_unlock(&vfsmount_lock);

-	if (flag & CL_SLAVE) {
+	if ((flag & CL_SLAVE) ||
+	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
 		list_add(&mnt->mnt_slave, &old->mnt_slave_list);
 		mnt->mnt_master = old;
 		CLEAR_MNT_SHARED(mnt);
@ -1266,7 +1269,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
 		goto dput_and_out;

 	retval = -EPERM;
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
 		goto dput_and_out;

 	retval = do_umount(mnt, flags);
@ -1292,7 +1295,7 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)

 static int mount_is_safe(struct path *path)
 {
-	if (capable(CAP_SYS_ADMIN))
+	if (ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN))
 		return 0;
 	return -EPERM;
 #ifdef notyet
@ -1308,6 +1311,26 @@ static int mount_is_safe(struct path *path)
 #endif
 }

+static bool mnt_ns_loop(struct path *path)
+{
+	/* Could bind mounting the mount namespace inode cause a
+	 * mount namespace loop?
+	 */
+	struct inode *inode = path->dentry->d_inode;
+	struct proc_inode *ei;
+	struct mnt_namespace *mnt_ns;
+
+	if (!proc_ns_inode(inode))
+		return false;
+
+	ei = PROC_I(inode);
+	if (ei->ns_ops != &mntns_operations)
+		return false;
+
+	mnt_ns = ei->ns;
+	return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
+}
+
 struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 					int flag)
 {
@ -1610,7 +1633,7 @@ static int do_change_type(struct path *path, int flag)
 	int type;
 	int err = 0;

-	if (!capable(CAP_SYS_ADMIN))
+	if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;

 	if (path->dentry != path->mnt->mnt_root)
@ -1655,6 +1678,10 @@ static int do_loopback(struct path *path, const char *old_name,
 	if (err)
 		return err;

+	err = -EINVAL;
+	if (mnt_ns_loop(&old_path))
+		goto out; 
+
 	err = lock_mount(path);
 	if (err)
 		goto out;
@ -1770,7 +1797,7 @@ static int do_move_mount(struct path *path, const char *old_name)
 	struct mount *p;
 	struct mount *old;
 	int err = 0;
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 	if (!old_name || !*old_name)
 		return -EINVAL;
@ -1857,21 +1884,6 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
 	return ERR_PTR(err);
 }

-static struct vfsmount *
-do_kern_mount(const char *fstype, int flags, const char *name, void *data)
-{
-	struct file_system_type *type = get_fs_type(fstype);
-	struct vfsmount *mnt;
-	if (!type)
-		return ERR_PTR(-ENODEV);
-	mnt = vfs_kern_mount(type, flags, name, data);
-	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
-	    !mnt->mnt_sb->s_subtype)
-		mnt = fs_set_subtype(mnt, fstype);
-	put_filesystem(type);
-	return mnt;
-}
-
 /*
 * add a mount into a namespace's mount tree
 */
@ -1917,20 +1929,46 @@ unlock:
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 */
-static int do_new_mount(struct path *path, const char *type, int flags,
+static int do_new_mount(struct path *path, const char *fstype, int flags,
 			int mnt_flags, const char *name, void *data)
 {
+	struct file_system_type *type;
+	struct user_namespace *user_ns;
 	struct vfsmount *mnt;
 	int err;

-	if (!type)
+	if (!fstype)
 		return -EINVAL;

 	/* we need capabilities... */
-	if (!capable(CAP_SYS_ADMIN))
+	user_ns = real_mount(path->mnt)->mnt_ns->user_ns;
+	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
 		return -EPERM;

-	mnt = do_kern_mount(type, flags, name, data);
+	type = get_fs_type(fstype);
+	if (!type)
+		return -ENODEV;
+
+	if (user_ns != &init_user_ns) {
+		if (!(type->fs_flags & FS_USERNS_MOUNT)) {
+			put_filesystem(type);
+			return -EPERM;
+		}
+		/* Only in special cases allow devices from mounts
+		 * created outside the initial user namespace.
+		 */
+		if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
+			flags |= MS_NODEV;
+			mnt_flags |= MNT_NODEV;
+		}
+	}
+
+	mnt = vfs_kern_mount(type, flags, name, data);
+	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
+	    !mnt->mnt_sb->s_subtype)
+		mnt = fs_set_subtype(mnt, fstype);
+
+	put_filesystem(type);
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);

@ -2261,18 +2299,42 @@ dput_out:
 	return retval;
 }

-static struct mnt_namespace *alloc_mnt_ns(void)
+static void free_mnt_ns(struct mnt_namespace *ns)
+{
+	proc_free_inum(ns->proc_inum);
+	put_user_ns(ns->user_ns);
+	kfree(ns);
+}
+
+/*
+ * Assign a sequence number so we can detect when we attempt to bind
+ * mount a reference to an older mount namespace into the current
+ * mount namespace, preventing reference counting loops.  A 64bit
+ * number incrementing at 10Ghz will take 12,427 years to wrap which
+ * is effectively never, so we can ignore the possibility.
+ */
+static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
+
+static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
 {
 	struct mnt_namespace *new_ns;
+	int ret;

 	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
 	if (!new_ns)
 		return ERR_PTR(-ENOMEM);
+	ret = proc_alloc_inum(&new_ns->proc_inum);
+	if (ret) {
+		kfree(new_ns);
+		return ERR_PTR(ret);
+	}
+	new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
 	atomic_set(&new_ns->count, 1);
 	new_ns->root = NULL;
 	INIT_LIST_HEAD(&new_ns->list);
 	init_waitqueue_head(&new_ns->poll);
 	new_ns->event = 0;
+	new_ns->user_ns = get_user_ns(user_ns);
 	return new_ns;
 }

@ -2281,24 +2343,28 @@ static struct mnt_namespace *alloc_mnt_ns(void)
 * copied from the namespace of the passed in task structure.
 */
 static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
-		struct fs_struct *fs)
+		struct user_namespace *user_ns, struct fs_struct *fs)
 {
 	struct mnt_namespace *new_ns;
 	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
 	struct mount *p, *q;
 	struct mount *old = mnt_ns->root;
 	struct mount *new;
+	int copy_flags;

-	new_ns = alloc_mnt_ns();
+	new_ns = alloc_mnt_ns(user_ns);
 	if (IS_ERR(new_ns))
 		return new_ns;

 	down_write(&namespace_sem);
 	/* First pass: copy the tree topology */
-	new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE);
+	copy_flags = CL_COPY_ALL | CL_EXPIRE;
+	if (user_ns != mnt_ns->user_ns)
+		copy_flags |= CL_SHARED_TO_SLAVE;
+	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
 	if (IS_ERR(new)) {
 		up_write(&namespace_sem);
-		kfree(new_ns);
+		free_mnt_ns(new_ns);
 		return ERR_CAST(new);
 	}
 	new_ns->root = new;
@ -2339,7 +2405,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 }

 struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
-		struct fs_struct *new_fs)
+		struct user_namespace *user_ns, struct fs_struct *new_fs)
 {
 	struct mnt_namespace *new_ns;

@ -2349,7 +2415,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 	if (!(flags & CLONE_NEWNS))
 		return ns;

-	new_ns = dup_mnt_ns(ns, new_fs);
+	new_ns = dup_mnt_ns(ns, user_ns, new_fs);

 	put_mnt_ns(ns);
 	return new_ns;
@ -2361,7 +2427,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 */
 static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
 {
-	struct mnt_namespace *new_ns = alloc_mnt_ns();
+	struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
 	if (!IS_ERR(new_ns)) {
 		struct mount *mnt = real_mount(m);
 		mnt->mnt_ns = new_ns;
@ -2501,7 +2567,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	struct mount *new_mnt, *root_mnt;
 	int error;

-	if (!capable(CAP_SYS_ADMIN))
+	if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;

 	error = user_path_dir(new_root, &new);
@ -2583,8 +2649,13 @@ static void __init init_mount_tree(void)
 	struct vfsmount *mnt;
 	struct mnt_namespace *ns;
 	struct path root;
+	struct file_system_type *type;

-	mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
+	type = get_fs_type("rootfs");
+	if (!type)
+		panic("Can't find rootfs type");
+	mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
+	put_filesystem(type);
 	if (IS_ERR(mnt))
 		panic("Can't create rootfs");

@ -2647,7 +2718,7 @@ void put_mnt_ns(struct mnt_namespace *ns)
 	br_write_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
-	kfree(ns);
+	free_mnt_ns(ns);
 }

 struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
@ -2681,3 +2752,71 @@ bool our_mnt(struct vfsmount *mnt)
 {
 	return check_mnt(real_mount(mnt));
 }
+
+static void *mntns_get(struct task_struct *task)
+{
+	struct mnt_namespace *ns = NULL;
+	struct nsproxy *nsproxy;
+
+	rcu_read_lock();
+	nsproxy = task_nsproxy(task);
+	if (nsproxy) {
+		ns = nsproxy->mnt_ns;
+		get_mnt_ns(ns);
+	}
+	rcu_read_unlock();
+
+	return ns;
+}
+
+static void mntns_put(void *ns)
+{
+	put_mnt_ns(ns);
+}
+
+static int mntns_install(struct nsproxy *nsproxy, void *ns)
+{
+	struct fs_struct *fs = current->fs;
+	struct mnt_namespace *mnt_ns = ns;
+	struct path root;
+
+	if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
+	    !nsown_capable(CAP_SYS_CHROOT))
+		return -EPERM;
+
+	if (fs->users != 1)
+		return -EINVAL;
+
+	get_mnt_ns(mnt_ns);
+	put_mnt_ns(nsproxy->mnt_ns);
+	nsproxy->mnt_ns = mnt_ns;
+
+	/* Find the root */
+	root.mnt    = &mnt_ns->root->mnt;
+	root.dentry = mnt_ns->root->mnt.mnt_root;
+	path_get(&root);
+	while(d_mountpoint(root.dentry) && follow_down_one(&root))
+		;
+
+	/* Update the pwd and root */
+	set_fs_pwd(fs, &root);
+	set_fs_root(fs, &root);
+
+	path_put(&root);
+	return 0;
+}
+
+static unsigned int mntns_inum(void *ns)
+{
+	struct mnt_namespace *mnt_ns = ns;
+	return mnt_ns->proc_inum;
+}
+
+const struct proc_ns_operations mntns_operations = {
+	.name		= "mnt",
+	.type		= CLONE_NEWNS,
+	.get		= mntns_get,
+	.put		= mntns_put,
+	.install	= mntns_install,
+	.inum		= mntns_inum,
+};
--- a/fs/open.c
+++ b/fs/open.c
@ -435,7 +435,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
 		goto dput_and_out;

 	error = -EPERM;
-	if (!capable(CAP_SYS_CHROOT))
+	if (!nsown_capable(CAP_SYS_CHROOT))
 		goto dput_and_out;
 	error = security_path_chroot(&path);
 	if (error)
--- a/fs/pnode.h
+++ b/fs/pnode.h
@ -22,6 +22,7 @@
 #define CL_COPY_ALL 		0x04
 #define CL_MAKE_SHARED 		0x08
 #define CL_PRIVATE 		0x10
+#define CL_SHARED_TO_SLAVE	0x20

 static inline void set_mnt_shared(struct mount *mnt)
 {
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@ -21,6 +21,7 @@ proc-y	+= uptime.o
 proc-y	+= version.o
 proc-y	+= softirqs.o
 proc-y	+= namespaces.o
+proc-y	+= self.o
 proc-$(CONFIG_PROC_SYSCTL)	+= proc_sysctl.o
 proc-$(CONFIG_NET)		+= proc_net.o
 proc-$(CONFIG_PROC_KCORE)	+= kcore.o
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@ -162,7 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk)
 static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 				struct pid *pid, struct task_struct *p)
 {
-	struct user_namespace *user_ns = current_user_ns();
+	struct user_namespace *user_ns = seq_user_ns(m);
 	struct group_info *group_info;
 	int g;
 	struct fdtable *fdt = NULL;
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@ -2345,146 +2345,6 @@ static const struct file_operations proc_coredump_filter_operations = {
 };
 #endif

-/*
- * /proc/self:
- */
-static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
-			      int buflen)
-{
-	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
-	pid_t tgid = task_tgid_nr_ns(current, ns);
-	char tmp[PROC_NUMBUF];
-	if (!tgid)
-		return -ENOENT;
-	sprintf(tmp, "%d", tgid);
-	return vfs_readlink(dentry,buffer,buflen,tmp);
-}
-
-static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
-	pid_t tgid = task_tgid_nr_ns(current, ns);
-	char *name = ERR_PTR(-ENOENT);
-	if (tgid) {
-		/* 11 for max length of signed int in decimal + NULL term */
-		name = kmalloc(12, GFP_KERNEL);
-		if (!name)
-			name = ERR_PTR(-ENOMEM);
-		else
-			sprintf(name, "%d", tgid);
-	}
-	nd_set_link(nd, name);
-	return NULL;
-}
-
-static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
-				void *cookie)
-{
-	char *s = nd_get_link(nd);
-	if (!IS_ERR(s))
-		kfree(s);
-}
-
-static const struct inode_operations proc_self_inode_operations = {
-	.readlink	= proc_self_readlink,
-	.follow_link	= proc_self_follow_link,
-	.put_link	= proc_self_put_link,
-};
-
-/*
- * proc base
- *
- * These are the directory entries in the root directory of /proc
- * that properly belong to the /proc filesystem, as they describe
- * describe something that is process related.
- */
-static const struct pid_entry proc_base_stuff[] = {
-	NOD("self", S_IFLNK|S_IRWXUGO,
-		&proc_self_inode_operations, NULL, {}),
-};
-
-static struct dentry *proc_base_instantiate(struct inode *dir,
-	struct dentry *dentry, struct task_struct *task, const void *ptr)
-{
-	const struct pid_entry *p = ptr;
-	struct inode *inode;
-	struct proc_inode *ei;
-	struct dentry *error;
-
-	/* Allocate the inode */
-	error = ERR_PTR(-ENOMEM);
-	inode = new_inode(dir->i_sb);
-	if (!inode)
-		goto out;
-
-	/* Initialize the inode */
-	ei = PROC_I(inode);
-	inode->i_ino = get_next_ino();
-	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-
-	/*
-	 * grab the reference to the task.
-	 */
-	ei->pid = get_task_pid(task, PIDTYPE_PID);
-	if (!ei->pid)
-		goto out_iput;
-
-	inode->i_mode = p->mode;
-	if (S_ISDIR(inode->i_mode))
-		set_nlink(inode, 2);
-	if (S_ISLNK(inode->i_mode))
-		inode->i_size = 64;
-	if (p->iop)
-		inode->i_op = p->iop;
-	if (p->fop)
-		inode->i_fop = p->fop;
-	ei->op = p->op;
-	d_add(dentry, inode);
-	error = NULL;
-out:
-	return error;
-out_iput:
-	iput(inode);
-	goto out;
-}
-
-static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
-{
-	struct dentry *error;
-	struct task_struct *task = get_proc_task(dir);
-	const struct pid_entry *p, *last;
-
-	error = ERR_PTR(-ENOENT);
-
-	if (!task)
-		goto out_no_task;
-
-	/* Lookup the directory entry */
-	last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1];
-	for (p = proc_base_stuff; p <= last; p++) {
-		if (p->len != dentry->d_name.len)
-			continue;
-		if (!memcmp(dentry->d_name.name, p->name, p->len))
-			break;
-	}
-	if (p > last)
-		goto out;
-
-	error = proc_base_instantiate(dir, dentry, task, p);
-
-out:
-	put_task_struct(task);
-out_no_task:
-	return error;
-}
-
-static int proc_base_fill_cache(struct file *filp, void *dirent,
-	filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
-{
-	return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
-				proc_base_instantiate, task, p);
-}
-
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
 {
@ -2839,10 +2699,6 @@ void proc_flush_task(struct task_struct *task)
 		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
 					tgid->numbers[i].nr);
 	}
-
-	upid = &pid->numbers[pid->level];
-	if (upid->nr == 1)
-		pid_ns_release_proc(upid->ns);
 }

 static struct dentry *proc_pid_instantiate(struct inode *dir,
@ -2876,15 +2732,11 @@ out:

 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
 {
-	struct dentry *result;
+	struct dentry *result = NULL;
 	struct task_struct *task;
 	unsigned tgid;
 	struct pid_namespace *ns;

-	result = proc_base_lookup(dir, dentry);
-	if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
-		goto out;
-
 	tgid = name_to_int(dentry);
 	if (tgid == ~0U)
 		goto out;
@ -2947,7 +2799,7 @@ retry:
 	return iter;
 }

-#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff))
+#define TGID_OFFSET (FIRST_PROCESS_ENTRY)

 static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 	struct tgid_iter iter)
@ -2967,25 +2819,12 @@ static int fake_filldir(void *buf, const char *name, int namelen,
 /* for the /proc/ directory itself, after non-process stuff has been done */
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
-	unsigned int nr;
-	struct task_struct *reaper;
 	struct tgid_iter iter;
 	struct pid_namespace *ns;
 	filldir_t __filldir;

 	if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
-		goto out_no_task;
-	nr = filp->f_pos - FIRST_PROCESS_ENTRY;
-
-	reaper = get_proc_task(filp->f_path.dentry->d_inode);
-	if (!reaper)
-		goto out_no_task;
-
-	for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
-		const struct pid_entry *p = &proc_base_stuff[nr];
-		if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
 		goto out;
-	}

 	ns = filp->f_dentry->d_sb->s_fs_info;
 	iter.task = NULL;
@ -3006,8 +2845,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	}
 	filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
 out:
-	put_task_struct(reaper);
-out_no_task:
 	return 0;
 }

--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@ -350,14 +350,14 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
 * Return an inode number between PROC_DYNAMIC_FIRST and
 * 0xffffffff, or zero on failure.
 */
-static unsigned int get_inode_number(void)
+int proc_alloc_inum(unsigned int *inum)
 {
 	unsigned int i;
 	int error;

 retry:
-	if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0)
-		return 0;
+	if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL))
+		return -ENOMEM;

 	spin_lock(&proc_inum_lock);
 	error = ida_get_new(&proc_inum_ida, &i);
@ -365,18 +365,19 @@ retry:
 	if (error == -EAGAIN)
 		goto retry;
 	else if (error)
-		return 0;
+		return error;

 	if (i > UINT_MAX - PROC_DYNAMIC_FIRST) {
 		spin_lock(&proc_inum_lock);
 		ida_remove(&proc_inum_ida, i);
 		spin_unlock(&proc_inum_lock);
+		return -ENOSPC;
+	}
+	*inum = PROC_DYNAMIC_FIRST + i;
 	return 0;
 }
-	return PROC_DYNAMIC_FIRST + i;
-}

-static void release_inode_number(unsigned int inum)
+void proc_free_inum(unsigned int inum)
 {
 	spin_lock(&proc_inum_lock);
 	ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
@ -554,13 +555,12 @@ static const struct inode_operations proc_dir_inode_operations = {

 static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
 {
-	unsigned int i;
 	struct proc_dir_entry *tmp;
+	int ret;
 	
-	i = get_inode_number();
-	if (i == 0)
-		return -EAGAIN;
-	dp->low_ino = i;
+	ret = proc_alloc_inum(&dp->low_ino);
+	if (ret)
+		return ret;

 	if (S_ISDIR(dp->mode)) {
 		if (dp->proc_iops == NULL) {
@ -764,7 +764,7 @@ EXPORT_SYMBOL(proc_create_data);

 static void free_proc_entry(struct proc_dir_entry *de)
 {
-	release_inode_number(de->low_ino);
+	proc_free_inum(de->low_ino);

 	if (S_ISLNK(de->mode))
 		kfree(de->data);
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@ -31,6 +31,7 @@ static void proc_evict_inode(struct inode *inode)
 	struct proc_dir_entry *de;
 	struct ctl_table_header *head;
 	const struct proc_ns_operations *ns_ops;
+	void *ns;

 	truncate_inode_pages(&inode->i_data, 0);
 	clear_inode(inode);
@ -49,8 +50,9 @@ static void proc_evict_inode(struct inode *inode)
 	}
 	/* Release any associated namespace */
 	ns_ops = PROC_I(inode)->ns_ops;
-	if (ns_ops && ns_ops->put)
-		ns_ops->put(PROC_I(inode)->ns);
+	ns = PROC_I(inode)->ns;
+	if (ns_ops && ns)
+		ns_ops->put(ns);
 }

 static struct kmem_cache * proc_inode_cachep;
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@ -15,6 +15,7 @@ struct  ctl_table_header;
 struct  mempolicy;

 extern struct proc_dir_entry proc_root;
+extern void proc_self_init(void);
 #ifdef CONFIG_PROC_SYSCTL
 extern int proc_sys_init(void);
 extern void sysctl_head_put(struct ctl_table_header *head);
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@ -11,6 +11,7 @@
 #include <net/net_namespace.h>
 #include <linux/ipc_namespace.h>
 #include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
 #include "internal.h"


@ -24,12 +25,168 @@ static const struct proc_ns_operations *ns_entries[] = {
 #ifdef CONFIG_IPC_NS
 	&ipcns_operations,
 #endif
+#ifdef CONFIG_PID_NS
+	&pidns_operations,
+#endif
+#ifdef CONFIG_USER_NS
+	&userns_operations,
+#endif
+	&mntns_operations,
 };

 static const struct file_operations ns_file_operations = {
 	.llseek		= no_llseek,
 };

+static const struct inode_operations ns_inode_operations = {
+	.setattr	= proc_setattr,
+};
+
+static int ns_delete_dentry(const struct dentry *dentry)
+{
+	/* Don't cache namespace inodes when not in use */
+	return 1;
+}
+
+static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	struct inode *inode = dentry->d_inode;
+	const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
+
+	return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
+		ns_ops->name, inode->i_ino);
+}
+
+const struct dentry_operations ns_dentry_operations =
+{
+	.d_delete	= ns_delete_dentry,
+	.d_dname	= ns_dname,
+};
+
+static struct dentry *proc_ns_get_dentry(struct super_block *sb,
+	struct task_struct *task, const struct proc_ns_operations *ns_ops)
+{
+	struct dentry *dentry, *result;
+	struct inode *inode;
+	struct proc_inode *ei;
+	struct qstr qname = { .name = "", };
+	void *ns;
+
+	ns = ns_ops->get(task);
+	if (!ns)
+		return ERR_PTR(-ENOENT);
+
+	dentry = d_alloc_pseudo(sb, &qname);
+	if (!dentry) {
+		ns_ops->put(ns);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	inode = iget_locked(sb, ns_ops->inum(ns));
+	if (!inode) {
+		dput(dentry);
+		ns_ops->put(ns);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	ei = PROC_I(inode);
+	if (inode->i_state & I_NEW) {
+		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+		inode->i_op = &ns_inode_operations;
+		inode->i_mode = S_IFREG | S_IRUGO;
+		inode->i_fop = &ns_file_operations;
+		ei->ns_ops = ns_ops;
+		ei->ns = ns;
+		unlock_new_inode(inode);
+	} else {
+		ns_ops->put(ns);
+	}
+
+	d_set_d_op(dentry, &ns_dentry_operations);
+	result = d_instantiate_unique(dentry, inode);
+	if (result) {
+		dput(dentry);
+		dentry = result;
+	}
+
+	return dentry;
+}
+
+static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct proc_inode *ei = PROC_I(inode);
+	struct task_struct *task;
+	struct dentry *ns_dentry;
+	void *error = ERR_PTR(-EACCES);
+
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_put_task;
+
+	ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops);
+	if (IS_ERR(ns_dentry)) {
+		error = ERR_CAST(ns_dentry);
+		goto out_put_task;
+	}
+
+	dput(nd->path.dentry);
+	nd->path.dentry = ns_dentry;
+	error = NULL;
+
+out_put_task:
+	put_task_struct(task);
+out:
+	return error;
+}
+
+static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
+{
+	struct inode *inode = dentry->d_inode;
+	struct proc_inode *ei = PROC_I(inode);
+	const struct proc_ns_operations *ns_ops = ei->ns_ops;
+	struct task_struct *task;
+	void *ns;
+	char name[50];
+	int len = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_put_task;
+
+	len = -ENOENT;
+	ns = ns_ops->get(task);
+	if (!ns)
+		goto out_put_task;
+
+	snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
+	len = strlen(name);
+
+	if (len > buflen)
+		len = buflen;
+	if (copy_to_user(buffer, name, len))
+		len = -EFAULT;
+
+	ns_ops->put(ns);
+out_put_task:
+	put_task_struct(task);
+out:
+	return len;
+}
+
+static const struct inode_operations proc_ns_link_inode_operations = {
+	.readlink	= proc_ns_readlink,
+	.follow_link	= proc_ns_follow_link,
+	.setattr	= proc_setattr,
+};
+
 static struct dentry *proc_ns_instantiate(struct inode *dir,
 	struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
@ -37,21 +194,15 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
 	struct inode *inode;
 	struct proc_inode *ei;
 	struct dentry *error = ERR_PTR(-ENOENT);
-	void *ns;

 	inode = proc_pid_make_inode(dir->i_sb, task);
 	if (!inode)
 		goto out;

-	ns = ns_ops->get(task);
-	if (!ns)
-		goto out_iput;
-
 	ei = PROC_I(inode);
-	inode->i_mode = S_IFREG|S_IRUSR;
-	inode->i_fop  = &ns_file_operations;
+	inode->i_mode = S_IFLNK|S_IRWXUGO;
+	inode->i_op = &proc_ns_link_inode_operations;
 	ei->ns_ops = ns_ops;
-	ei->ns	      = ns;

 	d_set_d_op(dentry, &pid_dentry_operations);
 	d_add(dentry, inode);
@ -60,9 +211,6 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
 		error = NULL;
 out:
 	return error;
-out_iput:
-	iput(inode);
-	goto out;
 }

 static int proc_ns_fill_cache(struct file *filp, void *dirent,
@ -89,10 +237,6 @@ static int proc_ns_dir_readdir(struct file *filp, void *dirent,
 	if (!task)
 		goto out_no_task;

-	ret = -EPERM;
-	if (!ptrace_may_access(task, PTRACE_MODE_READ))
-		goto out;
-
 	ret = 0;
 	i = filp->f_pos;
 	switch (i) {
@ -152,10 +296,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
 	if (!task)
 		goto out_no_task;

-	error = ERR_PTR(-EPERM);
-	if (!ptrace_may_access(task, PTRACE_MODE_READ))
-		goto out;
-
 	last = &ns_entries[ARRAY_SIZE(ns_entries)];
 	for (entry = ns_entries; entry < last; entry++) {
 		if (strlen((*entry)->name) != len)
@ -163,7 +303,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
 		if (!memcmp(dentry->d_name.name, (*entry)->name, len))
 			break;
 	}
-	error = ERR_PTR(-ENOENT);
 	if (entry == last)
 		goto out;

@ -198,3 +337,7 @@ out_invalid:
 	return ERR_PTR(-EINVAL);
 }

+bool proc_ns_inode(struct inode *inode)
+{
+	return inode->i_fop == &ns_file_operations;
+}
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@ -100,14 +100,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 	int err;
 	struct super_block *sb;
 	struct pid_namespace *ns;
-	struct proc_inode *ei;
 	char *options;

 	if (flags & MS_KERNMOUNT) {
 		ns = (struct pid_namespace *)data;
 		options = NULL;
 	} else {
-		ns = current->nsproxy->pid_ns;
+		ns = task_active_pid_ns(current);
 		options = data;
 	}

@ -130,13 +129,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 		sb->s_flags |= MS_ACTIVE;
 	}

-	ei = PROC_I(sb->s_root->d_inode);
-	if (!ei->pid) {
-		rcu_read_lock();
-		ei->pid = get_pid(find_pid_ns(1, ns));
-		rcu_read_unlock();
-	}
-
 	return dget(sb->s_root);
 }

@ -153,6 +145,7 @@ static struct file_system_type proc_fs_type = {
 	.name		= "proc",
 	.mount		= proc_mount,
 	.kill_sb	= proc_kill_sb,
+	.fs_flags	= FS_USERNS_MOUNT,
 };

 void __init proc_root_init(void)
@ -163,12 +156,8 @@ void __init proc_root_init(void)
 	err = register_filesystem(&proc_fs_type);
 	if (err)
 		return;
-	err = pid_ns_prepare_proc(&init_pid_ns);
-	if (err) {
-		unregister_filesystem(&proc_fs_type);
-		return;
-	}

+	proc_self_init();
 	proc_symlink("mounts", NULL, "self/mounts");

 	proc_net_init();
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@ -0,0 +1,59 @@
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+
+/*
+ * /proc/self: reads as current's tgid in the superblock's pid ns, or -ENOENT.
+ */
+static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
+			      int buflen)
+{
+	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+	pid_t tgid = task_tgid_nr_ns(current, ns);
+	char tmp[PROC_NUMBUF];
+	if (!tgid)
+		return -ENOENT;
+	sprintf(tmp, "%d", tgid);
+	return vfs_readlink(dentry,buffer,buflen,tmp);
+}
+
+static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+	pid_t tgid = task_tgid_nr_ns(current, ns);
+	char *name = ERR_PTR(-ENOENT);
+	if (tgid) {
+		/* 11 chars max for a signed int in decimal, plus the NUL terminator */
+		name = kmalloc(12, GFP_KERNEL);
+		if (!name)
+			name = ERR_PTR(-ENOMEM);
+		else
+			sprintf(name, "%d", tgid);
+	}
+	nd_set_link(nd, name);
+	return NULL;
+}
+
+static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
+				void *cookie)
+{
+	char *s = nd_get_link(nd);
+	if (!IS_ERR(s))
+		kfree(s);
+}
+
+static const struct inode_operations proc_self_inode_operations = {
+	.readlink	= proc_self_readlink,
+	.follow_link	= proc_self_follow_link,
+	.put_link	= proc_self_put_link,
+};
+
+void __init proc_self_init(void)
+{
+	struct proc_dir_entry *self;
+
+	/* Create /proc/self; skip the iops hookup if proc_create() failed. */
+	self = proc_create("self", S_IFLNK | S_IRWXUGO, NULL, NULL);
+	if (self)
+		self->proc_iops = &proc_self_inode_operations;
+}
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@ -149,6 +149,7 @@ static struct file_system_type sysfs_fs_type = {
 	.name		= "sysfs",
 	.mount		= sysfs_mount,
 	.kill_sb	= sysfs_kill_sb,
+	.fs_flags	= FS_USERNS_MOUNT,
 };

 int __init sysfs_init(void)
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@ -344,10 +344,8 @@ static inline void put_cred(const struct cred *_cred)
 extern struct user_namespace init_user_ns;
 #ifdef CONFIG_USER_NS
 #define current_user_ns()	(current_cred_xxx(user_ns))
-#define task_user_ns(task)	(task_cred_xxx((task), user_ns))
 #else
 #define current_user_ns()	(&init_user_ns)
-#define task_user_ns(task)	(&init_user_ns)
 #endif


--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@ -1810,6 +1810,8 @@ struct file_system_type {
 #define FS_REQUIRES_DEV		1 
 #define FS_BINARY_MOUNTDATA	2
 #define FS_HAS_SUBTYPE		4
+#define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
+#define FS_USERNS_DEV_MOUNT	16 /* A userns mount does not imply MNT_NODEV */
 #define FS_REVAL_DOT		16384	/* Check the paths ".", ".." for staleness */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	struct dentry *(*mount) (struct file_system_type *, int,
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@ -67,6 +67,8 @@ struct ipc_namespace {

 	/* user_ns which owns the ipc ns */
 	struct user_namespace *user_ns;
+
+	unsigned int	proc_inum;
 };

 extern struct ipc_namespace init_ipc_ns;
@ -133,7 +135,8 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; }

 #if defined(CONFIG_IPC_NS)
 extern struct ipc_namespace *copy_ipcs(unsigned long flags,
-				       struct task_struct *tsk);
+	struct user_namespace *user_ns, struct ipc_namespace *ns);
+
 static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
 {
 	if (ns)
@ -144,12 +147,12 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
 extern void put_ipc_ns(struct ipc_namespace *ns);
 #else
 static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
-					      struct task_struct *tsk)
+	struct user_namespace *user_ns, struct ipc_namespace *ns)
 {
 	if (flags & CLONE_NEWIPC)
 		return ERR_PTR(-EINVAL);

-	return tsk->nsproxy->ipc_ns;
+	return ns;
 }

 static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@ -4,9 +4,10 @@

 struct mnt_namespace;
 struct fs_struct;
+struct user_namespace;

 extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
-		struct fs_struct *);
+		struct user_namespace *, struct fs_struct *);
 extern void put_mnt_ns(struct mnt_namespace *ns);

 extern const struct file_operations proc_mounts_operations;
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@ -67,7 +67,7 @@ void exit_task_namespaces(struct task_struct *tsk);
 void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
 void free_nsproxy(struct nsproxy *ns);
 int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
-	struct fs_struct *);
+	struct cred *, struct fs_struct *);
 int __init nsproxy_cache_init(void);

 static inline void put_nsproxy(struct nsproxy *ns)
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@ -21,6 +21,7 @@ struct pid_namespace {
 	struct kref kref;
 	struct pidmap pidmap[PIDMAP_ENTRIES];
 	int last_pid;
+	int nr_hashed;
 	struct task_struct *child_reaper;
 	struct kmem_cache *pid_cachep;
 	unsigned int level;
@ -31,9 +32,12 @@ struct pid_namespace {
 #ifdef CONFIG_BSD_PROCESS_ACCT
 	struct bsd_acct_struct *bacct;
 #endif
+	struct user_namespace *user_ns;
+	struct work_struct proc_work;
 	kgid_t pid_gid;
 	int hide_pid;
 	int reboot;	/* group exit code if this pidns was rebooted */
+	unsigned int proc_inum;
 };

 extern struct pid_namespace init_pid_ns;
@ -46,7 +50,8 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
 	return ns;
 }

-extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *ns);
+extern struct pid_namespace *copy_pid_ns(unsigned long flags,
+	struct user_namespace *user_ns, struct pid_namespace *ns);
 extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
 extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd);
 extern void put_pid_ns(struct pid_namespace *ns);
@ -59,8 +64,8 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
 	return ns;
 }

-static inline struct pid_namespace *
-copy_pid_ns(unsigned long flags, struct pid_namespace *ns)
+static inline struct pid_namespace *copy_pid_ns(unsigned long flags,
+	struct user_namespace *user_ns, struct pid_namespace *ns)
 {
 	if (flags & CLONE_NEWPID)
 		ns = ERR_PTR(-EINVAL);
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@ -29,6 +29,10 @@ struct mm_struct;

 enum {
 	PROC_ROOT_INO		= 1,
+	PROC_IPC_INIT_INO	= 0xEFFFFFFFU,
+	PROC_UTS_INIT_INO	= 0xEFFFFFFEU,
+	PROC_USER_INIT_INO	= 0xEFFFFFFDU,
+	PROC_PID_INIT_INO	= 0xEFFFFFFCU,
 };

 /*
@ -174,7 +178,10 @@ extern struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
 	struct proc_dir_entry *parent);

 extern struct file *proc_ns_fget(int fd);
+extern bool proc_ns_inode(struct inode *inode);

+extern int proc_alloc_inum(unsigned int *pino);
+extern void proc_free_inum(unsigned int inum);
 #else

 #define proc_net_fops_create(net, name, mode, fops)  ({ (void)(mode), NULL; })
@ -229,6 +236,19 @@ static inline struct file *proc_ns_fget(int fd)
 	return ERR_PTR(-EINVAL);
 }

+static inline bool proc_ns_inode(struct inode *inode)
+{
+	return false;
+}
+
+static inline int proc_alloc_inum(unsigned int *inum)
+{
+	*inum = 1;
+	return 0;
+}
+static inline void proc_free_inum(unsigned int inum)
+{
+}
 #endif /* CONFIG_PROC_FS */

 #if !defined(CONFIG_PROC_KCORE)
@ -247,10 +267,14 @@ struct proc_ns_operations {
 	void *(*get)(struct task_struct *task);
 	void (*put)(void *ns);
 	int (*install)(struct nsproxy *nsproxy, void *ns);
+	unsigned int (*inum)(void *ns);
 };
 extern const struct proc_ns_operations netns_operations;
 extern const struct proc_ns_operations utsns_operations;
 extern const struct proc_ns_operations ipcns_operations;
+extern const struct proc_ns_operations pidns_operations;
+extern const struct proc_ns_operations userns_operations;
+extern const struct proc_ns_operations mntns_operations;

 union proc_op {
 	int (*proc_get_link)(struct dentry *, struct path *);
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@ -25,6 +25,7 @@ struct user_namespace {
 	struct user_namespace	*parent;
 	kuid_t			owner;
 	kgid_t			group;
+	unsigned int		proc_inum;
 };

 extern struct user_namespace init_user_ns;
@ -39,6 +40,7 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 }

 extern int create_user_ns(struct cred *new);
+extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred);
 extern void free_user_ns(struct kref *kref);

 static inline void put_user_ns(struct user_namespace *ns)
@ -66,6 +68,14 @@ static inline int create_user_ns(struct cred *new)
 	return -EINVAL;
 }

+static inline int unshare_userns(unsigned long unshare_flags,
+				 struct cred **new_cred)
+{
+	if (unshare_flags & CLONE_NEWUSER)
+		return -EINVAL;
+	return 0;
+}
+
 static inline void put_user_ns(struct user_namespace *ns)
 {
 }
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@ -23,6 +23,7 @@ struct uts_namespace {
 	struct kref kref;
 	struct new_utsname name;
 	struct user_namespace *user_ns;
+	unsigned int proc_inum;
 };
 extern struct uts_namespace init_uts_ns;

@ -33,7 +34,7 @@ static inline void get_uts_ns(struct uts_namespace *ns)
 }

 extern struct uts_namespace *copy_utsname(unsigned long flags,
-					  struct task_struct *tsk);
+	struct user_namespace *user_ns, struct uts_namespace *old_ns);
 extern void free_uts_ns(struct kref *kref);

 static inline void put_uts_ns(struct uts_namespace *ns)
@ -50,12 +51,12 @@ static inline void put_uts_ns(struct uts_namespace *ns)
 }

 static inline struct uts_namespace *copy_utsname(unsigned long flags,
-						 struct task_struct *tsk)
+	struct user_namespace *user_ns, struct uts_namespace *old_ns)
 {
 	if (flags & CLONE_NEWUTS)
 		return ERR_PTR(-EINVAL);

-	return tsk->nsproxy->uts_ns;
+	return old_ns;
 }
 #endif

--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@ -56,6 +56,8 @@ struct net {

 	struct user_namespace   *user_ns;	/* Owning user namespace */

+	unsigned int		proc_inum;
+
 	struct proc_dir_entry 	*proc_net;
 	struct proc_dir_entry 	*proc_net_stat;

--- a/init/Kconfig
+++ b/init/Kconfig
@ -1069,11 +1069,9 @@ config UIDGID_CONVERTED
 	# Filesystems
 	depends on 9P_FS = n
 	depends on AFS_FS = n
-	depends on AUTOFS4_FS = n
 	depends on CEPH_FS = n
 	depends on CIFS = n
 	depends on CODA_FS = n
-	depends on FUSE_FS = n
 	depends on GFS2_FS = n
 	depends on NCP_FS = n
 	depends on NFSD = n
--- a/init/main.c
+++ b/init/main.c
@ -812,7 +812,6 @@ static int __ref kernel_init(void *unused)
 	system_state = SYSTEM_RUNNING;
 	numa_default_policy();

-	current->signal->flags |= SIGNAL_UNKILLABLE;
 	flush_delayed_fput();

 	if (ramdisk_execute_command) {
--- a/init/version.c
+++ b/init/version.c
@ -12,6 +12,7 @@
 #include <linux/utsname.h>
 #include <generated/utsrelease.h>
 #include <linux/version.h>
+#include <linux/proc_fs.h>

 #ifndef CONFIG_KALLSYMS
 #define version(a) Version_ ## a
@ -34,6 +35,7 @@ struct uts_namespace init_uts_ns = {
 		.domainname	= UTS_DOMAINNAME,
 	},
 	.user_ns = &init_user_ns,
+	.proc_inum = PROC_UTS_INIT_INO,
 };
 EXPORT_SYMBOL_GPL(init_uts_ns);

--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@ -16,6 +16,7 @@
 #include <linux/msg.h>
 #include <linux/ipc_namespace.h>
 #include <linux/utsname.h>
+#include <linux/proc_fs.h>
 #include <asm/uaccess.h>

 #include "util.h"
@ -30,6 +31,7 @@ DEFINE_SPINLOCK(mq_lock);
 struct ipc_namespace init_ipc_ns = {
 	.count		= ATOMIC_INIT(1),
 	.user_ns = &init_user_ns,
+	.proc_inum = PROC_IPC_INIT_INO,
 };

 atomic_t nr_ipc_ns = ATOMIC_INIT(1);
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@ -16,7 +16,7 @@

 #include "util.h"

-static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk,
+static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 					   struct ipc_namespace *old_ns)
 {
 	struct ipc_namespace *ns;
@ -26,9 +26,16 @@ static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk,
 	if (ns == NULL)
 		return ERR_PTR(-ENOMEM);

+	err = proc_alloc_inum(&ns->proc_inum);
+	if (err) {
+		kfree(ns);
+		return ERR_PTR(err);
+	}
+
 	atomic_set(&ns->count, 1);
 	err = mq_init_ns(ns);
 	if (err) {
+		proc_free_inum(ns->proc_inum);
 		kfree(ns);
 		return ERR_PTR(err);
 	}
@ -46,19 +53,17 @@ static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk,
 	ipcns_notify(IPCNS_CREATED);
 	register_ipcns_notifier(ns);

-	ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns));
+	ns->user_ns = get_user_ns(user_ns);

 	return ns;
 }

 struct ipc_namespace *copy_ipcs(unsigned long flags,
-				struct task_struct *tsk)
+	struct user_namespace *user_ns, struct ipc_namespace *ns)
 {
-	struct ipc_namespace *ns = tsk->nsproxy->ipc_ns;
-
 	if (!(flags & CLONE_NEWIPC))
 		return get_ipc_ns(ns);
-	return create_ipc_ns(tsk, ns);
+	return create_ipc_ns(user_ns, ns);
 }

 /*
@ -113,6 +118,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
 	 */
 	ipcns_notify(IPCNS_REMOVED);
 	put_user_ns(ns->user_ns);
+	proc_free_inum(ns->proc_inum);
 	kfree(ns);
 }

@ -161,8 +167,12 @@ static void ipcns_put(void *ns)
 	return put_ipc_ns(ns);
 }

-static int ipcns_install(struct nsproxy *nsproxy, void *ns)
+static int ipcns_install(struct nsproxy *nsproxy, void *new)
 {
+	struct ipc_namespace *ns = new;
+	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
 	/* Ditch state from the old ipc namespace */
 	exit_sem(current);
 	put_ipc_ns(nsproxy->ipc_ns);
@ -170,10 +180,18 @@ static int ipcns_install(struct nsproxy *nsproxy, void *ns)
 	return 0;
 }

+static unsigned int ipcns_inum(void *vp)
+{
+	struct ipc_namespace *ns = vp;
+
+	return ns->proc_inum;
+}
+
 const struct proc_ns_operations ipcns_operations = {
 	.name		= "ipc",
 	.type		= CLONE_NEWIPC,
 	.get		= ipcns_get,
 	.put		= ipcns_put,
 	.install	= ipcns_install,
+	.inum		= ipcns_inum,
 };
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@ -3409,7 +3409,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
 {
 	struct cgroup_pidlist *l;
 	/* don't need task_nsproxy() if we're looking at ourself */
-	struct pid_namespace *ns = current->nsproxy->pid_ns;
+	struct pid_namespace *ns = task_active_pid_ns(current);

 	/*
 	 * We can't drop the pidlist_mutex before taking the l->mutex in case
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@ -6155,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,

 	event->parent		= parent_event;

-	event->ns		= get_pid_ns(current->nsproxy->pid_ns);
+	event->ns		= get_pid_ns(task_active_pid_ns(current));
 	event->id		= atomic64_inc_return(&perf_event_id);

 	event->state		= PERF_EVENT_STATE_INACTIVE;
--- a/kernel/exit.c
+++ b/kernel/exit.c
@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
 		__this_cpu_dec(process_counts);
-		/*
-		 * If we are the last child process in a pid namespace to be
-		 * reaped, notify the reaper sleeping zap_pid_ns_processes().
-		 */
-		if (IS_ENABLED(CONFIG_PID_NS)) {
-			struct task_struct *parent = p->real_parent;
-
-			if ((task_active_pid_ns(parent)->child_reaper == parent) &&
-			    list_empty(&parent->children) &&
-			    (parent->flags & PF_EXITING))
-				wake_up_process(parent);
-		}
 	}
 	list_del_rcu(&p->thread_group);
 }
--- a/kernel/fork.c
+++ b/kernel/fork.c
@ -1044,8 +1044,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	atomic_set(&sig->live, 1);
 	atomic_set(&sig->sigcnt, 1);
 	init_waitqueue_head(&sig->wait_chldexit);
-	if (clone_flags & CLONE_NEWPID)
-		sig->flags |= SIGNAL_UNKILLABLE;
 	sig->curr_target = tsk;
 	init_sigpending(&sig->shared_pending);
 	INIT_LIST_HEAD(&sig->posix_timers);
@ -1438,8 +1436,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

 		if (thread_group_leader(p)) {
-			if (is_child_reaper(pid))
-				p->nsproxy->pid_ns->child_reaper = p;
+			if (is_child_reaper(pid)) {
+				ns_of_pid(pid)->child_reaper = p;
+				p->signal->flags |= SIGNAL_UNKILLABLE;
+			}

 			p->signal->leader_pid = pid;
 			p->signal->tty = tty_kref_get(current->signal->tty);
@ -1473,8 +1473,6 @@ bad_fork_cleanup_io:
 	if (p->io_context)
 		exit_io_context(p);
 bad_fork_cleanup_namespaces:
-	if (unlikely(clone_flags & CLONE_NEWPID))
-		pid_ns_release_proc(p->nsproxy->pid_ns);
 	exit_task_namespaces(p);
 bad_fork_cleanup_mm:
 	if (p->mm)
@ -1554,15 +1552,9 @@ long do_fork(unsigned long clone_flags,
 	 * Do some preliminary argument and permissions checking before we
 	 * actually start allocating stuff
 	 */
-	if (clone_flags & CLONE_NEWUSER) {
-		if (clone_flags & CLONE_THREAD)
+	if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
+		if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
 			return -EINVAL;
-		/* hopefully this check will go away when userns support is
-		 * complete
-		 */
-		if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
-				!capable(CAP_SETGID))
-			return -EPERM;
 	}

 	/*
@ -1724,7 +1716,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
 {
 	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
 				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
-				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
+				CLONE_NEWUSER|CLONE_NEWPID))
 		return -EINVAL;
 	/*
 	 * Not implemented, but pretend it works if there is nothing to
@ -1791,19 +1784,40 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 {
 	struct fs_struct *fs, *new_fs = NULL;
 	struct files_struct *fd, *new_fd = NULL;
+	struct cred *new_cred = NULL;
 	struct nsproxy *new_nsproxy = NULL;
 	int do_sysvsem = 0;
 	int err;

-	err = check_unshare_flags(unshare_flags);
-	if (err)
-		goto bad_unshare_out;
-
+	/*
+	 * If unsharing a user namespace must also unshare the thread.
+	 */
+	if (unshare_flags & CLONE_NEWUSER)
+		unshare_flags |= CLONE_THREAD;
+	/*
+	 * If unsharing a pid namespace must also unshare the thread.
+	 */
+	if (unshare_flags & CLONE_NEWPID)
+		unshare_flags |= CLONE_THREAD;
+	/*
+	 * If unsharing a thread from a thread group, must also unshare vm.
+	 */
+	if (unshare_flags & CLONE_THREAD)
+		unshare_flags |= CLONE_VM;
+	/*
+	 * If unsharing vm, must also unshare signal handlers.
+	 */
+	if (unshare_flags & CLONE_VM)
+		unshare_flags |= CLONE_SIGHAND;
 	/*
 	 * If unsharing namespace, must also unshare filesystem information.
 	 */
 	if (unshare_flags & CLONE_NEWNS)
 		unshare_flags |= CLONE_FS;
+
+	err = check_unshare_flags(unshare_flags);
+	if (err)
+		goto bad_unshare_out;
 	/*
 	 * CLONE_NEWIPC must also detach from the undolist: after switching
 	 * to a new ipc namespace, the semaphore arrays from the old
@ -1817,11 +1831,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 	err = unshare_fd(unshare_flags, &new_fd);
 	if (err)
 		goto bad_unshare_cleanup_fs;
-	err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
+	err = unshare_userns(unshare_flags, &new_cred);
 	if (err)
 		goto bad_unshare_cleanup_fd;
+	err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
+					 new_cred, new_fs);
+	if (err)
+		goto bad_unshare_cleanup_cred;

-	if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
+	if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
 		if (do_sysvsem) {
 			/*
 			 * CLONE_SYSVSEM is equivalent to sys_exit().
@ -1854,11 +1872,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 		}

 		task_unlock(current);
+
+		if (new_cred) {
+			/* Install the new user namespace */
+			commit_creds(new_cred);
+			new_cred = NULL;
+		}
 	}

 	if (new_nsproxy)
 		put_nsproxy(new_nsproxy);

+bad_unshare_cleanup_cred:
+	if (new_cred)
+		put_cred(new_cred);
 bad_unshare_cleanup_fd:
 	if (new_fd)
 		put_files_struct(new_fd);
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void)
 * leave it to the caller to do proper locking and attach it to task.
 */
 static struct nsproxy *create_new_namespaces(unsigned long flags,
-			struct task_struct *tsk, struct fs_struct *new_fs)
+	struct task_struct *tsk, struct user_namespace *user_ns,
+	struct fs_struct *new_fs)
 {
 	struct nsproxy *new_nsp;
 	int err;
@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 	if (!new_nsp)
 		return ERR_PTR(-ENOMEM);

-	new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
+	new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
 	if (IS_ERR(new_nsp->mnt_ns)) {
 		err = PTR_ERR(new_nsp->mnt_ns);
 		goto out_ns;
 	}

-	new_nsp->uts_ns = copy_utsname(flags, tsk);
+	new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
 	if (IS_ERR(new_nsp->uts_ns)) {
 		err = PTR_ERR(new_nsp->uts_ns);
 		goto out_uts;
 	}

-	new_nsp->ipc_ns = copy_ipcs(flags, tsk);
+	new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
 	if (IS_ERR(new_nsp->ipc_ns)) {
 		err = PTR_ERR(new_nsp->ipc_ns);
 		goto out_ipc;
 	}

-	new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk));
+	new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns);
 	if (IS_ERR(new_nsp->pid_ns)) {
 		err = PTR_ERR(new_nsp->pid_ns);
 		goto out_pid;
 	}

-	new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns);
+	new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
 	if (IS_ERR(new_nsp->net_ns)) {
 		err = PTR_ERR(new_nsp->net_ns);
 		goto out_net;
@ -122,6 +123,7 @@ out_ns:
 int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 {
 	struct nsproxy *old_ns = tsk->nsproxy;
+	struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
 	struct nsproxy *new_ns;
 	int err = 0;

@ -134,7 +136,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 				CLONE_NEWPID | CLONE_NEWNET)))
 		return 0;

-	if (!capable(CAP_SYS_ADMIN)) {
+	if (!ns_capable(user_ns, CAP_SYS_ADMIN)) {
 		err = -EPERM;
 		goto out;
 	}
@ -151,7 +153,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 		goto out;
 	}

-	new_ns = create_new_namespaces(flags, tsk, tsk->fs);
+	new_ns = create_new_namespaces(flags, tsk,
+				       task_cred_xxx(tsk, user_ns), tsk->fs);
 	if (IS_ERR(new_ns)) {
 		err = PTR_ERR(new_ns);
 		goto out;
@ -183,18 +186,20 @@ void free_nsproxy(struct nsproxy *ns)
 * On success, returns the new nsproxy.
 */
 int unshare_nsproxy_namespaces(unsigned long unshare_flags,
-		struct nsproxy **new_nsp, struct fs_struct *new_fs)
+	struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
 {
+	struct user_namespace *user_ns;
 	int err = 0;

 	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
-			       CLONE_NEWNET)))
+			       CLONE_NEWNET | CLONE_NEWPID)))
 		return 0;

-	if (!capable(CAP_SYS_ADMIN))
+	user_ns = new_cred ? new_cred->user_ns : current_user_ns();
+	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
 		return -EPERM;

-	*new_nsp = create_new_namespaces(unshare_flags, current,
+	*new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
 					 new_fs ? new_fs : current->fs);
 	if (IS_ERR(*new_nsp)) {
 		err = PTR_ERR(*new_nsp);
@ -241,9 +246,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
 	struct file *file;
 	int err;

-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	file = proc_ns_fget(fd);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
@ -254,7 +256,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
 	if (nstype && (ops->type != nstype))
 		goto out;

-	new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
+	new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
 	if (IS_ERR(new_nsproxy)) {
 		err = PTR_ERR(new_nsproxy);
 		goto out;
--- a/kernel/pid.c
+++ b/kernel/pid.c
@ -36,6 +36,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/init_task.h>
 #include <linux/syscalls.h>
+#include <linux/proc_fs.h>

 #define pid_hashfn(nr, ns)	\
 	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@ -78,6 +79,8 @@ struct pid_namespace init_pid_ns = {
 	.last_pid = 0,
 	.level = 0,
 	.child_reaper = &init_task,
+	.user_ns = &init_user_ns,
+	.proc_inum = PROC_PID_INIT_INO,
 };
 EXPORT_SYMBOL_GPL(init_pid_ns);

@ -269,8 +272,24 @@ void free_pid(struct pid *pid)
 	unsigned long flags;

 	spin_lock_irqsave(&pidmap_lock, flags);
-	for (i = 0; i <= pid->level; i++)
-		hlist_del_rcu(&pid->numbers[i].pid_chain);
+	for (i = 0; i <= pid->level; i++) {
+		struct upid *upid = pid->numbers + i;
+		struct pid_namespace *ns = upid->ns;
+		hlist_del_rcu(&upid->pid_chain);
+		switch(--ns->nr_hashed) {
+		case 1:
+			/* When all that is left in the pid namespace
+			 * is the reaper wake up the reaper.  The reaper
+			 * may be sleeping in zap_pid_ns_processes().
+			 */
+			wake_up_process(ns->child_reaper);
+			break;
+		case 0:
+			ns->nr_hashed = -1;
+			schedule_work(&ns->proc_work);
+			break;
+		}
+	}
 	spin_unlock_irqrestore(&pidmap_lock, flags);

 	for (i = 0; i <= pid->level; i++)
@ -292,6 +311,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 		goto out;

 	tmp = ns;
+	pid->level = ns->level;
 	for (i = ns->level; i >= 0; i--) {
 		nr = alloc_pidmap(tmp);
 		if (nr < 0)
@ -302,22 +322,32 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 		tmp = tmp->parent;
 	}

+	if (unlikely(is_child_reaper(pid))) {
+		if (pid_ns_prepare_proc(ns))
+			goto out_free;
+	}
+
 	get_pid_ns(ns);
-	pid->level = ns->level;
 	atomic_set(&pid->count, 1);
 	for (type = 0; type < PIDTYPE_MAX; ++type)
 		INIT_HLIST_HEAD(&pid->tasks[type]);

 	upid = pid->numbers + ns->level;
 	spin_lock_irq(&pidmap_lock);
-	for ( ; upid >= pid->numbers; --upid)
+	if (ns->nr_hashed < 0)
+		goto out_unlock;
+	for ( ; upid >= pid->numbers; --upid) {
 		hlist_add_head_rcu(&upid->pid_chain,
 				&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+		upid->ns->nr_hashed++;
+	}
 	spin_unlock_irq(&pidmap_lock);

 out:
 	return pid;

+out_unlock:
+	spin_unlock(&pidmap_lock);
 out_free:
 	while (++i <= ns->level)
 		free_pidmap(pid->numbers + i);
@ -344,7 +374,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns);

 struct pid *find_vpid(int nr)
 {
-	return find_pid_ns(nr, current->nsproxy->pid_ns);
+	return find_pid_ns(nr, task_active_pid_ns(current));
 }
 EXPORT_SYMBOL_GPL(find_vpid);

@ -428,7 +458,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)

 struct task_struct *find_task_by_vpid(pid_t vnr)
 {
-	return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
+	return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
 }

 struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
@ -483,7 +513,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns);

 pid_t pid_vnr(struct pid *pid)
 {
-	return pid_nr_ns(pid, current->nsproxy->pid_ns);
+	return pid_nr_ns(pid, task_active_pid_ns(current));
 }
 EXPORT_SYMBOL_GPL(pid_vnr);

@ -494,7 +524,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,

 	rcu_read_lock();
 	if (!ns)
-		ns = current->nsproxy->pid_ns;
+		ns = task_active_pid_ns(current);
 	if (likely(pid_alive(task))) {
 		if (type != PIDTYPE_PID)
 			task = task->group_leader;
@ -569,6 +599,7 @@ void __init pidmap_init(void)
 	/* Reserve PID 0. We never call free_pidmap(0) */
 	set_bit(0, init_pid_ns.pidmap[0].page);
 	atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+	init_pid_ns.nr_hashed = 1;

 	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC);
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@ -10,6 +10,7 @@

 #include <linux/pid.h>
 #include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
 #include <linux/syscalls.h>
 #include <linux/err.h>
 #include <linux/acct.h>
@ -71,10 +72,17 @@ err_alloc:
 	return NULL;
 }

+/*
+ * Work callback installed on pid_namespace::proc_work: recover the pid
+ * namespace that embeds @work and hand it to pid_ns_release_proc().
+ */
+static void proc_cleanup_work(struct work_struct *work)
+{
+	struct pid_namespace *pid_ns;
+
+	pid_ns = container_of(work, struct pid_namespace, proc_work);
+	pid_ns_release_proc(pid_ns);
+}
+
 /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
 #define MAX_PID_NS_LEVEL 32

-static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
+static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
+	struct pid_namespace *parent_pid_ns)
 {
 	struct pid_namespace *ns;
 	unsigned int level = parent_pid_ns->level + 1;
@ -99,9 +107,15 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
 	if (ns->pid_cachep == NULL)
 		goto out_free_map;

+	err = proc_alloc_inum(&ns->proc_inum);
+	if (err)
+		goto out_free_map;
+
 	kref_init(&ns->kref);
 	ns->level = level;
 	ns->parent = get_pid_ns(parent_pid_ns);
+	ns->user_ns = get_user_ns(user_ns);
+	INIT_WORK(&ns->proc_work, proc_cleanup_work);

 	set_bit(0, ns->pidmap[0].page);
 	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@ -109,14 +123,8 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
 	for (i = 1; i < PIDMAP_ENTRIES; i++)
 		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);

-	err = pid_ns_prepare_proc(ns);
-	if (err)
-		goto out_put_parent_pid_ns;
-
 	return ns;

-out_put_parent_pid_ns:
-	put_pid_ns(parent_pid_ns);
 out_free_map:
 	kfree(ns->pidmap[0].page);
 out_free:
@ -129,18 +137,21 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
 {
 	int i;

+	proc_free_inum(ns->proc_inum);
 	for (i = 0; i < PIDMAP_ENTRIES; i++)
 		kfree(ns->pidmap[i].page);
+	put_user_ns(ns->user_ns);
 	kmem_cache_free(pid_ns_cachep, ns);
 }

-struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
+struct pid_namespace *copy_pid_ns(unsigned long flags,
+	struct user_namespace *user_ns, struct pid_namespace *old_ns)
 {
 	if (!(flags & CLONE_NEWPID))
 		return get_pid_ns(old_ns);
-	if (flags & (CLONE_THREAD|CLONE_PARENT))
+	if (task_active_pid_ns(current) != old_ns)
 		return ERR_PTR(-EINVAL);
-	return create_pid_namespace(old_ns);
+	return create_pid_namespace(user_ns, old_ns);
 }

 static void free_pid_ns(struct kref *kref)
@ -211,22 +222,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)

 	/*
 	 * sys_wait4() above can't reap the TASK_DEAD children.
-	 * Make sure they all go away, see __unhash_process().
+	 * Make sure they all go away, see free_pid().
 	 */
 	for (;;) {
-		bool need_wait = false;
-
-		read_lock(&tasklist_lock);
-		if (!list_empty(&current->children)) {
-			__set_current_state(TASK_UNINTERRUPTIBLE);
-			need_wait = true;
-		}
-		read_unlock(&tasklist_lock);
-
-		if (!need_wait)
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (pid_ns->nr_hashed == 1)
 			break;
 		schedule();
 	}
+	__set_current_state(TASK_RUNNING);

 	if (pid_ns->reboot)
 		current->signal->group_exit_code = pid_ns->reboot;
@ -239,9 +243,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp, loff_t *ppos)
 {
+	struct pid_namespace *pid_ns = task_active_pid_ns(current);
 	struct ctl_table tmp = *table;

-	if (write && !capable(CAP_SYS_ADMIN))
+	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;

 	/*
@ -250,7 +255,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 	 * it should synchronize its usage with external means.
 	 */

-	tmp.data = &current->nsproxy->pid_ns->last_pid;
+	tmp.data = &pid_ns->last_pid;
 	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 }

@ -299,6 +304,67 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
 	return 0;
 }

+/*
+ * proc_ns_operations.get for the pid namespace.
+ *
+ * Returns a referenced pointer to @task's active pid namespace, or NULL
+ * when the task has none.  The lookup and the reference grab both happen
+ * under rcu_read_lock().
+ *
+ * NOTE(review): task_active_pid_ns() is defined elsewhere; it is believed
+ * to return NULL once the task's struct pid has been detached - confirm.
+ * The NULL test keeps such a result from being passed to get_pid_ns().
+ */
+static void *pidns_get(struct task_struct *task)
+{
+	struct pid_namespace *ns;
+
+	rcu_read_lock();
+	ns = task_active_pid_ns(task);
+	if (ns)
+		get_pid_ns(ns);
+	rcu_read_unlock();
+
+	return ns;
+}
+
+/* proc_ns_operations.put for the pid namespace: drop one reference on @ns. */
+static void pidns_put(void *ns)
+{
+	struct pid_namespace *pid_ns = ns;
+
+	put_pid_ns(pid_ns);
+}
+
+/*
+ * proc_ns_operations.install for the pid namespace.
+ *
+ * Replaces @nsproxy->pid_ns with the pid namespace @ns.  Returns 0 on
+ * success, -EPERM when the caller lacks CAP_SYS_ADMIN in the user
+ * namespace owning the target, and -EINVAL when the target is neither the
+ * caller's active pid namespace nor one of its descendants.
+ */
+static int pidns_install(struct nsproxy *nsproxy, void *ns)
+{
+	struct pid_namespace *cur = task_active_pid_ns(current);
+	struct pid_namespace *target = ns;
+	struct pid_namespace *walk;
+
+	if (!ns_capable(target->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/*
+	 * The target must be the active pid namespace or nested below it:
+	 * fork has to be able to return a usable pid value, and a process
+	 * (and its children) must not escape its current pid namespace.
+	 *
+	 * A target shallower than the active namespace can never qualify.
+	 */
+	if (target->level < cur->level)
+		return -EINVAL;
+
+	/* Climb from the target up to the caller's level and compare. */
+	for (walk = target; walk->level > cur->level; walk = walk->parent)
+		;
+	if (walk != cur)
+		return -EINVAL;
+
+	put_pid_ns(nsproxy->pid_ns);
+	nsproxy->pid_ns = get_pid_ns(target);
+	return 0;
+}
+
+/* proc_ns_operations.inum for the pid namespace: report its proc_inum. */
+static unsigned int pidns_inum(void *ns)
+{
+	return ((struct pid_namespace *)ns)->proc_inum;
+}
+
+/*
+ * Namespace operations table for the "pid" namespace (CLONE_NEWPID),
+ * wiring up the get/put/install/inum helpers defined above.
+ */
+const struct proc_ns_operations pidns_operations = {
+	.name		= "pid",
+	.type		= CLONE_NEWPID,
+	.get		= pidns_get,
+	.put		= pidns_put,
+	.install	= pidns_install,
+	.inum		= pidns_inum,
+};
+
 static __init int pid_namespaces_init(void)
 {
 	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@ -215,8 +215,12 @@ ok:
 	smp_rmb();
 	if (task->mm)
 		dumpable = get_dumpable(task->mm);
-	if (!dumpable  && !ptrace_has_cap(task_user_ns(task), mode))
+	rcu_read_lock();
+	if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
+		rcu_read_unlock();
 		return -EPERM;
+	}
+	rcu_read_unlock();

 	return security_ptrace_access_check(task, mode);
 }
@ -280,8 +284,10 @@ static int ptrace_attach(struct task_struct *task, long request,

 	if (seize)
 		flags |= PT_SEIZED;
-	if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
+	rcu_read_lock();
+	if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
 		flags |= PT_PTRACE_CAP;
+	rcu_read_unlock();
 	task->ptrace = flags;

 	__ptrace_link(task, current);
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@ -4097,8 +4097,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 		goto out_free_cpus_allowed;
 	}
 	retval = -EPERM;
-	if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
+	if (!check_same_owner(p)) {
+		rcu_read_lock();
+		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+			rcu_read_unlock();
 			goto out_unlock;
+		}
+		rcu_read_unlock();
+	}

 	retval = security_task_setscheduler(p);
 	if (retval)
--- a/kernel/signal.c
+++ b/kernel/signal.c
@ -1753,7 +1753,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
 	 * see comment in do_notify_parent() about the following 4 lines
 	 */
 	rcu_read_lock();
-	info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
+	info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
 	info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
 	rcu_read_unlock();

--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
 		goto out_putname;
 	}

-	mnt = current->nsproxy->pid_ns->proc_mnt;
+	mnt = task_active_pid_ns(current)->proc_mnt;
 	file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
 	result = PTR_ERR(file);
 	if (IS_ERR(file))
--- a/kernel/user.c
+++ b/kernel/user.c
@ -16,6 +16,7 @@
 #include <linux/interrupt.h>
 #include <linux/export.h>
 #include <linux/user_namespace.h>
+#include <linux/proc_fs.h>

 /*
 * userns count is 1 for root user, 1 for init_uts_ns,
@ -51,6 +52,7 @@ struct user_namespace init_user_ns = {
 	},
 	.owner = GLOBAL_ROOT_UID,
 	.group = GLOBAL_ROOT_GID,
+	.proc_inum = PROC_USER_INIT_INO,
 };
 EXPORT_SYMBOL_GPL(init_user_ns);

--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@ -9,6 +9,7 @@
 #include <linux/nsproxy.h>
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
+#include <linux/proc_fs.h>
 #include <linux/highuid.h>
 #include <linux/cred.h>
 #include <linux/securebits.h>
@ -26,6 +27,24 @@ static struct kmem_cache *user_ns_cachep __read_mostly;
 static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
 				struct uid_gid_map *map);

+/*
+ * Bind @cred to @user_ns and reset its capability state.
+ *
+ * The credential ends up with the same capability sets init starts with,
+ * but they are useless outside @user_ns because capabilities are bound to
+ * the user namespace.  No reference on @user_ns is taken here; the
+ * pointer is stored as given.
+ */
+static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
+{
+	cred->cap_bset = CAP_FULL_SET;
+	cred->cap_permitted = CAP_FULL_SET;
+	cred->cap_effective = CAP_FULL_SET;
+	cred->cap_inheritable = CAP_EMPTY_SET;
+	cred->securebits = SECUREBITS_DEFAULT;
+#ifdef CONFIG_KEYS
+	/* Release the request-key authorisation key and forget it. */
+	key_put(cred->request_key_auth);
+	cred->request_key_auth = NULL;
+#endif
+	/*
+	 * Per the original author: tgcred will be cleared in our caller
+	 * because CLONE_THREAD won't be set.
+	 */
+	cred->user_ns = user_ns;
+}
+
 /*
 * Create a new user namespace, deriving the creator from the user in the
 * passed credentials, and replacing that user with the new root user for the
@ -39,6 +58,7 @@ int create_user_ns(struct cred *new)
 	struct user_namespace *ns, *parent_ns = new->user_ns;
 	kuid_t owner = new->euid;
 	kgid_t group = new->egid;
+	int ret;

 	/* The creator needs a mapping in the parent user namespace
 	 * or else we won't be able to reasonably tell userspace who
@ -52,38 +72,45 @@ int create_user_ns(struct cred *new)
 	if (!ns)
 		return -ENOMEM;

+	ret = proc_alloc_inum(&ns->proc_inum);
+	if (ret) {
+		kmem_cache_free(user_ns_cachep, ns);
+		return ret;
+	}
+
 	kref_init(&ns->kref);
+	/* Leave the new->user_ns reference with the new user namespace. */
 	ns->parent = parent_ns;
 	ns->owner = owner;
 	ns->group = group;

-	/* Start with the same capabilities as init but useless for doing
-	 * anything as the capabilities are bound to the new user namespace.
-	 */
-	new->securebits = SECUREBITS_DEFAULT;
-	new->cap_inheritable = CAP_EMPTY_SET;
-	new->cap_permitted = CAP_FULL_SET;
-	new->cap_effective = CAP_FULL_SET;
-	new->cap_bset = CAP_FULL_SET;
-#ifdef CONFIG_KEYS
-	key_put(new->request_key_auth);
-	new->request_key_auth = NULL;
-#endif
-	/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
-
-	/* Leave the new->user_ns reference with the new user namespace. */
-	/* Leave the reference to our user_ns with the new cred. */
-	new->user_ns = ns;
+	set_cred_user_ns(new, ns);

 	return 0;
 }

+/*
+ * Prepare credentials living in a brand-new user namespace for unshare().
+ *
+ * @unshare_flags: flags passed to unshare(); nothing is done unless
+ *                 CLONE_NEWUSER is set.
+ * @new_cred:      on success with CLONE_NEWUSER set, receives the prepared
+ *                 (not yet committed) credentials; left untouched otherwise.
+ *
+ * Returns 0 on success or when CLONE_NEWUSER is not requested, -ENOMEM if
+ * the credentials cannot be prepared, or the error from create_user_ns().
+ *
+ * *new_cred is only populated on success.  The caller's error path for a
+ * failure of this function does not release *new_cred, so the prepared
+ * credentials must be dropped here when create_user_ns() fails.
+ */
+int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
+{
+	struct cred *cred;
+	int err;
+
+	if (!(unshare_flags & CLONE_NEWUSER))
+		return 0;
+
+	cred = prepare_creds();
+	if (!cred)
+		return -ENOMEM;
+
+	err = create_user_ns(cred);
+	if (err) {
+		/* Do not leak, and do not publish, the half-built cred. */
+		put_cred(cred);
+		return err;
+	}
+
+	*new_cred = cred;
+	return 0;
+}
+
 void free_user_ns(struct kref *kref)
 {
 	struct user_namespace *parent, *ns =
 		container_of(kref, struct user_namespace, kref);

 	parent = ns->parent;
+	proc_free_inum(ns->proc_inum);
 	kmem_cache_free(user_ns_cachep, ns);
 	put_user_ns(parent);
 }
@ -372,7 +399,7 @@ static int uid_m_show(struct seq_file *seq, void *v)
 	struct user_namespace *lower_ns;
 	uid_t lower;

-	lower_ns = current_user_ns();
+	lower_ns = seq_user_ns(seq);
 	if ((lower_ns == ns) && lower_ns->parent)
 		lower_ns = lower_ns->parent;

@ -393,7 +420,7 @@ static int gid_m_show(struct seq_file *seq, void *v)
 	struct user_namespace *lower_ns;
 	gid_t lower;

-	lower_ns = current_user_ns();
+	lower_ns = seq_user_ns(seq);
 	if ((lower_ns == ns) && lower_ns->parent)
 		lower_ns = lower_ns->parent;

@ -669,10 +696,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz
 {
 	struct seq_file *seq = file->private_data;
 	struct user_namespace *ns = seq->private;
+	struct user_namespace *seq_ns = seq_user_ns(seq);

 	if (!ns->parent)
 		return -EPERM;

+	if ((seq_ns != ns) && (seq_ns != ns->parent))
+		return -EPERM;
+
 	return map_write(file, buf, size, ppos, CAP_SETUID,
 			 &ns->uid_map, &ns->parent->uid_map);
 }
@ -681,10 +712,14 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
 {
 	struct seq_file *seq = file->private_data;
 	struct user_namespace *ns = seq->private;
+	struct user_namespace *seq_ns = seq_user_ns(seq);

 	if (!ns->parent)
 		return -EPERM;

+	if ((seq_ns != ns) && (seq_ns != ns->parent))
+		return -EPERM;
+
 	return map_write(file, buf, size, ppos, CAP_SETGID,
 			 &ns->gid_map, &ns->parent->gid_map);
 }
@ -709,6 +744,21 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
 static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
 				struct uid_gid_map *new_map)
 {
+	/* Allow mapping to your own filesystem ids */
+	if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
+		u32 id = new_map->extent[0].lower_first;
+		if (cap_setid == CAP_SETUID) {
+			kuid_t uid = make_kuid(ns->parent, id);
+			if (uid_eq(uid, current_fsuid()))
+				return true;
+		}
+		else if (cap_setid == CAP_SETGID) {
+			kgid_t gid = make_kgid(ns->parent, id);
+			if (gid_eq(gid, current_fsgid()))
+				return true;
+		}
+	}
+
 	/* Allow anyone to set a mapping that doesn't require privilege */
 	if (!cap_valid(cap_setid))
 		return true;
@ -722,6 +772,65 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
 	return false;
 }

+/*
+ * proc_ns_operations.get for the user namespace: return @task's user
+ * namespace with a reference taken, read from the task's credentials
+ * under rcu_read_lock().
+ */
+static void *userns_get(struct task_struct *task)
+{
+	struct user_namespace *ns;
+	const struct cred *tcred;
+
+	rcu_read_lock();
+	tcred = __task_cred(task);
+	ns = get_user_ns(tcred->user_ns);
+	rcu_read_unlock();
+
+	return ns;
+}
+
+/* proc_ns_operations.put for the user namespace: drop one reference on @ns. */
+static void userns_put(void *ns)
+{
+	struct user_namespace *user_ns = ns;
+
+	put_user_ns(user_ns);
+}
+
+/*
+ * proc_ns_operations.install for the user namespace.
+ *
+ * Moves the current task into the user namespace @ns by committing a new
+ * set of credentials bound to it.  @nsproxy is not used.
+ *
+ * Returns the result of commit_creds() on success, -EINVAL when @ns is
+ * already the caller's user namespace or the caller's mm has more than one
+ * user, -EPERM without CAP_SYS_ADMIN in @ns, and -ENOMEM when credentials
+ * cannot be prepared.
+ */
+static int userns_install(struct nsproxy *nsproxy, void *ns)
+{
+	struct user_namespace *target = ns;
+	struct cred *new_cred;
+
+	/* Re-entering the same user namespace must not grant capabilities. */
+	if (target == current_user_ns())
+		return -EINVAL;
+
+	/* Threaded processes may not enter a different user namespace. */
+	if (atomic_read(&current->mm->mm_users) > 1)
+		return -EINVAL;
+
+	if (!ns_capable(target, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	new_cred = prepare_creds();
+	if (new_cred == NULL)
+		return -ENOMEM;
+
+	/* Swap the namespace reference held by the copied credentials. */
+	put_user_ns(new_cred->user_ns);
+	set_cred_user_ns(new_cred, get_user_ns(target));
+
+	return commit_creds(new_cred);
+}
+
+/* proc_ns_operations.inum for the user namespace: report its proc_inum. */
+static unsigned int userns_inum(void *ns)
+{
+	return ((struct user_namespace *)ns)->proc_inum;
+}
+
+/*
+ * Namespace operations table for the "user" namespace (CLONE_NEWUSER),
+ * wiring up the get/put/install/inum helpers defined above.
+ */
+const struct proc_ns_operations userns_operations = {
+	.name		= "user",
+	.type		= CLONE_NEWUSER,
+	.get		= userns_get,
+	.put		= userns_put,
+	.install	= userns_install,
+	.inum		= userns_inum,
+};
+
 static __init int user_namespaces_init(void)
 {
 	user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@ -32,18 +32,25 @@ static struct uts_namespace *create_uts_ns(void)
 * @old_ns: namespace to clone
 * Return NULL on error (failure to kmalloc), new ns otherwise
 */
-static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
+static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
 					  struct uts_namespace *old_ns)
 {
 	struct uts_namespace *ns;
+	int err;

 	ns = create_uts_ns();
 	if (!ns)
 		return ERR_PTR(-ENOMEM);

+	err = proc_alloc_inum(&ns->proc_inum);
+	if (err) {
+		kfree(ns);
+		return ERR_PTR(err);
+	}
+
 	down_read(&uts_sem);
 	memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
-	ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns));
+	ns->user_ns = get_user_ns(user_ns);
 	up_read(&uts_sem);
 	return ns;
 }
@ -55,9 +62,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
 * versa.
 */
 struct uts_namespace *copy_utsname(unsigned long flags,
-				   struct task_struct *tsk)
+	struct user_namespace *user_ns, struct uts_namespace *old_ns)
 {
-	struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
 	struct uts_namespace *new_ns;

 	BUG_ON(!old_ns);
@ -66,7 +72,7 @@ struct uts_namespace *copy_utsname(unsigned long flags,
 	if (!(flags & CLONE_NEWUTS))
 		return old_ns;

-	new_ns = clone_uts_ns(tsk, old_ns);
+	new_ns = clone_uts_ns(user_ns, old_ns);

 	put_uts_ns(old_ns);
 	return new_ns;
@ -78,6 +84,7 @@ void free_uts_ns(struct kref *kref)

 	ns = container_of(kref, struct uts_namespace, kref);
 	put_user_ns(ns->user_ns);
+	proc_free_inum(ns->proc_inum);
 	kfree(ns);
 }

@ -102,19 +109,31 @@ static void utsns_put(void *ns)
 	put_uts_ns(ns);
 }

-static int utsns_install(struct nsproxy *nsproxy, void *ns)
+static int utsns_install(struct nsproxy *nsproxy, void *new)
 {
+	struct uts_namespace *ns = new;
+
+	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
 	get_uts_ns(ns);
 	put_uts_ns(nsproxy->uts_ns);
 	nsproxy->uts_ns = ns;
 	return 0;
 }

+/* proc_ns_operations.inum for the uts namespace: report its proc_inum. */
+static unsigned int utsns_inum(void *vp)
+{
+	return ((struct uts_namespace *)vp)->proc_inum;
+}
+
 const struct proc_ns_operations utsns_operations = {
 	.name		= "uts",
 	.type		= CLONE_NEWUTS,
 	.get		= utsns_get,
 	.put		= utsns_put,
 	.install	= utsns_install,
+	.inum		= utsns_inum,
 };
-
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@ -381,6 +381,21 @@ struct net *get_net_ns_by_pid(pid_t pid)
 }
 EXPORT_SYMBOL_GPL(get_net_ns_by_pid);

+/*
+ * Per-namespace init hook: allocate the proc inode number for @net and
+ * store it in net->proc_inum.  Returns proc_alloc_inum()'s result.
+ */
+static __net_init int net_ns_net_init(struct net *net)
+{
+	int err = proc_alloc_inum(&net->proc_inum);
+
+	return err;
+}
+
+/* Per-namespace exit hook: give back the proc inode number held by @net. */
+static __net_exit void net_ns_net_exit(struct net *net)
+{
+	unsigned int inum = net->proc_inum;
+
+	proc_free_inum(inum);
+}
+
+/*
+ * Per-network-namespace hooks that allocate (init) and free (exit) the
+ * proc inode number kept in net->proc_inum.
+ */
+static struct pernet_operations __net_initdata net_ns_ops = {
+	.init = net_ns_net_init,
+	.exit = net_ns_net_exit,
+};
+
 static int __init net_ns_init(void)
 {
 	struct net_generic *ng;
@ -412,6 +427,8 @@ static int __init net_ns_init(void)

 	mutex_unlock(&net_mutex);

+	register_pernet_subsys(&net_ns_ops);
+
 	return 0;
 }

@ -630,16 +647,28 @@ static void netns_put(void *ns)

 static int netns_install(struct nsproxy *nsproxy, void *ns)
 {
+	struct net *net = ns;
+
+	if (!ns_capable(net->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
 	put_net(nsproxy->net_ns);
-	nsproxy->net_ns = get_net(ns);
+	nsproxy->net_ns = get_net(net);
 	return 0;
 }

+/* proc_ns_operations.inum for the net namespace: report its proc_inum. */
+static unsigned int netns_inum(void *ns)
+{
+	return ((struct net *)ns)->proc_inum;
+}
+
 const struct proc_ns_operations netns_operations = {
 	.name		= "net",
 	.type		= CLONE_NEWNET,
 	.get		= netns_get,
 	.put		= netns_put,
 	.install	= netns_install,
+	.inum		= netns_inum,
 };
 #endif
--- a/security/yama/yama_lsm.c
+++ b/security/yama/yama_lsm.c
@ -298,14 +298,18 @@ int yama_ptrace_access_check(struct task_struct *child,
 			/* No additional restrictions. */
 			break;
 		case YAMA_SCOPE_RELATIONAL:
+			rcu_read_lock();
 			if (!task_is_descendant(current, child) &&
 			    !ptracer_exception_found(current, child) &&
-			    !ns_capable(task_user_ns(child), CAP_SYS_PTRACE))
+			    !ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE))
 				rc = -EPERM;
+			rcu_read_unlock();
 			break;
 		case YAMA_SCOPE_CAPABILITY:
-			if (!ns_capable(task_user_ns(child), CAP_SYS_PTRACE))
+			rcu_read_lock();
+			if (!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE))
 				rc = -EPERM;
+			rcu_read_unlock();
 			break;
 		case YAMA_SCOPE_NO_ATTACH:
 		default:
@ -343,8 +347,10 @@ int yama_ptrace_traceme(struct task_struct *parent)
 	/* Only disallow PTRACE_TRACEME on more aggressive settings. */
 	switch (ptrace_scope) {
 	case YAMA_SCOPE_CAPABILITY:
-		if (!ns_capable(task_user_ns(parent), CAP_SYS_PTRACE))
+		rcu_read_lock();
+		if (!ns_capable(__task_cred(parent)->user_ns, CAP_SYS_PTRACE))
 			rc = -EPERM;
+		rcu_read_unlock();
 		break;
 	case YAMA_SCOPE_NO_ATTACH:
 		rc = -EPERM;