pidns: Make the pidns proc mount/umount logic obvious.
Track the number of pids in the proc hash table. When the number of pids goes to 0, schedule work to unmount the kernel mount of proc. Move the mount of proc into alloc_pid, when we allocate the pid for init. Remove the surprising calls of pid_ns_release_proc in fork and proc_flush_task. Those code paths really shouldn't know about proc namespace implementation details, and people have demonstrated several times that finding and understanding those code paths is difficult and non-obvious. Because of the call path, detach_pid is always called with the rtnl_lock held, so free_pid is not allowed to sleep; the work of unmounting proc is therefore moved to a work queue. This has the side benefit of not blocking the entire world waiting for the unnecessary rcu_barrier in deactivate_locked_super. In the process of making the code clear and obvious, this fixes a bug reported by Gao feng <gaofeng@cn.fujitsu.com> where we would leak a mount of proc during clone(CLONE_NEWPID|CLONE_NEWNET) if copy_pid_ns succeeded and copy_net_ns failed. Acked-by: "Serge E. Hallyn" <serge@hallyn.com> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
This commit is contained in:
		
							parent
							
								
									17cf22c33e
								
							
						
					
					
						commit
						0a01f2cc39
					
				| @ -2590,10 +2590,6 @@ void proc_flush_task(struct task_struct *task) | |||||||
| 		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, | 		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, | ||||||
| 					tgid->numbers[i].nr); | 					tgid->numbers[i].nr); | ||||||
| 	} | 	} | ||||||
| 
 |  | ||||||
| 	upid = &pid->numbers[pid->level]; |  | ||||||
| 	if (upid->nr == 1) |  | ||||||
| 		pid_ns_release_proc(upid->ns); |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static struct dentry *proc_pid_instantiate(struct inode *dir, | static struct dentry *proc_pid_instantiate(struct inode *dir, | ||||||
|  | |||||||
| @ -155,11 +155,6 @@ void __init proc_root_init(void) | |||||||
| 	err = register_filesystem(&proc_fs_type); | 	err = register_filesystem(&proc_fs_type); | ||||||
| 	if (err) | 	if (err) | ||||||
| 		return; | 		return; | ||||||
| 	err = pid_ns_prepare_proc(&init_pid_ns); |  | ||||||
| 	if (err) { |  | ||||||
| 		unregister_filesystem(&proc_fs_type); |  | ||||||
| 		return; |  | ||||||
| 	} |  | ||||||
| 
 | 
 | ||||||
| 	proc_self_init(); | 	proc_self_init(); | ||||||
| 	proc_symlink("mounts", NULL, "self/mounts"); | 	proc_symlink("mounts", NULL, "self/mounts"); | ||||||
|  | |||||||
| @ -21,6 +21,7 @@ struct pid_namespace { | |||||||
| 	struct kref kref; | 	struct kref kref; | ||||||
| 	struct pidmap pidmap[PIDMAP_ENTRIES]; | 	struct pidmap pidmap[PIDMAP_ENTRIES]; | ||||||
| 	int last_pid; | 	int last_pid; | ||||||
|  | 	int nr_hashed; | ||||||
| 	struct task_struct *child_reaper; | 	struct task_struct *child_reaper; | ||||||
| 	struct kmem_cache *pid_cachep; | 	struct kmem_cache *pid_cachep; | ||||||
| 	unsigned int level; | 	unsigned int level; | ||||||
| @ -32,6 +33,7 @@ struct pid_namespace { | |||||||
| 	struct bsd_acct_struct *bacct; | 	struct bsd_acct_struct *bacct; | ||||||
| #endif | #endif | ||||||
| 	struct user_namespace *user_ns; | 	struct user_namespace *user_ns; | ||||||
|  | 	struct work_struct proc_work; | ||||||
| 	kgid_t pid_gid; | 	kgid_t pid_gid; | ||||||
| 	int hide_pid; | 	int hide_pid; | ||||||
| 	int reboot;	/* group exit code if this pidns was rebooted */ | 	int reboot;	/* group exit code if this pidns was rebooted */ | ||||||
|  | |||||||
| @ -1476,8 +1476,6 @@ bad_fork_cleanup_io: | |||||||
| 	if (p->io_context) | 	if (p->io_context) | ||||||
| 		exit_io_context(p); | 		exit_io_context(p); | ||||||
| bad_fork_cleanup_namespaces: | bad_fork_cleanup_namespaces: | ||||||
| 	if (unlikely(clone_flags & CLONE_NEWPID)) |  | ||||||
| 		pid_ns_release_proc(p->nsproxy->pid_ns); |  | ||||||
| 	exit_task_namespaces(p); | 	exit_task_namespaces(p); | ||||||
| bad_fork_cleanup_mm: | bad_fork_cleanup_mm: | ||||||
| 	if (p->mm) | 	if (p->mm) | ||||||
|  | |||||||
							
								
								
									
										21
									
								
								kernel/pid.c
									
									
									
									
									
								
							
							
						
						
									
										21
									
								
								kernel/pid.c
									
									
									
									
									
								
							| @ -36,6 +36,7 @@ | |||||||
| #include <linux/pid_namespace.h> | #include <linux/pid_namespace.h> | ||||||
| #include <linux/init_task.h> | #include <linux/init_task.h> | ||||||
| #include <linux/syscalls.h> | #include <linux/syscalls.h> | ||||||
|  | #include <linux/proc_fs.h> | ||||||
| 
 | 
 | ||||||
| #define pid_hashfn(nr, ns)	\ | #define pid_hashfn(nr, ns)	\ | ||||||
| 	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) | 	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) | ||||||
| @ -270,8 +271,12 @@ void free_pid(struct pid *pid) | |||||||
| 	unsigned long flags; | 	unsigned long flags; | ||||||
| 
 | 
 | ||||||
| 	spin_lock_irqsave(&pidmap_lock, flags); | 	spin_lock_irqsave(&pidmap_lock, flags); | ||||||
| 	for (i = 0; i <= pid->level; i++) | 	for (i = 0; i <= pid->level; i++) { | ||||||
| 		hlist_del_rcu(&pid->numbers[i].pid_chain); | 		struct upid *upid = pid->numbers + i; | ||||||
|  | 		hlist_del_rcu(&upid->pid_chain); | ||||||
|  | 		if (--upid->ns->nr_hashed == 0) | ||||||
|  | 			schedule_work(&upid->ns->proc_work); | ||||||
|  | 	} | ||||||
| 	spin_unlock_irqrestore(&pidmap_lock, flags); | 	spin_unlock_irqrestore(&pidmap_lock, flags); | ||||||
| 
 | 
 | ||||||
| 	for (i = 0; i <= pid->level; i++) | 	for (i = 0; i <= pid->level; i++) | ||||||
| @ -293,6 +298,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) | |||||||
| 		goto out; | 		goto out; | ||||||
| 
 | 
 | ||||||
| 	tmp = ns; | 	tmp = ns; | ||||||
|  | 	pid->level = ns->level; | ||||||
| 	for (i = ns->level; i >= 0; i--) { | 	for (i = ns->level; i >= 0; i--) { | ||||||
| 		nr = alloc_pidmap(tmp); | 		nr = alloc_pidmap(tmp); | ||||||
| 		if (nr < 0) | 		if (nr < 0) | ||||||
| @ -303,17 +309,23 @@ struct pid *alloc_pid(struct pid_namespace *ns) | |||||||
| 		tmp = tmp->parent; | 		tmp = tmp->parent; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	if (unlikely(is_child_reaper(pid))) { | ||||||
|  | 		if (pid_ns_prepare_proc(ns)) | ||||||
|  | 			goto out_free; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	get_pid_ns(ns); | 	get_pid_ns(ns); | ||||||
| 	pid->level = ns->level; |  | ||||||
| 	atomic_set(&pid->count, 1); | 	atomic_set(&pid->count, 1); | ||||||
| 	for (type = 0; type < PIDTYPE_MAX; ++type) | 	for (type = 0; type < PIDTYPE_MAX; ++type) | ||||||
| 		INIT_HLIST_HEAD(&pid->tasks[type]); | 		INIT_HLIST_HEAD(&pid->tasks[type]); | ||||||
| 
 | 
 | ||||||
| 	upid = pid->numbers + ns->level; | 	upid = pid->numbers + ns->level; | ||||||
| 	spin_lock_irq(&pidmap_lock); | 	spin_lock_irq(&pidmap_lock); | ||||||
| 	for ( ; upid >= pid->numbers; --upid) | 	for ( ; upid >= pid->numbers; --upid) { | ||||||
| 		hlist_add_head_rcu(&upid->pid_chain, | 		hlist_add_head_rcu(&upid->pid_chain, | ||||||
| 				&pid_hash[pid_hashfn(upid->nr, upid->ns)]); | 				&pid_hash[pid_hashfn(upid->nr, upid->ns)]); | ||||||
|  | 		upid->ns->nr_hashed++; | ||||||
|  | 	} | ||||||
| 	spin_unlock_irq(&pidmap_lock); | 	spin_unlock_irq(&pidmap_lock); | ||||||
| 
 | 
 | ||||||
| out: | out: | ||||||
| @ -570,6 +582,7 @@ void __init pidmap_init(void) | |||||||
| 	/* Reserve PID 0. We never call free_pidmap(0) */ | 	/* Reserve PID 0. We never call free_pidmap(0) */ | ||||||
| 	set_bit(0, init_pid_ns.pidmap[0].page); | 	set_bit(0, init_pid_ns.pidmap[0].page); | ||||||
| 	atomic_dec(&init_pid_ns.pidmap[0].nr_free); | 	atomic_dec(&init_pid_ns.pidmap[0].nr_free); | ||||||
|  | 	init_pid_ns.nr_hashed = 1; | ||||||
| 
 | 
 | ||||||
| 	init_pid_ns.pid_cachep = KMEM_CACHE(pid, | 	init_pid_ns.pid_cachep = KMEM_CACHE(pid, | ||||||
| 			SLAB_HWCACHE_ALIGN | SLAB_PANIC); | 			SLAB_HWCACHE_ALIGN | SLAB_PANIC); | ||||||
|  | |||||||
| @ -72,6 +72,12 @@ err_alloc: | |||||||
| 	return NULL; | 	return NULL; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static void proc_cleanup_work(struct work_struct *work) | ||||||
|  | { | ||||||
|  | 	struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); | ||||||
|  | 	pid_ns_release_proc(ns); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ | /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ | ||||||
| #define MAX_PID_NS_LEVEL 32 | #define MAX_PID_NS_LEVEL 32 | ||||||
| 
 | 
 | ||||||
| @ -105,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns | |||||||
| 	ns->level = level; | 	ns->level = level; | ||||||
| 	ns->parent = get_pid_ns(parent_pid_ns); | 	ns->parent = get_pid_ns(parent_pid_ns); | ||||||
| 	ns->user_ns = get_user_ns(user_ns); | 	ns->user_ns = get_user_ns(user_ns); | ||||||
|  | 	INIT_WORK(&ns->proc_work, proc_cleanup_work); | ||||||
| 
 | 
 | ||||||
| 	set_bit(0, ns->pidmap[0].page); | 	set_bit(0, ns->pidmap[0].page); | ||||||
| 	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); | 	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); | ||||||
| @ -112,15 +119,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns | |||||||
| 	for (i = 1; i < PIDMAP_ENTRIES; i++) | 	for (i = 1; i < PIDMAP_ENTRIES; i++) | ||||||
| 		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); | 		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); | ||||||
| 
 | 
 | ||||||
| 	err = pid_ns_prepare_proc(ns); |  | ||||||
| 	if (err) |  | ||||||
| 		goto out_put_parent_pid_ns; |  | ||||||
| 
 |  | ||||||
| 	return ns; | 	return ns; | ||||||
| 
 | 
 | ||||||
| out_put_parent_pid_ns: |  | ||||||
| 	put_pid_ns(parent_pid_ns); |  | ||||||
| 	put_user_ns(user_ns); |  | ||||||
| out_free_map: | out_free_map: | ||||||
| 	kfree(ns->pidmap[0].page); | 	kfree(ns->pidmap[0].page); | ||||||
| out_free: | out_free: | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user