From be6d3e56a6b9b3a4ee44a0685e39e595073c6f0d Mon Sep 17 00:00:00 2001 From: Kentaro Takeda Date: Wed, 17 Dec 2008 13:24:15 +0900 Subject: [PATCH 01/34] introduce new LSM hooks where vfsmount is available. Add new LSM hooks for path-based checks. Call them on directory-modifying operations at the points where we still know the vfsmount involved. Signed-off-by: Kentaro Takeda Signed-off-by: Tetsuo Handa Signed-off-by: Toshiharu Harada Signed-off-by: Al Viro --- fs/namei.c | 36 ++++++++++ fs/open.c | 5 ++ include/linux/security.h | 137 +++++++++++++++++++++++++++++++++++++++ net/unix/af_unix.c | 4 ++ security/Kconfig | 9 +++ security/capability.c | 57 ++++++++++++++++ security/security.c | 66 +++++++++++++++++++ 7 files changed, 314 insertions(+) diff --git a/fs/namei.c b/fs/namei.c index af3783fff1de..ab441af4196b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1556,6 +1556,9 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) * Refuse to truncate files with mandatory locks held on them. */ error = locks_verify_locked(inode); + if (!error) + error = security_path_truncate(&nd->path, 0, + ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); if (!error) { DQUOT_INIT(inode); @@ -1586,7 +1589,11 @@ static int __open_namei_create(struct nameidata *nd, struct path *path, if (!IS_POSIXACL(dir->d_inode)) mode &= ~current->fs->umask; + error = security_path_mknod(&nd->path, path->dentry, mode, 0); + if (error) + goto out_unlock; error = vfs_create(dir->d_inode, path->dentry, mode, nd); +out_unlock: mutex_unlock(&dir->d_inode->i_mutex); dput(nd->path.dentry); nd->path.dentry = path->dentry; @@ -1999,6 +2006,9 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode, error = mnt_want_write(nd.path.mnt); if (error) goto out_dput; + error = security_path_mknod(&nd.path, dentry, mode, dev); + if (error) + goto out_drop_write; switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd); @@ -2011,6 +2021,7 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode, error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0); break; } +out_drop_write: mnt_drop_write(nd.path.mnt); out_dput: dput(dentry); @@ -2070,7 +2081,11 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode) error = mnt_want_write(nd.path.mnt); if (error) goto out_dput; + error = security_path_mkdir(&nd.path, dentry, mode); + if (error) + goto out_drop_write; error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode); +out_drop_write: mnt_drop_write(nd.path.mnt); out_dput: dput(dentry); @@ -2180,7 +2195,11 @@ static long do_rmdir(int dfd, const char __user *pathname) error = mnt_want_write(nd.path.mnt); if (error) goto exit3; + error = security_path_rmdir(&nd.path, dentry); + if (error) + goto exit4; error = vfs_rmdir(nd.path.dentry->d_inode, dentry); +exit4: mnt_drop_write(nd.path.mnt); exit3: dput(dentry); @@ -2265,7 +2284,11 @@ static long do_unlinkat(int dfd, const char __user *pathname) error = mnt_want_write(nd.path.mnt); if (error) goto exit2; + error = security_path_unlink(&nd.path, dentry); + if (error) + goto exit3; error = vfs_unlink(nd.path.dentry->d_inode, dentry); +exit3: mnt_drop_write(nd.path.mnt); exit2: dput(dentry); @@ -2346,7 +2369,11 @@ asmlinkage long sys_symlinkat(const char __user *oldname, error = mnt_want_write(nd.path.mnt); if (error) goto out_dput; + error = security_path_symlink(&nd.path, dentry, from); + if (error) + goto out_drop_write; error = vfs_symlink(nd.path.dentry->d_inode, dentry, from); +out_drop_write: mnt_drop_write(nd.path.mnt); out_dput: dput(dentry); @@ -2443,7 +2470,11 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname, error = mnt_want_write(nd.path.mnt); if (error) goto out_dput; + error = security_path_link(old_path.dentry, &nd.path, new_dentry); + if (error) + goto out_drop_write; error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry); +out_drop_write: mnt_drop_write(nd.path.mnt); out_dput: dput(new_dentry); @@ -2679,8 +2710,13 @@ asmlinkage long sys_renameat(int olddfd, const char __user *oldname, error = mnt_want_write(oldnd.path.mnt); if (error) goto exit5; + error = security_path_rename(&oldnd.path, old_dentry, + &newnd.path, new_dentry); + if (error) + goto exit6; error = vfs_rename(old_dir->d_inode, old_dentry, new_dir->d_inode, new_dentry); +exit6: mnt_drop_write(oldnd.path.mnt); exit5: dput(new_dentry); diff --git a/fs/open.c b/fs/open.c index c0a426d5766c..1cd7d40e9991 100644 --- a/fs/open.c +++ b/fs/open.c @@ -272,6 +272,8 @@ static long do_sys_truncate(const char __user *pathname, loff_t length) goto put_write_and_out; error = locks_verify_truncate(inode, NULL, length); + if (!error) + error = security_path_truncate(&path, length, 0); if (!error) { DQUOT_INIT(inode); error = do_truncate(path.dentry, length, 0, NULL); @@ -328,6 +330,9 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) goto out_putf; error = locks_verify_truncate(inode, file, length); + if (!error) + error = security_path_truncate(&file->f_path, length, + ATTR_MTIME|ATTR_CTIME); if (!error) error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); out_putf: diff --git a/include/linux/security.h b/include/linux/security.h index 3416cb85e77b..b92b5e453f64 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -335,17 +335,37 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @dir contains the inode structure of the parent directory of the new link. * @new_dentry contains the dentry structure for the new link. * Return 0 if permission is granted. + * @path_link: + * Check permission before creating a new hard link to a file. + * @old_dentry contains the dentry structure for an existing link + * to the file. + * @new_dir contains the path structure of the parent directory of + * the new link. + * @new_dentry contains the dentry structure for the new link. + * Return 0 if permission is granted. * @inode_unlink: * Check the permission to remove a hard link to a file. * @dir contains the inode structure of parent directory of the file. * @dentry contains the dentry structure for file to be unlinked. * Return 0 if permission is granted. + * @path_unlink: + * Check the permission to remove a hard link to a file. + * @dir contains the path structure of parent directory of the file. + * @dentry contains the dentry structure for file to be unlinked. + * Return 0 if permission is granted. * @inode_symlink: * Check the permission to create a symbolic link to a file. * @dir contains the inode structure of parent directory of the symbolic link. * @dentry contains the dentry structure of the symbolic link. * @old_name contains the pathname of file. * Return 0 if permission is granted. + * @path_symlink: + * Check the permission to create a symbolic link to a file. + * @dir contains the path structure of parent directory of + * the symbolic link. + * @dentry contains the dentry structure of the symbolic link. + * @old_name contains the pathname of file. + * Return 0 if permission is granted. * @inode_mkdir: * Check permissions to create a new directory in the existing directory * associated with inode strcture @dir. @@ -353,11 +373,25 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @dentry contains the dentry structure of new directory. * @mode contains the mode of new directory. * Return 0 if permission is granted. + * @path_mkdir: + * Check permissions to create a new directory in the existing directory + * associated with path strcture @path. + * @dir containst the path structure of parent of the directory + * to be created. + * @dentry contains the dentry structure of new directory. + * @mode contains the mode of new directory. + * Return 0 if permission is granted. * @inode_rmdir: * Check the permission to remove a directory. * @dir contains the inode structure of parent of the directory to be removed. * @dentry contains the dentry structure of directory to be removed. * Return 0 if permission is granted. + * @path_rmdir: + * Check the permission to remove a directory. + * @dir contains the path structure of parent of the directory to be + * removed. + * @dentry contains the dentry structure of directory to be removed. + * Return 0 if permission is granted. * @inode_mknod: * Check permissions when creating a special file (or a socket or a fifo * file created via the mknod system call). Note that if mknod operation @@ -368,6 +402,15 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @mode contains the mode of the new file. * @dev contains the device number. * Return 0 if permission is granted. + * @path_mknod: + * Check permissions when creating a file. Note that this hook is called + * even if mknod operation is being done for a regular file. + * @dir contains the path structure of parent of the new file. + * @dentry contains the dentry structure of the new file. + * @mode contains the mode of the new file. + * @dev contains the undecoded device number. Use new_decode_dev() to get + * the decoded device number. + * Return 0 if permission is granted. * @inode_rename: * Check for permission to rename a file or directory. * @old_dir contains the inode structure for parent of the old link. @@ -375,6 +418,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @new_dir contains the inode structure for parent of the new link. * @new_dentry contains the dentry structure of the new link. * Return 0 if permission is granted. + * @path_rename: + * Check for permission to rename a file or directory. + * @old_dir contains the path structure for parent of the old link. + * @old_dentry contains the dentry structure of the old link. + * @new_dir contains the path structure for parent of the new link. + * @new_dentry contains the dentry structure of the new link. + * Return 0 if permission is granted. * @inode_readlink: * Check the permission to read the symbolic link. * @dentry contains the dentry structure for the file link. @@ -403,6 +453,12 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @dentry contains the dentry structure for the file. * @attr is the iattr structure containing the new file attributes. * Return 0 if permission is granted. + * @path_truncate: + * Check permission before truncating a file. + * @path contains the path structure for the file. + * @length is the new length of the file. + * @time_attrs is the flags passed to do_truncate(). + * Return 0 if permission is granted. * @inode_getattr: * Check permission before obtaining file attributes. * @mnt is the vfsmount where the dentry was looked up @@ -1331,6 +1387,22 @@ struct security_operations { struct super_block *newsb); int (*sb_parse_opts_str) (char *options, struct security_mnt_opts *opts); +#ifdef CONFIG_SECURITY_PATH + int (*path_unlink) (struct path *dir, struct dentry *dentry); + int (*path_mkdir) (struct path *dir, struct dentry *dentry, int mode); + int (*path_rmdir) (struct path *dir, struct dentry *dentry); + int (*path_mknod) (struct path *dir, struct dentry *dentry, int mode, + unsigned int dev); + int (*path_truncate) (struct path *path, loff_t length, + unsigned int time_attrs); + int (*path_symlink) (struct path *dir, struct dentry *dentry, + const char *old_name); + int (*path_link) (struct dentry *old_dentry, struct path *new_dir, + struct dentry *new_dentry); + int (*path_rename) (struct path *old_dir, struct dentry *old_dentry, + struct path *new_dir, struct dentry *new_dentry); +#endif + int (*inode_alloc_security) (struct inode *inode); void (*inode_free_security) (struct inode *inode); int (*inode_init_security) (struct inode *inode, struct inode *dir, @@ -2705,6 +2777,71 @@ static inline void security_skb_classify_flow(struct sk_buff *skb, struct flowi #endif /* CONFIG_SECURITY_NETWORK_XFRM */ +#ifdef CONFIG_SECURITY_PATH +int security_path_unlink(struct path *dir, struct dentry *dentry); +int security_path_mkdir(struct path *dir, struct dentry *dentry, int mode); +int security_path_rmdir(struct path *dir, struct dentry *dentry); +int security_path_mknod(struct path *dir, struct dentry *dentry, int mode, + unsigned int dev); +int security_path_truncate(struct path *path, loff_t length, + unsigned int time_attrs); +int security_path_symlink(struct path *dir, struct dentry *dentry, + const char *old_name); +int security_path_link(struct dentry *old_dentry, struct path *new_dir, + struct dentry *new_dentry); +int security_path_rename(struct path *old_dir, struct dentry *old_dentry, + struct path *new_dir, struct dentry *new_dentry); +#else /* CONFIG_SECURITY_PATH */ +static inline int security_path_unlink(struct path *dir, struct dentry *dentry) +{ + return 0; +} + +static inline int security_path_mkdir(struct path *dir, struct dentry *dentry, + int mode) +{ + return 0; +} + +static inline int security_path_rmdir(struct path *dir, struct dentry *dentry) +{ + return 0; +} + +static inline int security_path_mknod(struct path *dir, struct dentry *dentry, + int mode, unsigned int dev) +{ + return 0; +} + +static inline int security_path_truncate(struct path *path, loff_t length, + unsigned int time_attrs) +{ + return 0; +} + +static inline int security_path_symlink(struct path *dir, struct dentry *dentry, + const char *old_name) +{ + return 0; +} + +static inline int security_path_link(struct dentry *old_dentry, + struct path *new_dir, + struct dentry *new_dentry) +{ + return 0; +} + +static inline int security_path_rename(struct path *old_dir, + struct dentry *old_dentry, + struct path *new_dir, + struct dentry *new_dentry) +{ + return 0; +} +#endif /* CONFIG_SECURITY_PATH */ + #ifdef CONFIG_KEYS #ifdef CONFIG_SECURITY diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c6250d0055d2..d1b89820ab4f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -836,7 +836,11 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) err = mnt_want_write(nd.path.mnt); if (err) goto out_mknod_dput; + err = security_path_mknod(&nd.path, dentry, mode, 0); + if (err) + goto out_mknod_drop_write; err = vfs_mknod(nd.path.dentry->d_inode, dentry, mode, 0); +out_mknod_drop_write: mnt_drop_write(nd.path.mnt); if (err) goto out_mknod_dput; diff --git a/security/Kconfig b/security/Kconfig index d9f47ce7e207..9438535d7fd0 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -81,6 +81,15 @@ config SECURITY_NETWORK_XFRM IPSec. If you are unsure how to answer this question, answer N. +config SECURITY_PATH + bool "Security hooks for pathname based access control" + depends on SECURITY + help + This enables the security hooks for pathname based access control. + If enabled, a security module can use these hooks to + implement pathname based access controls. + If you are unsure how to answer this question, answer N. + config SECURITY_FILE_CAPABILITIES bool "File POSIX Capabilities" default n diff --git a/security/capability.c b/security/capability.c index 2dce66fcb992..c545bd1300b5 100644 --- a/security/capability.c +++ b/security/capability.c @@ -263,6 +263,53 @@ static void cap_inode_getsecid(const struct inode *inode, u32 *secid) *secid = 0; } +#ifdef CONFIG_SECURITY_PATH +static int cap_path_mknod(struct path *dir, struct dentry *dentry, int mode, + unsigned int dev) +{ + return 0; +} + +static int cap_path_mkdir(struct path *dir, struct dentry *dentry, int mode) +{ + return 0; +} + +static int cap_path_rmdir(struct path *dir, struct dentry *dentry) +{ + return 0; +} + +static int cap_path_unlink(struct path *dir, struct dentry *dentry) +{ + return 0; +} + +static int cap_path_symlink(struct path *dir, struct dentry *dentry, + const char *old_name) +{ + return 0; +} + +static int cap_path_link(struct dentry *old_dentry, struct path *new_dir, + struct dentry *new_dentry) +{ + return 0; +} + +static int cap_path_rename(struct path *old_path, struct dentry *old_dentry, + struct path *new_path, struct dentry *new_dentry) +{ + return 0; +} + +static int cap_path_truncate(struct path *path, loff_t length, + unsigned int time_attrs) +{ + return 0; +} +#endif + static int cap_file_permission(struct file *file, int mask) { return 0; @@ -883,6 +930,16 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, inode_setsecurity); set_to_cap_if_null(ops, inode_listsecurity); set_to_cap_if_null(ops, inode_getsecid); +#ifdef CONFIG_SECURITY_PATH + set_to_cap_if_null(ops, path_mknod); + set_to_cap_if_null(ops, path_mkdir); + set_to_cap_if_null(ops, path_rmdir); + set_to_cap_if_null(ops, path_unlink); + set_to_cap_if_null(ops, path_symlink); + set_to_cap_if_null(ops, path_link); + set_to_cap_if_null(ops, path_rename); + set_to_cap_if_null(ops, path_truncate); +#endif set_to_cap_if_null(ops, file_permission); set_to_cap_if_null(ops, file_alloc_security); set_to_cap_if_null(ops, file_free_security); diff --git a/security/security.c b/security/security.c index d85dbb37c972..678d4d07b852 100644 --- a/security/security.c +++ b/security/security.c @@ -355,6 +355,72 @@ int security_inode_init_security(struct inode *inode, struct inode *dir, } EXPORT_SYMBOL(security_inode_init_security); +#ifdef CONFIG_SECURITY_PATH +int security_path_mknod(struct path *path, struct dentry *dentry, int mode, + unsigned int dev) +{ + if (unlikely(IS_PRIVATE(path->dentry->d_inode))) + return 0; + return security_ops->path_mknod(path, dentry, mode, dev); +} +EXPORT_SYMBOL(security_path_mknod); + +int security_path_mkdir(struct path *path, struct dentry *dentry, int mode) +{ + if (unlikely(IS_PRIVATE(path->dentry->d_inode))) + return 0; + return security_ops->path_mkdir(path, dentry, mode); +} + +int security_path_rmdir(struct path *path, struct dentry *dentry) +{ + if (unlikely(IS_PRIVATE(path->dentry->d_inode))) + return 0; + return security_ops->path_rmdir(path, dentry); +} + +int security_path_unlink(struct path *path, struct dentry *dentry) +{ + if (unlikely(IS_PRIVATE(path->dentry->d_inode))) + return 0; + return security_ops->path_unlink(path, dentry); +} + +int security_path_symlink(struct path *path, struct dentry *dentry, + const char *old_name) +{ + if (unlikely(IS_PRIVATE(path->dentry->d_inode))) + return 0; + return security_ops->path_symlink(path, dentry, old_name); +} + +int security_path_link(struct dentry *old_dentry, struct path *new_dir, + struct dentry *new_dentry) +{ + if (unlikely(IS_PRIVATE(old_dentry->d_inode))) + return 0; + return security_ops->path_link(old_dentry, new_dir, new_dentry); +} + +int security_path_rename(struct path *old_dir, struct dentry *old_dentry, + struct path *new_dir, struct dentry *new_dentry) +{ + if (unlikely(IS_PRIVATE(old_dentry->d_inode) || + (new_dentry->d_inode && IS_PRIVATE(new_dentry->d_inode)))) + return 0; + return security_ops->path_rename(old_dir, old_dentry, new_dir, + new_dentry); +} + +int security_path_truncate(struct path *path, loff_t length, + unsigned int time_attrs) +{ + if (unlikely(IS_PRIVATE(path->dentry->d_inode))) + return 0; + return security_ops->path_truncate(path, length, time_attrs); +} +#endif + int security_inode_create(struct inode *dir, struct dentry *dentry, int mode) { if (unlikely(IS_PRIVATE(dir))) From e2b689d82c0394e5239a3557a217f19e2f47f1be Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Thu, 4 Dec 2008 11:17:47 +0000 Subject: [PATCH 02/34] fs: reorder struct inotify_device on 64bits to remove padding Reorder struct inotify_device to remove 8 bytes of padding on 64bit builds, reducing size to 128 bytes . Therefore allocating from a smaller slab & using one fewer cachelines. Signed-off-by: Richard Kennedy ---- Hi, patch against 2.6.28-rc7. built & tested on AMDX2 desktop. I've not been able to send this to the listed inotify maintainers, I just get mail failures. So I guessed filesystem was the best home for it, hope that's ok. regards Richard Signed-off-by: Al Viro --- fs/inotify_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/inotify_user.c b/fs/inotify_user.c index e2425bbd871f..400f8064a548 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -76,10 +76,10 @@ struct inotify_device { struct mutex ev_mutex; /* protects event queue */ struct mutex up_mutex; /* synchronizes watch updates */ struct list_head events; /* list of queued events */ - atomic_t count; /* reference count */ struct user_struct *user; /* user who opened this dev */ struct inotify_handle *ih; /* inotify handle */ struct fasync_struct *fa; /* async notification */ + atomic_t count; /* reference count */ unsigned int queue_size; /* size of the queue (bytes) */ unsigned int event_count; /* number of pending events */ unsigned int max_events; /* maximum number of events */ From c2452f32786159ed85f0e4b21fec09258f822fc8 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 1 Dec 2008 09:33:43 +0100 Subject: [PATCH 03/34] shrink struct dentry struct dentry is one of the most critical structures in the kernel. So it's sad to see it going neglected. With CONFIG_PROFILING turned on (which is probably the common case at least for distros and kernel developers), sizeof(struct dcache) == 208 here (64-bit). This gives 19 objects per slab. I packed d_mounted into a hole, and took another 4 bytes off the inline name length to take the padding out from the end of the structure. This shinks it to 200 bytes. I could have gone the other way and increased the length to 40, but I'm aiming for a magic number, read on... I then got rid of the d_cookie pointer. This shrinks it to 192 bytes. Rant: why was this ever a good idea? The cookie system should increase its hash size or use a tree or something if lookups are a problem. Also the "fast dcookie lookups" in oprofile should be moved into the dcookie code -- how can oprofile possibly care about the dcookie_mutex? It gets dropped after get_dcookie() returns so it can't be providing any sort of protection. At 192 bytes, 21 objects fit into a 4K page, saving about 3MB on my system with ~140 000 entries allocated. 192 is also a multiple of 64, so we get nice cacheline alignment on 64 and 32 byte line systems -- any given dentry will now require 3 cachelines to touch all fields wheras previously it would require 4. I know the inline name size was chosen quite carefully, however with the reduction in cacheline footprint, it should actually be just about as fast to do a name lookup for a 36 character name as it was before the patch (and faster for other sizes). The memory footprint savings for names which are <= 32 or > 36 bytes long should more than make up for the memory cost for 33-36 byte names. Performance is a feature... Signed-off-by: Al Viro --- arch/powerpc/oprofile/cell/spu_task_sync.c | 2 +- drivers/oprofile/buffer_sync.c | 2 +- fs/dcache.c | 4 ---- fs/dcookies.c | 28 +++++++++++++++------- include/linux/dcache.h | 21 ++++++++++------ 5 files changed, 35 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c index 2949126d28d1..6b793aeda72e 100644 --- a/arch/powerpc/oprofile/cell/spu_task_sync.c +++ b/arch/powerpc/oprofile/cell/spu_task_sync.c @@ -297,7 +297,7 @@ static inline unsigned long fast_get_dcookie(struct path *path) { unsigned long cookie; - if (path->dentry->d_cookie) + if (path->dentry->d_flags & DCACHE_COOKIE) return (unsigned long)path->dentry; get_dcookie(path, &cookie); return cookie; diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index 737bd9484822..65e8294a9e29 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -200,7 +200,7 @@ static inline unsigned long fast_get_dcookie(struct path *path) { unsigned long cookie; - if (path->dentry->d_cookie) + if (path->dentry->d_flags & DCACHE_COOKIE) return (unsigned long)path->dentry; get_dcookie(path, &cookie); return cookie; diff --git a/fs/dcache.c b/fs/dcache.c index a1d86c7f3e66..fd244c7a7cc0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -34,7 +34,6 @@ #include #include "internal.h" - int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); @@ -948,9 +947,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) dentry->d_op = NULL; dentry->d_fsdata = NULL; dentry->d_mounted = 0; -#ifdef CONFIG_PROFILING - dentry->d_cookie = NULL; -#endif INIT_HLIST_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); diff --git a/fs/dcookies.c b/fs/dcookies.c index 855d4b1d619a..180e9fec4ad8 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -93,10 +93,15 @@ static struct dcookie_struct *alloc_dcookie(struct path *path) { struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache, GFP_KERNEL); + struct dentry *d; if (!dcs) return NULL; - path->dentry->d_cookie = dcs; + d = path->dentry; + spin_lock(&d->d_lock); + d->d_flags |= DCACHE_COOKIE; + spin_unlock(&d->d_lock); + dcs->path = *path; path_get(path); hash_dcookie(dcs); @@ -119,14 +124,14 @@ int get_dcookie(struct path *path, unsigned long *cookie) goto out; } - dcs = path->dentry->d_cookie; - - if (!dcs) + if (path->dentry->d_flags & DCACHE_COOKIE) { + dcs = find_dcookie((unsigned long)path->dentry); + } else { dcs = alloc_dcookie(path); - - if (!dcs) { - err = -ENOMEM; - goto out; + if (!dcs) { + err = -ENOMEM; + goto out; + } } *cookie = dcookie_value(dcs); @@ -251,7 +256,12 @@ out_kmem: static void free_dcookie(struct dcookie_struct * dcs) { - dcs->path.dentry->d_cookie = NULL; + struct dentry *d = dcs->path.dentry; + + spin_lock(&d->d_lock); + d->d_flags &= ~DCACHE_COOKIE; + spin_unlock(&d->d_lock); + path_put(&dcs->path); kmem_cache_free(dcookie_cache, dcs); } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index a37359d0bad1..c66d22487bf8 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -75,14 +75,22 @@ full_name_hash(const unsigned char *name, unsigned int len) return end_name_hash(hash); } -struct dcookie_struct; - -#define DNAME_INLINE_LEN_MIN 36 +/* + * Try to keep struct dentry aligned on 64 byte cachelines (this will + * give reasonable cacheline footprint with larger lines without the + * large memory footprint increase). + */ +#ifdef CONFIG_64BIT +#define DNAME_INLINE_LEN_MIN 32 /* 192 bytes */ +#else +#define DNAME_INLINE_LEN_MIN 40 /* 128 bytes */ +#endif struct dentry { atomic_t d_count; unsigned int d_flags; /* protected by d_lock */ spinlock_t d_lock; /* per dentry lock */ + int d_mounted; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ /* @@ -107,10 +115,7 @@ struct dentry { struct dentry_operations *d_op; struct super_block *d_sb; /* The root of the dentry tree */ void *d_fsdata; /* fs-specific data */ -#ifdef CONFIG_PROFILING - struct dcookie_struct *d_cookie; /* cookie, if any */ -#endif - int d_mounted; + unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */ }; @@ -177,6 +182,8 @@ d_iput: no no no yes #define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 /* Parent inode is watched */ +#define DCACHE_COOKIE 0x0040 /* For use by dcookie subsystem */ + extern spinlock_t dcache_lock; extern seqlock_t rename_lock; From dded4f4d5048e64a01cf52eed4d27c8cb2600525 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Mon, 1 Dec 2008 14:34:50 -0800 Subject: [PATCH 04/34] include: linux/fs.h: put declarations in __KERNEL__ include/linux/fs.h contains externs for a bunch of variables. That obviously belongs under ifdef __KERNEL__. Signed-off-by: Jan Engelhardt Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- include/linux/fs.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 001ded4845b4..c5e4c5c74034 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -21,7 +21,6 @@ /* Fixed constants first: */ #undef NR_OPEN -extern int sysctl_nr_open; #define INR_OPEN 1024 /* Initial setting for nfile rlimits */ #define BLOCK_SIZE_BITS 10 @@ -38,21 +37,13 @@ struct files_stat_struct { int nr_free_files; /* read only */ int max_files; /* tunable */ }; -extern struct files_stat_struct files_stat; -extern int get_max_files(void); struct inodes_stat_t { int nr_inodes; int nr_unused; int dummy[5]; /* padding for sysctl ABI compatibility */ }; -extern struct inodes_stat_t inodes_stat; -extern int leases_enable, lease_break_time; - -#ifdef CONFIG_DNOTIFY -extern int dir_notify_enable; -#endif #define NR_FILE 8192 /* this can well be larger on a larger system */ @@ -330,6 +321,15 @@ extern void __init inode_init(void); extern void __init inode_init_early(void); extern void __init files_init(unsigned long); +extern struct files_stat_struct files_stat; +extern int get_max_files(void); +extern int sysctl_nr_open; +extern struct inodes_stat_t inodes_stat; +extern int leases_enable, lease_break_time; +#ifdef CONFIG_DNOTIFY +extern int dir_notify_enable; +#endif + struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); From 5cc4a0341a1295ea56b2e62eb70d96d8fdb94ded Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Mon, 1 Dec 2008 14:34:51 -0800 Subject: [PATCH 05/34] fs/namespace.c: drop code after return The extra semicolon serves no purpose. Signed-off-by: Julia Lawall Reviewed-by: Richard Genoud Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 1c09cab8f7cf..a40685d800a8 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1990,7 +1990,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, if (!new_ns->root) { up_write(&namespace_sem); kfree(new_ns); - return ERR_PTR(-ENOMEM);; + return ERR_PTR(-ENOMEM); } spin_lock(&vfsmount_lock); list_add_tail(&new_ns->list, &new_ns->root->mnt_list); From a17d5232de7b53d34229de79ec22f4bb04adb7e4 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 19 Dec 2008 20:47:10 +0000 Subject: [PATCH 06/34] eCryptfs: check readlink result was not an error before using it The result from readlink is being used to index into the link name buffer without checking whether it is a valid length. If readlink returns an error this will fault or cause memory corruption. Cc: Tyler Hicks Cc: Dustin Kirkland Cc: ecryptfs-devel@lists.launchpad.net Signed-off-by: Duane Griffin Acked-by: Michael Halcrow Acked-by: Tyler Hicks Signed-off-by: Al Viro --- fs/ecryptfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 89209f00f9c7..5e78fc179886 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -673,10 +673,11 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ " "dentry->d_name.name = [%s]\n", dentry->d_name.name); rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); - buf[rc] = '\0'; set_fs(old_fs); if (rc < 0) goto out_free; + else + buf[rc] = '\0'; rc = 0; nd_set_link(nd, buf); goto out; From 035146851cfa2fe24c1d9dc7637bb009ad06b2f7 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 19 Dec 2008 20:47:11 +0000 Subject: [PATCH 07/34] vfs: introduce helper function to safely NUL-terminate symlinks A number of filesystems were potentially triggering kernel bugs due to corrupted symlink names on disk. This function helps safely terminate the names. Cc: Al Viro Cc: Andrew Morton Signed-off-by: Duane Griffin Signed-off-by: Al Viro --- include/linux/namei.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/namei.h b/include/linux/namei.h index 99eb80306dc5..fc2e03579877 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -94,4 +94,9 @@ static inline char *nd_get_link(struct nameidata *nd) return nd->saved_names[nd->depth]; } +static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) +{ + ((char *) name)[min(len, maxlen)] = '\0'; +} + #endif /* _LINUX_NAMEI_H */ From ebd09abbd9699f328165aee50a070403fbf55a37 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 19 Dec 2008 20:47:12 +0000 Subject: [PATCH 08/34] vfs: ensure page symlinks are NUL-terminated On-disk data corruption could cause a page link to have its i_size set to PAGE_SIZE (or a multiple thereof) and its contents all non-NUL. NUL-terminate the link name to ensure this doesn't cause further problems for the kernel. Cc: Al Viro Cc: Andrew Morton Signed-off-by: Duane Griffin Signed-off-by: Al Viro --- fs/namei.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index ab441af4196b..9ed5e2818f80 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2786,13 +2786,16 @@ int vfs_follow_link(struct nameidata *nd, const char *link) /* get the link contents into pagecache */ static char *page_getlink(struct dentry * dentry, struct page **ppage) { - struct page * page; + char *kaddr; + struct page *page; struct address_space *mapping = dentry->d_inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) return (char*)page; *ppage = page; - return kmap(page); + kaddr = kmap(page); + nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1); + return kaddr; } int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) From 8d6d0c4da2dbbe0a69fea3692146af39f139f8b4 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 19 Dec 2008 20:47:13 +0000 Subject: [PATCH 09/34] ext2: ensure fast symlinks are NUL-terminated Ensure fast symlink targets are NUL-terminated, even if corrupted on-disk. Cc: Andrew Morton Signed-off-by: Duane Griffin Signed-off-by: Al Viro --- fs/ext2/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 7658b33e2653..02b39a5deb74 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "ext2.h" #include "acl.h" #include "xip.h" @@ -1286,9 +1287,11 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) else inode->i_mapping->a_ops = &ext2_aops; } else if (S_ISLNK(inode->i_mode)) { - if (ext2_inode_is_fast_symlink(inode)) + if (ext2_inode_is_fast_symlink(inode)) { inode->i_op = &ext2_fast_symlink_inode_operations; - else { + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { inode->i_op = &ext2_symlink_inode_operations; if (test_opt(inode->i_sb, NOBH)) inode->i_mapping->a_ops = &ext2_nobh_aops; From b5ed3112b5f74c8ec1c7aa03a76c596635e85197 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 19 Dec 2008 20:47:14 +0000 Subject: [PATCH 10/34] ext3: ensure fast symlinks are NUL-terminated Ensure fast symlink targets are NUL-terminated, even if corrupted on-disk. Cc: Andrew Morton Cc: Stephen Tweedie Cc: linux-ext4@vger.kernel.org Signed-off-by: Duane Griffin Signed-off-by: Al Viro --- fs/ext3/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index f8424ad89971..c4bdccf976b5 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "xattr.h" #include "acl.h" @@ -2817,9 +2818,11 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext3_dir_inode_operations; inode->i_fop = &ext3_dir_operations; } else if (S_ISLNK(inode->i_mode)) { - if (ext3_inode_is_fast_symlink(inode)) + if (ext3_inode_is_fast_symlink(inode)) { inode->i_op = &ext3_fast_symlink_inode_operations; - else { + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { inode->i_op = &ext3_symlink_inode_operations; ext3_set_aops(inode); } From e83c1397cafc4e44f868289db5e417463c0d09a4 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 19 Dec 2008 20:47:15 +0000 Subject: [PATCH 11/34] ext4: ensure fast symlinks are NUL-terminated Ensure fast symlink targets are NUL-terminated, even if corrupted on-disk. Cc: Andrew Morton Cc: Theodore Ts'o Cc: adilger@sun.com Cc: linux-ext4@vger.kernel.org Signed-off-by: Duane Griffin Signed-off-by: Al Viro --- fs/ext4/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index be21a5ae33cb..7c3325e0b005 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include "ext4_jbd2.h" @@ -4164,9 +4165,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { - if (ext4_inode_is_fast_symlink(inode)) + if (ext4_inode_is_fast_symlink(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; - else { + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); } From 21acaf8e8da00235be59a3e489d5fa2a8721cafc Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 19 Dec 2008 20:47:16 +0000 Subject: [PATCH 12/34] sysv: ensure fast symlinks are NUL-terminated Ensure fast symlink targets are NUL-terminated, even if corrupted on-disk. Cc: Christoph Hellwig Cc: Al Viro Signed-off-by: Duane Griffin Signed-off-by: Al Viro --- fs/sysv/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index df0d435baa48..3d81bf58dae2 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "sysv.h" @@ -163,8 +164,11 @@ void sysv_set_inode(struct inode *inode, dev_t rdev) if (inode->i_blocks) { inode->i_op = &sysv_symlink_inode_operations; inode->i_mapping->a_ops = &sysv_aops; - } else + } else { inode->i_op = &sysv_fast_symlink_inode_operations; + nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size, + sizeof(SYSV_I(inode)->i_data) - 1); + } } else init_special_inode(inode, inode->i_mode, rdev); } From a63d0ff31a136bdf52350c4e6c2929eaf47ea2b2 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 19 Dec 2008 20:47:17 +0000 Subject: [PATCH 13/34] freevxfs: ensure fast symlinks are NUL-terminated Ensure fast symlink targets are NUL-terminated, even if corrupted on-disk. Cc: Christoph Hellwig Signed-off-by: Duane Griffin Signed-off-by: Al Viro --- fs/freevxfs/vxfs_inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 9f3f2ceb73f0..03a6ea5e99f7 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -325,8 +325,10 @@ vxfs_iget(struct super_block *sbp, ino_t ino) if (!VXFS_ISIMMED(vip)) { ip->i_op = &page_symlink_inode_operations; ip->i_mapping->a_ops = &vxfs_aops; - } else + } else { ip->i_op = &vxfs_immed_symlink_iops; + vip->vii_immed.vi_immed[ip->i_size] = '\0'; + } } else init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev)); From 7df5fa06de89a4ac311957e0cb9c1d87552b4325 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Fri, 19 Dec 2008 20:47:18 +0000 Subject: [PATCH 14/34] befs: ensure fast symlinks are NUL-terminated Ensure fast symlink targets are NUL-terminated, even if corrupted on-disk. Cc: Sergey S. Kostyliov Signed-off-by: Duane Griffin Signed-off-by: Al Viro --- fs/befs/linuxvfs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index b6dfee37c7b7..d06cb023ad02 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -378,7 +378,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode->i_size = 0; inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE; strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink, - BEFS_SYMLINK_LEN); + BEFS_SYMLINK_LEN - 1); + befs_ino->i_data.symlink[BEFS_SYMLINK_LEN - 1] = '\0'; } else { int num_blks; @@ -477,6 +478,8 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd) kfree(link); befs_error(sb, "Failed to read entire long symlink"); link = ERR_PTR(-EIO); + } else { + link[len - 1] = '\0'; } } else { link = befs_ino->i_data.symlink; From dc711ca35f9d95a1eec02118e0c298b5e3068315 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 3 Nov 2008 15:03:50 -0500 Subject: [PATCH 15/34] fix switch_names() breakage in short-to-short case We want ->name.len to match the resulting name on *both* source and target Signed-off-by: Al Viro --- fs/dcache.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index fd244c7a7cc0..eeafc14c2a14 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1616,8 +1616,11 @@ static void switch_names(struct dentry *dentry, struct dentry *target) */ memcpy(dentry->d_iname, target->d_name.name, target->d_name.len + 1); + dentry->d_name.len = target->d_name.len; + return; } } + do_switch(dentry->d_name.len, target->d_name.len); } /* @@ -1677,7 +1680,6 @@ already_unhashed: /* Switch the names.. */ switch_names(dentry, target); - do_switch(dentry->d_name.len, target->d_name.len); do_switch(dentry->d_name.hash, target->d_name.hash); /* ... and switch the parents */ @@ -1787,7 +1789,6 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) struct dentry *dparent, *aparent; switch_names(dentry, anon); - do_switch(dentry->d_name.len, anon->d_name.len); do_switch(dentry->d_name.hash, anon->d_name.hash); dparent = dentry->d_parent; From be42c4c433c2c0d3f1583c08908fead00d36d222 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 1 Dec 2008 14:34:58 -0800 Subject: [PATCH 16/34] correct wrong function name of d_put in kernel document and source comment no function named d_put(), it should be dput(). Impact: fix document and comment, no functionality changed Signed-off-by: Zhao Lei Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- Documentation/filesystems/vfs.txt | 2 +- fs/dcache.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 5579bda58a6d..041cb771d505 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -931,7 +931,7 @@ manipulate dentries: d_lookup: look up a dentry given its parent and path name component It looks up the child of that given name from the dcache hash table. If it is found, the reference count is incremented - and the dentry is returned. The caller must use d_put() + and the dentry is returned. The caller must use dput() to free the dentry when it finishes using it. For further information on dentry locking, please refer to the document diff --git a/fs/dcache.c b/fs/dcache.c index eeafc14c2a14..c231a639c2a2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1332,7 +1332,7 @@ err_out: * * Searches the children of the parent dentry for the name in question. If * the dentry is found its reference count is incremented and the dentry - * is returned. The caller must use d_put to free the entry when it has + * is returned. The caller must use dput to free the entry when it has * finished using it. %NULL is returned on failure. * * __d_lookup is dcache_lock free. The hash list is protected using RCU. From 52afeefb9dac9287429642189996426a2bfd6a25 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 1 Dec 2008 14:35:00 -0800 Subject: [PATCH 17/34] expand some comments (d_path / seq_path) Explain that you really need to use the return value of d_path rather than the buffer you passed into it. Also fix the comment for seq_path(), the function arguments changed recently but the comment hadn't been updated in sync. Signed-off-by: Arjan van de Ven Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 8 ++++++-- fs/seq_file.c | 10 ++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index c231a639c2a2..bdb3f50248a7 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1908,7 +1908,8 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) * Convert a dentry into an ASCII path name. If the entry has been deleted * the string " (deleted)" is appended. Note that this is ambiguous. * - * Returns the buffer or an error code if the path was too long. + * Returns a pointer into the buffer or an error code if the + * path was too long. * * "buflen" should be positive. Caller holds the dcache_lock. * @@ -1984,7 +1985,10 @@ Elong: * Convert a dentry into an ASCII path name. If the entry has been deleted * the string " (deleted)" is appended. Note that this is ambiguous. * - * Returns the buffer or an error code if the path was too long. + * Returns a pointer into the buffer or an error code if the path was + * too long. Note: Callers should use the returned pointer, not the passed + * in buffer, to use the name! The implementation often starts at an offset + * into the buffer, and may leave 0 bytes at the start. * * "buflen" should be positive. */ diff --git a/fs/seq_file.c b/fs/seq_file.c index 16c211558c22..99d8b8cfc9b7 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -389,8 +389,14 @@ char *mangle_path(char *s, char *p, char *esc) } EXPORT_SYMBOL(mangle_path); -/* - * return the absolute path of 'dentry' residing in mount 'mnt'. +/** + * seq_path - seq_file interface to print a pathname + * @m: the seq_file handle + * @path: the struct path to print + * @esc: set of characters to escape in the output + * + * return the absolute path of 'path', as represented by the + * dentry / mnt pair in the path parameter. */ int seq_path(struct seq_file *m, struct path *path, char *esc) { From 66f221875dc10813aa2f06c83ad60d0eb1356406 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Nov 2008 15:04:29 +0100 Subject: [PATCH 18/34] remove incorrect comment in inode_permission We now pass on all MAY_ flags to the filesystems permission routines, so remove the comment stating the contrary. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/namei.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index 9ed5e2818f80..631cfdd45c68 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -247,7 +247,6 @@ int inode_permission(struct inode *inode, int mask) return -EACCES; } - /* Ordinary permission routines do not understand MAY_APPEND. */ if (inode->i_op && inode->i_op->permission) retval = inode->i_op->permission(inode, mask); else From b4091d5f6fde28ab762e1094a1a26d81f3badfa5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Nov 2008 15:07:21 +0100 Subject: [PATCH 19/34] kill walk_init_root walk_init_root is a tiny helper that is marked __always_inline, has just one caller and an unused argument. Just merge it into the caller. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/namei.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 631cfdd45c68..d4d0b59ed2cc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -526,18 +526,6 @@ out_unlock: return result; } -/* SMP-safe */ -static __always_inline void -walk_init_root(const char *name, struct nameidata *nd) -{ - struct fs_struct *fs = current->fs; - - read_lock(&fs->lock); - nd->path = fs->root; - path_get(&fs->root); - read_unlock(&fs->lock); -} - /* * Wrapper to retry pathname resolution whenever the underlying * file system returns an ESTALE. @@ -575,9 +563,16 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l goto fail; if (*link == '/') { + struct fs_struct *fs = current->fs; + path_put(&nd->path); - walk_init_root(link, nd); + + read_lock(&fs->lock); + nd->path = fs->root; + path_get(&fs->root); + read_unlock(&fs->lock); } + res = link_path_walk(link, nd); if (nd->depth || res || nd->last_type!=LAST_NORM) return res; From 3fb64190aa3c23c10e6e9fd0124ac030115c99bf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2008 09:58:10 +0200 Subject: [PATCH 20/34] pass a struct path * to may_open No need for the nameidata in may_open - a struct path is enough. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/namei.c | 14 +++++++------- fs/nfsctl.c | 5 +++-- include/linux/fs.h | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index d4d0b59ed2cc..5cc0dc95a7a5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1487,9 +1487,9 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode, return error; } -int may_open(struct nameidata *nd, int acc_mode, int flag) +int may_open(struct path *path, int acc_mode, int flag) { - struct dentry *dentry = nd->path.dentry; + struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; int error; @@ -1510,13 +1510,13 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { flag &= ~O_TRUNC; } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) { - if (nd->path.mnt->mnt_flags & MNT_NODEV) + if (path->mnt->mnt_flags & MNT_NODEV) return -EACCES; flag &= ~O_TRUNC; } - error = vfs_permission(nd, acc_mode); + error = inode_permission(inode, acc_mode); if (error) return error; /* @@ -1551,7 +1551,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) */ error = locks_verify_locked(inode); if (!error) - error = security_path_truncate(&nd->path, 0, + error = security_path_truncate(path, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); if (!error) { DQUOT_INIT(inode); @@ -1594,7 +1594,7 @@ out_unlock: if (error) return error; /* Don't check for write permission, don't truncate */ - return may_open(nd, 0, flag & ~O_TRUNC); + return may_open(&nd->path, 0, flag & ~O_TRUNC); } /* @@ -1780,7 +1780,7 @@ ok: if (error) goto exit; } - error = may_open(&nd, acc_mode, flag); + error = may_open(&nd.path, acc_mode, flag); if (error) { if (will_write) mnt_drop_write(nd.path.mnt); diff --git a/fs/nfsctl.c b/fs/nfsctl.c index b1acbd6ab6fb..b27451909dff 100644 --- a/fs/nfsctl.c +++ b/fs/nfsctl.c @@ -38,9 +38,10 @@ static struct file *do_open(char *name, int flags) return ERR_PTR(error); if (flags == O_RDWR) - error = may_open(&nd,MAY_READ|MAY_WRITE,FMODE_READ|FMODE_WRITE); + error = may_open(&nd.path, MAY_READ|MAY_WRITE, + FMODE_READ|FMODE_WRITE); else - error = may_open(&nd, MAY_WRITE, FMODE_WRITE); + error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE); if (!error) return dentry_open(nd.path.dentry, nd.path.mnt, flags, diff --git a/include/linux/fs.h b/include/linux/fs.h index c5e4c5c74034..3468df5a06e0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1869,7 +1869,7 @@ extern void free_write_pipe(struct file *); extern struct file *do_filp_open(int dfd, const char *pathname, int open_flag, int mode); -extern int may_open(struct nameidata *, int, int); +extern int may_open(struct path *, int, int); extern int kernel_read(struct file *, unsigned long, char *, unsigned long); extern struct file * open_exec(const char *); From cb23beb55100171646e69e248fb45f10db6e99a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2008 09:59:29 +0200 Subject: [PATCH 21/34] kill vfs_permission With all the nameidata removal there's no point anymore for this helper. Of the three callers left two will go away with the next lookup series anyway. Also add proper kerneldoc to inode_permission as this is the main permission check routine now. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/exec.c | 5 +++-- fs/namei.c | 31 +++++++++++++------------------ include/linux/fs.h | 1 - 3 files changed, 16 insertions(+), 21 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 02d2e120542d..dfbf7009fbe7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -127,7 +127,8 @@ asmlinkage long sys_uselib(const char __user * library) if (nd.path.mnt->mnt_flags & MNT_NOEXEC) goto exit; - error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN); + error = inode_permission(nd.path.dentry->d_inode, + MAY_READ | MAY_EXEC | MAY_OPEN); if (error) goto exit; @@ -680,7 +681,7 @@ struct file *open_exec(const char *name) if (nd.path.mnt->mnt_flags & MNT_NOEXEC) goto out_path_put; - err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN); + err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN); if (err) goto out_path_put; diff --git a/fs/namei.c b/fs/namei.c index 5cc0dc95a7a5..3f88e043d459 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -226,6 +226,16 @@ int generic_permission(struct inode *inode, int mask, return -EACCES; } +/** + * inode_permission - check for access rights to a given inode + * @inode: inode to check permission on + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * + * Used to check for read/write/execute permissions on an inode. + * We use "fsuid" for this, letting us set arbitrary permissions + * for filesystem access without changing the "normal" uids which + * are used for other things. + */ int inode_permission(struct inode *inode, int mask) { int retval; @@ -263,21 +273,6 @@ int inode_permission(struct inode *inode, int mask) mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND)); } -/** - * vfs_permission - check for access rights to a given path - * @nd: lookup result that describes the path - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) - * - * Used to check for read/write/execute permissions on a path. - * We use "fsuid" for this, letting us set arbitrary permissions - * for filesystem access without changing the "normal" uids which - * are used for other things. - */ -int vfs_permission(struct nameidata *nd, int mask) -{ - return inode_permission(nd->path.dentry->d_inode, mask); -} - /** * file_permission - check for additional access rights to a given file * @file: file to check access rights for @@ -288,7 +283,7 @@ int vfs_permission(struct nameidata *nd, int mask) * * Note: * Do not use this function in new code. All access checks should - * be done using vfs_permission(). + * be done using inode_permission(). */ int file_permission(struct file *file, int mask) { @@ -853,7 +848,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd) nd->flags |= LOOKUP_CONTINUE; err = exec_permission_lite(inode); if (err == -EAGAIN) - err = vfs_permission(nd, MAY_EXEC); + err = inode_permission(nd->path.dentry->d_inode, + MAY_EXEC); if (err) break; @@ -2882,7 +2878,6 @@ EXPORT_SYMBOL(path_lookup); EXPORT_SYMBOL(kern_path); EXPORT_SYMBOL(vfs_path_lookup); EXPORT_SYMBOL(inode_permission); -EXPORT_SYMBOL(vfs_permission); EXPORT_SYMBOL(file_permission); EXPORT_SYMBOL(unlock_rename); EXPORT_SYMBOL(vfs_create); diff --git a/include/linux/fs.h b/include/linux/fs.h index 3468df5a06e0..fd615986a41c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1212,7 +1212,6 @@ extern void unlock_super(struct super_block *); /* * VFS helper functions.. */ -extern int vfs_permission(struct nameidata *, int); extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); extern int vfs_mkdir(struct inode *, struct dentry *, int); extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); From 18d8fda7c3c9439be04d7ea2e82da2513b121acb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 26 Dec 2008 00:35:37 -0500 Subject: [PATCH 22/34] take init_fs to saner place Signed-off-by: Al Viro --- arch/alpha/kernel/init_task.c | 1 - arch/arm/kernel/init_task.c | 1 - arch/avr32/kernel/init_task.c | 1 - arch/blackfin/kernel/init_task.c | 1 - arch/cris/kernel/process.c | 1 - arch/frv/kernel/init_task.c | 1 - arch/h8300/kernel/init_task.c | 1 - arch/ia64/kernel/init_task.c | 1 - arch/m32r/kernel/init_task.c | 1 - arch/m68k/kernel/process.c | 1 - arch/m68knommu/kernel/init_task.c | 1 - arch/mips/kernel/init_task.c | 1 - arch/mn10300/kernel/init_task.c | 1 - arch/parisc/kernel/init_task.c | 1 - arch/powerpc/kernel/init_task.c | 1 - arch/s390/kernel/init_task.c | 1 - arch/sh/kernel/init_task.c | 1 - arch/sparc/kernel/init_task.c | 1 - arch/um/kernel/init_task.c | 1 - arch/x86/kernel/init_task.c | 1 - arch/xtensa/kernel/init_task.c | 1 - fs/namei.c | 7 +++++++ include/linux/fs_struct.h | 6 ------ include/linux/init_task.h | 1 + 24 files changed, 8 insertions(+), 27 deletions(-) diff --git a/arch/alpha/kernel/init_task.c b/arch/alpha/kernel/init_task.c index 1f762189fa64..c2938e574a40 100644 --- a/arch/alpha/kernel/init_task.c +++ b/arch/alpha/kernel/init_task.c @@ -8,7 +8,6 @@ #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/arm/kernel/init_task.c b/arch/arm/kernel/init_task.c index 0bbf80625395..e859af349467 100644 --- a/arch/arm/kernel/init_task.c +++ b/arch/arm/kernel/init_task.c @@ -12,7 +12,6 @@ #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/avr32/kernel/init_task.c b/arch/avr32/kernel/init_task.c index 44058469c6ec..993d56ee3cf3 100644 --- a/arch/avr32/kernel/init_task.c +++ b/arch/avr32/kernel/init_task.c @@ -13,7 +13,6 @@ #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/blackfin/kernel/init_task.c b/arch/blackfin/kernel/init_task.c index 6bdba7b21109..2c228c020978 100644 --- a/arch/blackfin/kernel/init_task.c +++ b/arch/blackfin/kernel/init_task.c @@ -33,7 +33,6 @@ #include #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index 5933656db5a2..60816e876455 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -37,7 +37,6 @@ * setup. */ -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/frv/kernel/init_task.c b/arch/frv/kernel/init_task.c index e2198815b630..29429a8b7f6a 100644 --- a/arch/frv/kernel/init_task.c +++ b/arch/frv/kernel/init_task.c @@ -10,7 +10,6 @@ #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/h8300/kernel/init_task.c b/arch/h8300/kernel/init_task.c index 93a4899e46c2..cb5dc552da97 100644 --- a/arch/h8300/kernel/init_task.c +++ b/arch/h8300/kernel/init_task.c @@ -12,7 +12,6 @@ #include #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c index 9d7e1c66faf4..5b0e830c6f33 100644 --- a/arch/ia64/kernel/init_task.c +++ b/arch/ia64/kernel/init_task.c @@ -17,7 +17,6 @@ #include #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/m32r/kernel/init_task.c b/arch/m32r/kernel/init_task.c index 0d658dbb6766..016885c6f260 100644 --- a/arch/m32r/kernel/init_task.c +++ b/arch/m32r/kernel/init_task.c @@ -11,7 +11,6 @@ #include #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 3042c2bc8c58..632ce016014d 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -40,7 +40,6 @@ * alignment requirements and potentially different initial * setup. */ -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/m68knommu/kernel/init_task.c b/arch/m68knommu/kernel/init_task.c index 344c01aede08..fe282de1d596 100644 --- a/arch/m68knommu/kernel/init_task.c +++ b/arch/m68knommu/kernel/init_task.c @@ -12,7 +12,6 @@ #include #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/mips/kernel/init_task.c b/arch/mips/kernel/init_task.c index d72487ad7c15..149cd914526e 100644 --- a/arch/mips/kernel/init_task.c +++ b/arch/mips/kernel/init_task.c @@ -9,7 +9,6 @@ #include #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/mn10300/kernel/init_task.c b/arch/mn10300/kernel/init_task.c index af16f6e5c918..5ac3566f8c98 100644 --- a/arch/mn10300/kernel/init_task.c +++ b/arch/mn10300/kernel/init_task.c @@ -18,7 +18,6 @@ #include #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/parisc/kernel/init_task.c b/arch/parisc/kernel/init_task.c index f5941c086551..1e25a45d64c1 100644 --- a/arch/parisc/kernel/init_task.c +++ b/arch/parisc/kernel/init_task.c @@ -34,7 +34,6 @@ #include #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/powerpc/kernel/init_task.c b/arch/powerpc/kernel/init_task.c index 4c85b8d56478..688b329800bd 100644 --- a/arch/powerpc/kernel/init_task.c +++ b/arch/powerpc/kernel/init_task.c @@ -7,7 +7,6 @@ #include #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/s390/kernel/init_task.c b/arch/s390/kernel/init_task.c index e80716843619..7db95c0b8693 100644 --- a/arch/s390/kernel/init_task.c +++ b/arch/s390/kernel/init_task.c @@ -16,7 +16,6 @@ #include #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/sh/kernel/init_task.c b/arch/sh/kernel/init_task.c index b151a25cb14d..80c35ff71d56 100644 --- a/arch/sh/kernel/init_task.c +++ b/arch/sh/kernel/init_task.c @@ -7,7 +7,6 @@ #include #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct pt_regs fake_swapper_regs; diff --git a/arch/sparc/kernel/init_task.c b/arch/sparc/kernel/init_task.c index 62126e4cec54..f28cb8278e98 100644 --- a/arch/sparc/kernel/init_task.c +++ b/arch/sparc/kernel/init_task.c @@ -8,7 +8,6 @@ #include #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/um/kernel/init_task.c b/arch/um/kernel/init_task.c index 910eda8fca18..806d381947bf 100644 --- a/arch/um/kernel/init_task.c +++ b/arch/um/kernel/init_task.c @@ -10,7 +10,6 @@ #include "linux/mqueue.h" #include "asm/uaccess.h" -static struct fs_struct init_fs = INIT_FS; struct mm_struct init_mm = INIT_MM(init_mm); static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index d39918076bb4..df3bf269beab 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -10,7 +10,6 @@ #include #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/xtensa/kernel/init_task.c b/arch/xtensa/kernel/init_task.c index 3df469dbe814..e07f5c9fcd35 100644 --- a/arch/xtensa/kernel/init_task.c +++ b/arch/xtensa/kernel/init_task.c @@ -21,7 +21,6 @@ #include -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/fs/namei.c b/fs/namei.c index 3f88e043d459..e203691b9d12 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2893,3 +2893,10 @@ EXPORT_SYMBOL(vfs_symlink); EXPORT_SYMBOL(vfs_unlink); EXPORT_SYMBOL(dentry_unhash); EXPORT_SYMBOL(generic_readlink); + +/* to be mentioned only in INIT_TASK */ +struct fs_struct init_fs = { + .count = ATOMIC_INIT(1), + .lock = RW_LOCK_UNLOCKED, + .umask = 0022, +}; diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 9e5a06e78d02..a97c053d3a9a 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -10,12 +10,6 @@ struct fs_struct { struct path root, pwd; }; -#define INIT_FS { \ - .count = ATOMIC_INIT(1), \ - .lock = RW_LOCK_UNLOCKED, \ - .umask = 0022, \ -} - extern struct kmem_cache *fs_cachep; extern void exit_fs(struct task_struct *); diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 959f5522d10a..2f3c2d4ef73b 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -12,6 +12,7 @@ #include extern struct files_struct init_files; +extern struct fs_struct init_fs; #define INIT_KIOCTX(name, which_mm) \ { \ From 1239f26c05899f1f3c541b41e719c59d58038786 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 10 Dec 2008 18:37:28 -0500 Subject: [PATCH 23/34] make INIT_FS use the __RW_LOCK_UNLOCKED initialization [AV: rediffed on top of unification of init_fs] Initialization of init_fs still uses the deprecated RW_LOCK_UNLOCKED macro. This patch updates it to use the __RW_LOCK_UNLOCKED(lock) macro. Signed-off-by: Steven Rostedt Signed-off-by: Al Viro --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index e203691b9d12..dd5c9f0bf829 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2897,6 +2897,6 @@ EXPORT_SYMBOL(generic_readlink); /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { .count = ATOMIC_INIT(1), - .lock = RW_LOCK_UNLOCKED, + .lock = __RW_LOCK_UNLOCKED(init_fs.lock), .umask = 0022, }; From fd659fd6275d3426d7967da1f0e3638bbbd2fedb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 10 Dec 2008 09:35:45 -0800 Subject: [PATCH 24/34] fix f_count description in Documentation/filesystems/files.txt Documentation/filesystems/files.txt was not updated when f_count became an atomic_long_t. atomic_long_inc_not_zero() is now used instead of atomic_inc_not_zero() Signed-off-by: Al Viro --- Documentation/filesystems/files.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/files.txt b/Documentation/filesystems/files.txt index bb0142f61084..ac2facc50d2a 100644 --- a/Documentation/filesystems/files.txt +++ b/Documentation/filesystems/files.txt @@ -76,13 +76,13 @@ the fdtable structure - 5. Handling of the file structures is special. Since the look-up of the fd (fget()/fget_light()) are lock-free, it is possible that look-up may race with the last put() operation on the - file structure. This is avoided using atomic_inc_not_zero() + file structure. This is avoided using atomic_long_inc_not_zero() on ->f_count : rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (atomic_inc_not_zero(&file->f_count)) + if (atomic_long_inc_not_zero(&file->f_count)) *fput_needed = 1; else /* Didn't get the reference, someone's freed */ @@ -92,7 +92,7 @@ the fdtable structure - .... return file; - atomic_inc_not_zero() detects if refcounts is already zero or + atomic_long_inc_not_zero() detects if refcounts is already zero or goes to zero during increment. If it does, we fail fget()/fget_light(). From b6b3fdead251d432f32f2cfce2a893ab8a658110 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 10 Dec 2008 09:35:45 -0800 Subject: [PATCH 25/34] filp_cachep can be static in fs/file_table.c Instead of creating the "filp" kmem_cache in vfs_caches_init(), we can do it a litle be later in files_init(), so that filp_cachep is static to fs/file_table.c Acked-by: Paul E. McKenney Signed-off-by: Eric Dumazet Signed-off-by: Al Viro --- fs/dcache.c | 6 ------ fs/file_table.c | 10 +++++++++- include/linux/fdtable.h | 2 -- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index bdb3f50248a7..e88c23b85a32 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2314,9 +2314,6 @@ static void __init dcache_init(void) /* SLAB cache for __getname() consumers */ struct kmem_cache *names_cachep __read_mostly; -/* SLAB cache for file structures */ -struct kmem_cache *filp_cachep __read_mostly; - EXPORT_SYMBOL(d_genocide); void __init vfs_caches_init_early(void) @@ -2338,9 +2335,6 @@ void __init vfs_caches_init(unsigned long mempages) names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - dcache_init(); inode_init(); files_init(mempages); diff --git a/fs/file_table.c b/fs/file_table.c index 0fbcacc3ea75..bbeeac6efa1a 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -32,6 +32,9 @@ struct files_stat_struct files_stat = { /* public. Not pretty! */ __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); +/* SLAB cache for file structures */ +static struct kmem_cache *filp_cachep __read_mostly; + static struct percpu_counter nr_files __cacheline_aligned_in_smp; static inline void file_free_rcu(struct rcu_head *head) @@ -397,7 +400,12 @@ too_bad: void __init files_init(unsigned long mempages) { int n; - /* One file with associated inode and dcache is very roughly 1K. + + filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + /* + * One file with associated inode and dcache is very roughly 1K. * Per default don't use more than 10% of our memory for files. */ diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 4aab6f12cfab..09d6c5bbdddd 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -57,8 +57,6 @@ struct files_struct { #define files_fdtable(files) (rcu_dereference((files)->fdt)) -extern struct kmem_cache *filp_cachep; - struct file_operations; struct vfsmount; struct dentry; From 6badd79bd002788aaec27b50a74ab69ef65ab8ee Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 26 Dec 2008 00:57:40 -0500 Subject: [PATCH 26/34] kill ->dir_notify() Remove the hopelessly misguided ->dir_notify(). The only instance (cifs) has been broken by design from the very beginning; the objects it creates are never destroyed, keep references to struct file they can outlive, nothing that could possibly evict them exists on close(2) path *and* no locking whatsoever is done to prevent races with close(), should the previous, er, deficiencies someday be dealt with. Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 - Documentation/filesystems/vfs.txt | 3 - fs/bad_inode.c | 6 -- fs/cifs/Makefile | 2 +- fs/cifs/cifsfs.c | 7 -- fs/cifs/cifsfs.h | 1 - fs/cifs/fcntl.c | 118 ------------------------------ fs/dnotify.c | 3 - include/linux/fs.h | 1 - 9 files changed, 1 insertion(+), 142 deletions(-) delete mode 100644 fs/cifs/fcntl.c diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 23d2f4460deb..ccec55394380 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -394,7 +394,6 @@ prototypes: unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); - int (*dir_notify)(struct file *, unsigned long); }; locking rules: @@ -424,7 +423,6 @@ sendfile: no sendpage: no get_unmapped_area: no check_flags: no -dir_notify: no ->llseek() locking has moved from llseek to the individual llseek implementations. If your fs is not using generic_file_llseek, you diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 041cb771d505..ef19afa186a9 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -733,7 +733,6 @@ struct file_operations { ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); - int (*dir_notify)(struct file *filp, unsigned long arg); int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int); @@ -800,8 +799,6 @@ otherwise noted. check_flags: called by the fcntl(2) system call for F_SETFL command - dir_notify: called by the fcntl(2) system call for F_NOTIFY command - flock: called by the flock(2) system call splice_write: called by the VFS to splice data from a pipe to a file. This diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 5f1538c03b1b..a05287a23f62 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -132,11 +132,6 @@ static int bad_file_check_flags(int flags) return -EIO; } -static int bad_file_dir_notify(struct file *file, unsigned long arg) -{ - return -EIO; -} - static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl) { return -EIO; @@ -179,7 +174,6 @@ static const struct file_operations bad_file_ops = .sendpage = bad_file_sendpage, .get_unmapped_area = bad_file_get_unmapped_area, .check_flags = bad_file_check_flags, - .dir_notify = bad_file_dir_notify, .flock = bad_file_flock, .splice_write = bad_file_splice_write, .splice_read = bad_file_splice_read, diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 6ba43fb346fb..9948c0030e86 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ - md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o \ + md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o cifsacl.o cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 0005a194a75c..13ea53251dcf 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -747,7 +747,6 @@ const struct file_operations cifs_file_ops = { #endif /* CONFIG_CIFS_POSIX */ #ifdef CONFIG_CIFS_EXPERIMENTAL - .dir_notify = cifs_dir_notify, .setlease = cifs_setlease, #endif /* CONFIG_CIFS_EXPERIMENTAL */ }; @@ -768,7 +767,6 @@ const struct file_operations cifs_file_direct_ops = { #endif /* CONFIG_CIFS_POSIX */ .llseek = cifs_llseek, #ifdef CONFIG_CIFS_EXPERIMENTAL - .dir_notify = cifs_dir_notify, .setlease = cifs_setlease, #endif /* CONFIG_CIFS_EXPERIMENTAL */ }; @@ -789,7 +787,6 @@ const struct file_operations cifs_file_nobrl_ops = { #endif /* CONFIG_CIFS_POSIX */ #ifdef CONFIG_CIFS_EXPERIMENTAL - .dir_notify = cifs_dir_notify, .setlease = cifs_setlease, #endif /* CONFIG_CIFS_EXPERIMENTAL */ }; @@ -809,7 +806,6 @@ const struct file_operations cifs_file_direct_nobrl_ops = { #endif /* CONFIG_CIFS_POSIX */ .llseek = cifs_llseek, #ifdef CONFIG_CIFS_EXPERIMENTAL - .dir_notify = cifs_dir_notify, .setlease = cifs_setlease, #endif /* CONFIG_CIFS_EXPERIMENTAL */ }; @@ -818,9 +814,6 @@ const struct file_operations cifs_dir_ops = { .readdir = cifs_readdir, .release = cifs_closedir, .read = generic_read_dir, -#ifdef CONFIG_CIFS_EXPERIMENTAL - .dir_notify = cifs_dir_notify, -#endif /* CONFIG_CIFS_EXPERIMENTAL */ .unlocked_ioctl = cifs_ioctl, .llseek = generic_file_llseek, }; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 2ce04c73d74e..7ac481841f87 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -76,7 +76,6 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *); extern const struct file_operations cifs_dir_ops; extern int cifs_dir_open(struct inode *inode, struct file *file); extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); -extern int cifs_dir_notify(struct file *, unsigned long arg); /* Functions related to dir entries */ extern struct dentry_operations cifs_dentry_ops; diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c deleted file mode 100644 index 5a57581eb4b2..000000000000 --- a/fs/cifs/fcntl.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * fs/cifs/fcntl.c - * - * vfs operations that deal with the file control API - * - * Copyright (C) International Business Machines Corp., 2003,2004 - * Author(s): Steve French (sfrench@us.ibm.com) - * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -#include -#include -#include -#include "cifsglob.h" -#include "cifsproto.h" -#include "cifs_unicode.h" -#include "cifs_debug.h" -#include "cifsfs.h" - -static __u32 convert_to_cifs_notify_flags(unsigned long fcntl_notify_flags) -{ - __u32 cifs_ntfy_flags = 0; - - /* No way on Linux VFS to ask to monitor xattr - changes (and no stream support either */ - if (fcntl_notify_flags & DN_ACCESS) - cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_ACCESS; - if (fcntl_notify_flags & DN_MODIFY) { - /* What does this mean on directories? */ - cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE | - FILE_NOTIFY_CHANGE_SIZE; - } - if (fcntl_notify_flags & DN_CREATE) { - cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_CREATION | - FILE_NOTIFY_CHANGE_LAST_WRITE; - } - if (fcntl_notify_flags & DN_DELETE) - cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE; - if (fcntl_notify_flags & DN_RENAME) { - /* BB review this - checking various server behaviors */ - cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_DIR_NAME | - FILE_NOTIFY_CHANGE_FILE_NAME; - } - if (fcntl_notify_flags & DN_ATTRIB) { - cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_SECURITY | - FILE_NOTIFY_CHANGE_ATTRIBUTES; - } -/* if (fcntl_notify_flags & DN_MULTISHOT) { - cifs_ntfy_flags |= ; - } */ /* BB fixme - not sure how to handle this with CIFS yet */ - - return cifs_ntfy_flags; -} - -int cifs_dir_notify(struct file *file, unsigned long arg) -{ - int xid; - int rc = -EINVAL; - int oplock = 0; - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *pTcon; - char *full_path = NULL; - __u32 filter = FILE_NOTIFY_CHANGE_NAME | FILE_NOTIFY_CHANGE_ATTRIBUTES; - __u16 netfid; - - if (experimEnabled == 0) - return 0; - - xid = GetXid(); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - pTcon = cifs_sb->tcon; - - full_path = build_path_from_dentry(file->f_path.dentry); - - if (full_path == NULL) { - rc = -ENOMEM; - } else { - cFYI(1, ("dir notify on file %s Arg 0x%lx", full_path, arg)); - rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, - GENERIC_READ | SYNCHRONIZE, 0 /* create options */, - &netfid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - /* BB fixme - add this handle to a notify handle list */ - if (rc) { - cFYI(1, ("Could not open directory for notify")); - } else { - filter = convert_to_cifs_notify_flags(arg); - if (filter != 0) { - rc = CIFSSMBNotify(xid, pTcon, - 0 /* no subdirs */, netfid, - filter, file, arg & DN_MULTISHOT, - cifs_sb->local_nls); - } else { - rc = -EINVAL; - } - /* BB add code to close file eventually (at unmount - it would close automatically but may be a way - to do it easily when inode freed or when - notify info is cleared/changed */ - cFYI(1, ("notify rc %d", rc)); - } - } - - FreeXid(xid); - return rc; -} diff --git a/fs/dnotify.c b/fs/dnotify.c index 676073b8dda5..b0aa2cde80bd 100644 --- a/fs/dnotify.c +++ b/fs/dnotify.c @@ -115,9 +115,6 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) dn->dn_next = inode->i_dnotify; inode->i_dnotify = dn; spin_unlock(&inode->i_lock); - - if (filp->f_op && filp->f_op->dir_notify) - return filp->f_op->dir_notify(filp, arg); return 0; out_free: diff --git a/include/linux/fs.h b/include/linux/fs.h index fd615986a41c..be16ce01fb1b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1309,7 +1309,6 @@ struct file_operations { ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); - int (*dir_notify)(struct file *filp, unsigned long arg); int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); From c2acf7b90821785fe812cc0aa05148e5a1f84204 Mon Sep 17 00:00:00 2001 From: Denis ChengRq Date: Mon, 1 Dec 2008 14:34:56 -0800 Subject: [PATCH 27/34] fs/block_dev.c: __read_mostly improvement and sb_is_blkdev_sb utilization - iget5_locked in bdget really needs blockdev_superblock, instead of bd_mnt, so bd_mnt could be just a local variable; - blockdev_superblock really needs __read_mostly, while local var bd_mnt not; - make use of sb_is_blkdev_sb in bd_forget, instead of direct reference to blockdev_superblock. Signed-off-by: Denis ChengRq Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/block_dev.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 99e0ae1a4c78..349a26c10001 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -326,12 +326,13 @@ static struct file_system_type bd_type = { .kill_sb = kill_anon_super, }; -static struct vfsmount *bd_mnt __read_mostly; -struct super_block *blockdev_superblock; +struct super_block *blockdev_superblock __read_mostly; void __init bdev_cache_init(void) { int err; + struct vfsmount *bd_mnt; + bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_PANIC), @@ -373,7 +374,7 @@ struct block_device *bdget(dev_t dev) struct block_device *bdev; struct inode *inode; - inode = iget5_locked(bd_mnt->mnt_sb, hash(dev), + inode = iget5_locked(blockdev_superblock, hash(dev), bdev_test, bdev_set, &dev); if (!inode) @@ -463,7 +464,7 @@ void bd_forget(struct inode *inode) spin_lock(&bdev_lock); if (inode->i_bdev) { - if (inode->i_sb != blockdev_superblock) + if (!sb_is_blkdev_sb(inode->i_sb)) bdev = inode->i_bdev; __bd_forget(inode); } From 272eb01485dda98e3b8910c7c1a53d597616b0a0 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 17 Dec 2008 13:59:41 -0500 Subject: [PATCH 28/34] filesystem notification: create fs/notify to contain all fs notification Creating a generic filesystem notification interface, fsnotify, which will be used by inotify, dnotify, and eventually fanotify is really starting to clutter the fs directory. This patch simply moves inotify and dnotify into fs/notify/inotify and fs/notify/dnotify respectively to make both current fs/ and future notification tidier. Signed-off-by: Eric Paris Signed-off-by: Al Viro --- fs/Kconfig | 39 +------------------------- fs/Makefile | 5 +--- fs/notify/Kconfig | 2 ++ fs/notify/Makefile | 2 ++ fs/notify/dnotify/Kconfig | 10 +++++++ fs/notify/dnotify/Makefile | 1 + fs/{ => notify/dnotify}/dnotify.c | 0 fs/notify/inotify/Kconfig | 27 ++++++++++++++++++ fs/notify/inotify/Makefile | 2 ++ fs/{ => notify/inotify}/inotify.c | 0 fs/{ => notify/inotify}/inotify_user.c | 0 11 files changed, 46 insertions(+), 42 deletions(-) create mode 100644 fs/notify/Kconfig create mode 100644 fs/notify/Makefile create mode 100644 fs/notify/dnotify/Kconfig create mode 100644 fs/notify/dnotify/Makefile rename fs/{ => notify/dnotify}/dnotify.c (100%) create mode 100644 fs/notify/inotify/Kconfig create mode 100644 fs/notify/inotify/Makefile rename fs/{ => notify/inotify}/inotify.c (100%) rename fs/{ => notify/inotify}/inotify_user.c (100%) diff --git a/fs/Kconfig b/fs/Kconfig index 522469a7eca3..ff0e81980207 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -270,44 +270,7 @@ config OCFS2_COMPAT_JBD endif # BLOCK -config DNOTIFY - bool "Dnotify support" - default y - help - Dnotify is a directory-based per-fd file change notification system - that uses signals to communicate events to user-space. There exist - superior alternatives, but some applications may still rely on - dnotify. - - If unsure, say Y. - -config INOTIFY - bool "Inotify file change notification support" - default y - ---help--- - Say Y here to enable inotify support. Inotify is a file change - notification system and a replacement for dnotify. Inotify fixes - numerous shortcomings in dnotify and introduces several new features - including multiple file events, one-shot support, and unmount - notification. - - For more information, see - - If unsure, say Y. - -config INOTIFY_USER - bool "Inotify support for userspace" - depends on INOTIFY - default y - ---help--- - Say Y here to enable inotify support for userspace, including the - associated system calls. Inotify allows monitoring of both files and - directories via a single open fd. Events are read from the file - descriptor, which is also select()- and poll()-able. - - For more information, see - - If unsure, say Y. +source "fs/notify/Kconfig" config QUOTA bool "Quota support" diff --git a/fs/Makefile b/fs/Makefile index d9f8afe6f0c4..e6f423d1d228 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -20,8 +20,7 @@ obj-y += no-block.o endif obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o -obj-$(CONFIG_INOTIFY) += inotify.o -obj-$(CONFIG_INOTIFY_USER) += inotify_user.o +obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o obj-$(CONFIG_ANON_INODES) += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o @@ -57,8 +56,6 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o obj-$(CONFIG_QUOTACTL) += quota.o -obj-$(CONFIG_DNOTIFY) += dnotify.o - obj-$(CONFIG_PROC_FS) += proc/ obj-y += partitions/ obj-$(CONFIG_SYSFS) += sysfs/ diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig new file mode 100644 index 000000000000..50914d7303c6 --- /dev/null +++ b/fs/notify/Kconfig @@ -0,0 +1,2 @@ +source "fs/notify/dnotify/Kconfig" +source "fs/notify/inotify/Kconfig" diff --git a/fs/notify/Makefile b/fs/notify/Makefile new file mode 100644 index 000000000000..5a95b6010ce7 --- /dev/null +++ b/fs/notify/Makefile @@ -0,0 +1,2 @@ +obj-y += dnotify/ +obj-y += inotify/ diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig new file mode 100644 index 000000000000..26adf5dfa646 --- /dev/null +++ b/fs/notify/dnotify/Kconfig @@ -0,0 +1,10 @@ +config DNOTIFY + bool "Dnotify support" + default y + help + Dnotify is a directory-based per-fd file change notification system + that uses signals to communicate events to user-space. There exist + superior alternatives, but some applications may still rely on + dnotify. + + If unsure, say Y. diff --git a/fs/notify/dnotify/Makefile b/fs/notify/dnotify/Makefile new file mode 100644 index 000000000000..f145251dcadb --- /dev/null +++ b/fs/notify/dnotify/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_DNOTIFY) += dnotify.o diff --git a/fs/dnotify.c b/fs/notify/dnotify/dnotify.c similarity index 100% rename from fs/dnotify.c rename to fs/notify/dnotify/dnotify.c diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig new file mode 100644 index 000000000000..446792841023 --- /dev/null +++ b/fs/notify/inotify/Kconfig @@ -0,0 +1,27 @@ +config INOTIFY + bool "Inotify file change notification support" + default y + ---help--- + Say Y here to enable inotify support. Inotify is a file change + notification system and a replacement for dnotify. Inotify fixes + numerous shortcomings in dnotify and introduces several new features + including multiple file events, one-shot support, and unmount + notification. + + For more information, see + + If unsure, say Y. + +config INOTIFY_USER + bool "Inotify support for userspace" + depends on INOTIFY + default y + ---help--- + Say Y here to enable inotify support for userspace, including the + associated system calls. Inotify allows monitoring of both files and + directories via a single open fd. Events are read from the file + descriptor, which is also select()- and poll()-able. + + For more information, see + + If unsure, say Y. diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile new file mode 100644 index 000000000000..e290f3bb9d8d --- /dev/null +++ b/fs/notify/inotify/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_INOTIFY) += inotify.o +obj-$(CONFIG_INOTIFY_USER) += inotify_user.o diff --git a/fs/inotify.c b/fs/notify/inotify/inotify.c similarity index 100% rename from fs/inotify.c rename to fs/notify/inotify/inotify.c diff --git a/fs/inotify_user.c b/fs/notify/inotify/inotify_user.c similarity index 100% rename from fs/inotify_user.c rename to fs/notify/inotify/inotify_user.c From 261bca86ed4f7f391d1938167624e78da61dcc6b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 30 Dec 2008 01:48:21 -0500 Subject: [PATCH 29/34] nfsd/create race fixes, infrastructure new helpers - insert_inode_locked() and insert_inode_locked4(). Hash new inode, making sure that there's no such inode in icache already. If there is and it does not end up unhashed (as would happen if we have nfsd trying to resolve a bogus fhandle), fail. Otherwise insert our inode into hash and succeed. In either case have i_state set to new+locked; cleanup ends up being simpler with such calling conventions. Signed-off-by: Al Viro --- fs/inode.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 2 ++ 2 files changed, 61 insertions(+) diff --git a/fs/inode.c b/fs/inode.c index 098a2443196f..7de1cda92489 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1032,6 +1032,65 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) EXPORT_SYMBOL(iget_locked); +int insert_inode_locked(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + ino_t ino = inode->i_ino; + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *old; + + inode->i_state |= I_LOCK|I_NEW; + while (1) { + spin_lock(&inode_lock); + old = find_inode_fast(sb, head, ino); + if (likely(!old)) { + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode_lock); + return 0; + } + __iget(old); + spin_unlock(&inode_lock); + wait_on_inode(old); + if (unlikely(!hlist_unhashed(&old->i_hash))) { + iput(old); + return -EBUSY; + } + iput(old); + } +} + +EXPORT_SYMBOL(insert_inode_locked); + +int insert_inode_locked4(struct inode *inode, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct super_block *sb = inode->i_sb; + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *old; + + inode->i_state |= I_LOCK|I_NEW; + + while (1) { + spin_lock(&inode_lock); + old = find_inode(sb, head, test, data); + if (likely(!old)) { + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode_lock); + return 0; + } + __iget(old); + spin_unlock(&inode_lock); + wait_on_inode(old); + if (unlikely(!hlist_unhashed(&old->i_hash))) { + iput(old); + return -EBUSY; + } + iput(old); + } +} + +EXPORT_SYMBOL(insert_inode_locked4); + /** * __insert_inode_hash - hash an inode * @inode: unhashed inode diff --git a/include/linux/fs.h b/include/linux/fs.h index be16ce01fb1b..e2170ee21e18 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1902,6 +1902,8 @@ extern struct inode *ilookup(struct super_block *sb, unsigned long ino); extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); extern struct inode * iget_locked(struct super_block *, unsigned long); +extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); +extern int insert_inode_locked(struct inode *); extern void unlock_new_inode(struct inode *); extern void __iget(struct inode * inode); From 41080b5a240113328c607f22b849f653373db0ce Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 30 Dec 2008 01:52:35 -0500 Subject: [PATCH 30/34] nfsd race fixes: ext2 * make ext2_new_inode() put the inode into icache in locked state * do not unlock until the inode is fully set up; otherwise nfsd might pick it in half-baked state. * make sure that ext2_new_inode() does *not* lead to two inodes with the same inumber hashed at the same time; otherwise a bogus fhandle coming from nfsd might race with inode creation: nfsd: iget_locked() creates inode nfsd: try to read from disk, block on that. ext2_new_inode(): allocate inode with that inumber ext2_new_inode(): insert it into icache, set it up and dirty ext2_write_inode(): get the relevant part of inode table in cache, set the entry for our inode (and start writing to disk) nfsd: get CPU again, look into inode table, see nice and sane on-disk inode, set the in-core inode from it oops - we have two in-core inodes with the same inumber live in icache, both used for IO. Welcome to fs corruption... Signed-off-by: Al Viro --- fs/ext2/ialloc.c | 6 +++++- fs/ext2/namei.c | 15 ++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 8d0add625870..c454d5db28a5 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -585,7 +585,10 @@ got: spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); - insert_inode_hash(inode); + if (insert_inode_locked(inode) < 0) { + err = -EINVAL; + goto fail_drop; + } if (DQUOT_ALLOC_INODE(inode)) { err = -EDQUOT; @@ -612,6 +615,7 @@ fail_drop: DQUOT_DROP(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; + unlock_new_inode(inode); iput(inode); return ERR_PTR(err); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 2a747252ec12..90ea17998a73 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -41,9 +41,11 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) int err = ext2_add_link(dentry, inode); if (!err) { d_instantiate(dentry, inode); + unlock_new_inode(inode); return 0; } inode_dec_link_count(inode); + unlock_new_inode(inode); iput(inode); return err; } @@ -170,6 +172,7 @@ out: out_fail: inode_dec_link_count(inode); + unlock_new_inode(inode); iput (inode); goto out; } @@ -178,6 +181,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { struct inode *inode = old_dentry->d_inode; + int err; if (inode->i_nlink >= EXT2_LINK_MAX) return -EMLINK; @@ -186,7 +190,14 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, inode_inc_link_count(inode); atomic_inc(&inode->i_count); - return ext2_add_nondir(dentry, inode); + err = ext2_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; } static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) @@ -222,12 +233,14 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) goto out_fail; d_instantiate(dentry, inode); + unlock_new_inode(inode); out: return err; out_fail: inode_dec_link_count(inode); inode_dec_link_count(inode); + unlock_new_inode(inode); iput(inode); out_dir: inode_dec_link_count(dir); From c38012daa7ad902a39a4213ba2b3fe50e81157ea Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 30 Dec 2008 02:02:50 -0500 Subject: [PATCH 31/34] nfsd race fixes: ext3 ext3 analog of the previous patch Signed-off-by: Al Viro --- fs/ext3/ialloc.c | 6 +++++- fs/ext3/namei.c | 15 ++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 490bd0ed7896..5655fbcbd11f 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -579,7 +579,10 @@ got: ext3_set_inode_flags(inode); if (IS_DIRSYNC(inode)) handle->h_sync = 1; - insert_inode_hash(inode); + if (insert_inode_locked(inode) < 0) { + err = -EINVAL; + goto fail_drop; + } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); @@ -627,6 +630,7 @@ fail_drop: DQUOT_DROP(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; + unlock_new_inode(inode); iput(inode); brelse(bitmap_bh); return ERR_PTR(err); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 3e5edc92aa0b..297ea8dfac7c 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1652,9 +1652,11 @@ static int ext3_add_nondir(handle_t *handle, if (!err) { ext3_mark_inode_dirty(handle, inode); d_instantiate(dentry, inode); + unlock_new_inode(inode); return 0; } drop_nlink(inode); + unlock_new_inode(inode); iput(inode); return err; } @@ -1765,6 +1767,7 @@ retry: dir_block = ext3_bread (handle, inode, 0, 1, &err); if (!dir_block) { drop_nlink(inode); /* is this nlink == 0? */ + unlock_new_inode(inode); ext3_mark_inode_dirty(handle, inode); iput (inode); goto out_stop; @@ -1792,6 +1795,7 @@ retry: err = ext3_add_entry (handle, dentry, inode); if (err) { inode->i_nlink = 0; + unlock_new_inode(inode); ext3_mark_inode_dirty(handle, inode); iput (inode); goto out_stop; @@ -1800,6 +1804,7 @@ retry: ext3_update_dx_flag(dir); ext3_mark_inode_dirty(handle, dir); d_instantiate(dentry, inode); + unlock_new_inode(inode); out_stop: ext3_journal_stop(handle); if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) @@ -2174,6 +2179,7 @@ retry: mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); if (err) { drop_nlink(inode); + unlock_new_inode(inode); ext3_mark_inode_dirty(handle, inode); iput (inode); goto out_stop; @@ -2221,7 +2227,14 @@ retry: inc_nlink(inode); atomic_inc(&inode->i_count); - err = ext3_add_nondir(handle, dentry, inode); + err = ext3_add_entry(handle, dentry, inode); + if (!err) { + ext3_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); + } else { + drop_nlink(inode); + iput(inode); + } ext3_journal_stop(handle); if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; From 6b38e842bb832a3dbeb17e382404aef3c40ac5f9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 30 Dec 2008 02:03:31 -0500 Subject: [PATCH 32/34] nfsd race fixes: ext4 Signed-off-by: Al Viro --- fs/ext4/ialloc.c | 6 +++++- fs/ext4/namei.c | 14 +++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 08cac9fcace2..6e6052879aa2 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -826,7 +826,10 @@ got: ext4_set_inode_flags(inode); if (IS_DIRSYNC(inode)) handle->h_sync = 1; - insert_inode_hash(inode); + if (insert_inode_locked(inode) < 0) { + err = -EINVAL; + goto fail_drop; + } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); @@ -881,6 +884,7 @@ fail_drop: DQUOT_DROP(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; + unlock_new_inode(inode); iput(inode); brelse(bitmap_bh); return ERR_PTR(err); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 63adcb792988..da98a9012fa5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1693,9 +1693,11 @@ static int ext4_add_nondir(handle_t *handle, if (!err) { ext4_mark_inode_dirty(handle, inode); d_instantiate(dentry, inode); + unlock_new_inode(inode); return 0; } drop_nlink(inode); + unlock_new_inode(inode); iput(inode); return err; } @@ -1830,6 +1832,7 @@ retry: if (err) { out_clear_inode: clear_nlink(inode); + unlock_new_inode(inode); ext4_mark_inode_dirty(handle, inode); iput(inode); goto out_stop; @@ -1838,6 +1841,7 @@ out_clear_inode: ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); d_instantiate(dentry, inode); + unlock_new_inode(inode); out_stop: ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) @@ -2212,6 +2216,7 @@ retry: mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); if (err) { clear_nlink(inode); + unlock_new_inode(inode); ext4_mark_inode_dirty(handle, inode); iput(inode); goto out_stop; @@ -2262,7 +2267,14 @@ retry: ext4_inc_count(handle, inode); atomic_inc(&inode->i_count); - err = ext4_add_nondir(handle, dentry, inode); + err = ext4_add_entry(handle, dentry, inode); + if (!err) { + ext4_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); + } else { + drop_nlink(inode); + iput(inode); + } ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; From c1eaa26b671299b3ec01d40c6c71ee19a4f81517 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 30 Dec 2008 02:03:58 -0500 Subject: [PATCH 33/34] nfsd race fixes: reiserfs ... and the same for reiserfs. The difference here is that we need insert_inode_locked4() to match iget5_locked(). Signed-off-by: Al Viro --- fs/reiserfs/inode.c | 15 ++++++++++----- fs/reiserfs/namei.c | 8 ++++++++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 6c4c2c69449f..145c2d3e5e01 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1753,6 +1753,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, struct inode *inode) { struct super_block *sb; + struct reiserfs_iget_args args; INITIALIZE_PATH(path_to_key); struct cpu_key key; struct item_head ih; @@ -1780,6 +1781,14 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, err = -ENOMEM; goto out_bad_inode; } + args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); + memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); + args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); + if (insert_inode_locked4(inode, args.objectid, + reiserfs_find_actor, &args) < 0) { + err = -EINVAL; + goto out_bad_inode; + } if (old_format_only(sb)) /* not a perfect generation count, as object ids can be reused, but ** this is as good as reiserfs can do right now. @@ -1859,13 +1868,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, } else { inode2sd(&sd, inode, inode->i_size); } - // these do not go to on-disk stat data - inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); - // store in in-core inode the key of stat data and version all // object items will have (directory items will have old offset // format, other new objects will consist of new items) - memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode)) set_inode_item_key_version(inode, KEY_FORMAT_3_5); else @@ -1929,7 +1934,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, reiserfs_mark_inode_private(inode); } - insert_inode_hash(inode); reiserfs_update_sd(th, inode); reiserfs_check_path(&path_to_key); @@ -1956,6 +1960,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, out_inserted_sd: inode->i_nlink = 0; th->t_trans_id = 0; /* so the caller can't use this handle later */ + unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ /* If we were inheriting an ACL, we need to release the lock so that * iput doesn't deadlock in reiserfs_delete_xattrs. The locking diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 4f322e5ed840..738967f6c8ee 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -646,6 +646,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, err = journal_end(&th, dir->i_sb, jbegin_count); if (err) retval = err; + unlock_new_inode(inode); iput(inode); goto out_failed; } @@ -653,6 +654,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, reiserfs_update_inode_transaction(dir); d_instantiate(dentry, inode); + unlock_new_inode(inode); retval = journal_end(&th, dir->i_sb, jbegin_count); out_failed: @@ -727,11 +729,13 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, err = journal_end(&th, dir->i_sb, jbegin_count); if (err) retval = err; + unlock_new_inode(inode); iput(inode); goto out_failed; } d_instantiate(dentry, inode); + unlock_new_inode(inode); retval = journal_end(&th, dir->i_sb, jbegin_count); out_failed: @@ -812,6 +816,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) err = journal_end(&th, dir->i_sb, jbegin_count); if (err) retval = err; + unlock_new_inode(inode); iput(inode); goto out_failed; } @@ -819,6 +824,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) reiserfs_update_sd(&th, dir); d_instantiate(dentry, inode); + unlock_new_inode(inode); retval = journal_end(&th, dir->i_sb, jbegin_count); out_failed: if (locked) @@ -1096,11 +1102,13 @@ static int reiserfs_symlink(struct inode *parent_dir, err = journal_end(&th, parent_dir->i_sb, jbegin_count); if (err) retval = err; + unlock_new_inode(inode); iput(inode); goto out_failed; } d_instantiate(dentry, inode); + unlock_new_inode(inode); retval = journal_end(&th, parent_dir->i_sb, jbegin_count); out_failed: reiserfs_write_unlock(parent_dir->i_sb); From 1f3403fa640f9f7b135dee79f2d39d01c8ad4a08 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Tue, 30 Dec 2008 22:08:37 -0600 Subject: [PATCH 34/34] nfsd race fixes: jfs jfs version of Al Viro's nfsd race patches Signed-off-by: Dave Kleikamp Signed-off-by: Al Viro --- fs/jfs/jfs_inode.c | 29 +++++++++++++++++++++-------- fs/jfs/namei.c | 24 ++++++++++++++++-------- 2 files changed, 37 insertions(+), 16 deletions(-) diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 70022fd1c539..d4d142c2edd4 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -79,7 +79,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode) inode = new_inode(sb); if (!inode) { jfs_warn("ialloc: new_inode returned NULL!"); - return ERR_PTR(-ENOMEM); + rc = -ENOMEM; + goto fail; } jfs_inode = JFS_IP(inode); @@ -89,8 +90,12 @@ struct inode *ialloc(struct inode *parent, umode_t mode) jfs_warn("ialloc: diAlloc returned %d!", rc); if (rc == -EIO) make_bad_inode(inode); - iput(inode); - return ERR_PTR(rc); + goto fail_put; + } + + if (insert_inode_locked(inode) < 0) { + rc = -EINVAL; + goto fail_unlock; } inode->i_uid = current_fsuid(); @@ -112,11 +117,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode) * Allocate inode to quota. */ if (DQUOT_ALLOC_INODE(inode)) { - DQUOT_DROP(inode); - inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; - iput(inode); - return ERR_PTR(-EDQUOT); + rc = -EDQUOT; + goto fail_drop; } inode->i_mode = mode; @@ -158,4 +160,15 @@ struct inode *ialloc(struct inode *parent, umode_t mode) jfs_info("ialloc returns inode = 0x%p\n", inode); return inode; + +fail_drop: + DQUOT_DROP(inode); + inode->i_flags |= S_NOQUOTA; +fail_unlock: + inode->i_nlink = 0; + unlock_new_inode(inode); +fail_put: + iput(inode); +fail: + return ERR_PTR(rc); } diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index cc3cedffbfa1..b4de56b851e4 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -155,7 +155,6 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, ip->i_fop = &jfs_file_operations; ip->i_mapping->a_ops = &jfs_aops; - insert_inode_hash(ip); mark_inode_dirty(ip); dip->i_ctime = dip->i_mtime = CURRENT_TIME; @@ -171,9 +170,12 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, if (rc) { free_ea_wmap(ip); ip->i_nlink = 0; + unlock_new_inode(ip); iput(ip); - } else + } else { d_instantiate(dentry, ip); + unlock_new_inode(ip); + } out2: free_UCSname(&dname); @@ -289,7 +291,6 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) ip->i_op = &jfs_dir_inode_operations; ip->i_fop = &jfs_dir_operations; - insert_inode_hash(ip); mark_inode_dirty(ip); /* update parent directory inode */ @@ -306,9 +307,12 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) if (rc) { free_ea_wmap(ip); ip->i_nlink = 0; + unlock_new_inode(ip); iput(ip); - } else + } else { d_instantiate(dentry, ip); + unlock_new_inode(ip); + } out2: free_UCSname(&dname); @@ -1019,7 +1023,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, goto out3; } - insert_inode_hash(ip); mark_inode_dirty(ip); dip->i_ctime = dip->i_mtime = CURRENT_TIME; @@ -1039,9 +1042,12 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, if (rc) { free_ea_wmap(ip); ip->i_nlink = 0; + unlock_new_inode(ip); iput(ip); - } else + } else { d_instantiate(dentry, ip); + unlock_new_inode(ip); + } out2: free_UCSname(&dname); @@ -1399,7 +1405,6 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, jfs_ip->dev = new_encode_dev(rdev); init_special_inode(ip, ip->i_mode, rdev); - insert_inode_hash(ip); mark_inode_dirty(ip); dir->i_ctime = dir->i_mtime = CURRENT_TIME; @@ -1417,9 +1422,12 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, if (rc) { free_ea_wmap(ip); ip->i_nlink = 0; + unlock_new_inode(ip); iput(ip); - } else + } else { d_instantiate(dentry, ip); + unlock_new_inode(ip); + } out1: free_UCSname(&dname);