diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index e667dbcd20e8..a91acd03ee12 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -630,7 +630,7 @@ static void cachefiles_prepare_write_subreq(struct netfs_io_subrequest *subreq) _enter("W=%x[%x] %llx", wreq->debug_id, subreq->debug_index, subreq->start); - subreq->max_len = ULONG_MAX; + subreq->max_len = MAX_RW_COUNT; subreq->max_nr_segs = BIO_MAX_VECS; if (!cachefiles_cres_file(cres)) { diff --git a/fs/inode.c b/fs/inode.c index f356fe2ec2b6..470b57ef1cf5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -676,6 +676,16 @@ static void evict(struct inode *inode) remove_inode_hash(inode); + /* + * Wake up waiters in __wait_on_freeing_inode(). + * + * Lockless hash lookup may end up finding the inode before we removed + * it above, but only lock it *after* we are done with the wakeup below. + * In this case the potential waiter cannot safely block. + * + * The inode being unhashed after the call to remove_inode_hash() is + * used as an indicator whether blocking on it is safe. + */ spin_lock(&inode->i_lock); wake_up_bit(&inode->i_state, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); @@ -888,18 +898,18 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) return freed; } -static void __wait_on_freeing_inode(struct inode *inode, bool locked); +static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked); /* * Called with the inode lock held. */ static struct inode *find_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), - void *data, bool locked) + void *data, bool is_inode_hash_locked) { struct inode *inode = NULL; - if (locked) + if (is_inode_hash_locked) lockdep_assert_held(&inode_hash_lock); else lockdep_assert_not_held(&inode_hash_lock); @@ -913,7 +923,7 @@ repeat: continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { - __wait_on_freeing_inode(inode, locked); + __wait_on_freeing_inode(inode, is_inode_hash_locked); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { @@ -936,11 +946,11 @@ repeat: */ static struct inode *find_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino, - bool locked) + bool is_inode_hash_locked) { struct inode *inode = NULL; - if (locked) + if (is_inode_hash_locked) lockdep_assert_held(&inode_hash_lock); else lockdep_assert_not_held(&inode_hash_lock); @@ -954,7 +964,7 @@ repeat: continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { - __wait_on_freeing_inode(inode, locked); + __wait_on_freeing_inode(inode, is_inode_hash_locked); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { @@ -2287,19 +2297,29 @@ EXPORT_SYMBOL(inode_needs_sync); * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list * will DTRT. */ -static void __wait_on_freeing_inode(struct inode *inode, bool locked) +static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked) { wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + + /* + * Handle racing against evict(), see that routine for more details. + */ + if (unlikely(inode_unhashed(inode))) { + WARN_ON(is_inode_hash_locked); + spin_unlock(&inode->i_lock); + return; + } + wq = bit_waitqueue(&inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); rcu_read_unlock(); - if (locked) + if (is_inode_hash_locked) spin_unlock(&inode_hash_lock); schedule(); finish_wait(wq, &wait.wq_entry); - if (locked) + if (is_inode_hash_locked) spin_lock(&inode_hash_lock); rcu_read_lock(); } diff --git a/fs/locks.c b/fs/locks.c index bdd94c32256f..9afb16e0683f 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2570,8 +2570,9 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, error = do_lock_file_wait(filp, cmd, file_lock); /* - * Attempt to detect a close/fcntl race and recover by releasing the - * lock that was just acquired. There is no need to do that when we're + * Detect close/fcntl races and recover by zapping all POSIX locks + * associated with this file and our files_struct, just like on + * filp_flush(). There is no need to do that when we're * unlocking though, or for OFD locks. */ if (!error && file_lock->c.flc_type != F_UNLCK && @@ -2586,9 +2587,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, f = files_lookup_fd_locked(files, fd); spin_unlock(&files->file_lock); if (f != filp) { - file_lock->c.flc_type = F_UNLCK; - error = do_lock_file_wait(filp, cmd, file_lock); - WARN_ON_ONCE(error); + locks_remove_posix(filp, files); error = -EBADF; } } diff --git a/fs/namei.c b/fs/namei.c index 3a4c40e12f78..5512cb10fa89 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3248,9 +3248,9 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, /** * vfs_create - create new file * @idmap: idmap of the mount the inode was found from - * @dir: inode of @dentry - * @dentry: pointer to dentry of the base directory - * @mode: mode of the new file + * @dir: inode of the parent directory + * @dentry: dentry of the child file + * @mode: mode of the child file * @want_excl: whether the file must not yet exist * * Create a new file. @@ -4047,9 +4047,9 @@ EXPORT_SYMBOL(user_path_create); /** * vfs_mknod - create device node or file * @idmap: idmap of the mount the inode was found from - * @dir: inode of @dentry - * @dentry: pointer to dentry of the base directory - * @mode: mode of the new device node or file + * @dir: inode of the parent directory + * @dentry: dentry of the child device node + * @mode: mode of the child device node * @dev: device number of device to create * * Create a device node or file. @@ -4174,9 +4174,9 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d /** * vfs_mkdir - create directory * @idmap: idmap of the mount the inode was found from - * @dir: inode of @dentry - * @dentry: pointer to dentry of the base directory - * @mode: mode of the new directory + * @dir: inode of the parent directory + * @dentry: dentry of the child directory + * @mode: mode of the child directory * * Create a directory. * @@ -4256,8 +4256,8 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) /** * vfs_rmdir - remove directory * @idmap: idmap of the mount the inode was found from - * @dir: inode of @dentry - * @dentry: pointer to dentry of the base directory + * @dir: inode of the parent directory + * @dentry: dentry of the child directory * * Remove a directory. * @@ -4537,8 +4537,8 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname) /** * vfs_symlink - create symlink * @idmap: idmap of the mount the inode was found from - * @dir: inode of @dentry - * @dentry: pointer to dentry of the base directory + * @dir: inode of the parent directory + * @dentry: dentry of the child symlink file * @oldname: name of the file to link to * * Create a symlink. diff --git a/fs/namespace.c b/fs/namespace.c index 221db9de4729..328087a4df8a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -70,7 +70,7 @@ static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); /* Don't allow confusion with old 32bit mount ID */ -#define MNT_UNIQUE_ID_OFFSET (1ULL << 32) +#define MNT_UNIQUE_ID_OFFSET (1ULL << 31) static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET); static struct hlist_head *mount_hashtable __ro_after_init; diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig index bec805e0c44c..1b78e8b65ebc 100644 --- a/fs/netfs/Kconfig +++ b/fs/netfs/Kconfig @@ -22,6 +22,14 @@ config NETFS_STATS between CPUs. On the other hand, the stats are very useful for debugging purposes. Saying 'Y' here is recommended. +config NETFS_DEBUG + bool "Enable dynamic debugging netfslib and FS-Cache" + depends on NETFS + help + This permits debugging to be dynamically enabled in the local caching + management module. If this is set, the debugging output may be + enabled by setting bits in /sys/module/netfs/parameters/debug. + config FSCACHE bool "General filesystem local caching manager" depends on NETFS_SUPPORT @@ -50,13 +58,3 @@ config FSCACHE_STATS debugging purposes. Saying 'Y' here is recommended. See Documentation/filesystems/caching/fscache.rst for more information. - -config FSCACHE_DEBUG - bool "Debug FS-Cache" - depends on FSCACHE - help - This permits debugging to be dynamically enabled in the local caching - management module. If this is set, the debugging output may be - enabled by setting bits in /sys/modules/fscache/parameter/debug. - - See Documentation/filesystems/caching/fscache.rst for more information. diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index a6d5d07cd436..a688d4c75d99 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -117,7 +117,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { if (folio->index == rreq->no_unlock_folio && test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) - kdebug("no unlock"); + _debug("no unlock"); else folio_unlock(folio); } @@ -204,7 +204,7 @@ void netfs_readahead(struct readahead_control *ractl) struct netfs_inode *ctx = netfs_inode(ractl->mapping->host); int ret; - kenter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); + _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); if (readahead_count(ractl) == 0) return; @@ -268,7 +268,7 @@ int netfs_read_folio(struct file *file, struct folio *folio) struct folio *sink = NULL; int ret; - kenter("%lx", folio->index); + _enter("%lx", folio->index); rreq = netfs_alloc_request(mapping, file, folio_pos(folio), folio_size(folio), @@ -508,7 +508,7 @@ retry: have_folio: *_folio = folio; - kleave(" = 0"); + _leave(" = 0"); return 0; error_put: @@ -518,7 +518,7 @@ error: folio_unlock(folio); folio_put(folio); } - kleave(" = %d", ret); + _leave(" = %d", ret); return ret; } EXPORT_SYMBOL(netfs_write_begin); @@ -536,7 +536,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t flen = folio_size(folio); int ret; - kenter("%zx @%llx", flen, start); + _enter("%zx @%llx", flen, start); ret = -ENOMEM; @@ -567,7 +567,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, error_put: netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); error: - kleave(" = %d", ret); + _leave(" = %d", ret); return ret; } diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 68a3f1383cee..4726c315453c 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -56,7 +56,7 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx, struct netfs_group *group = netfs_folio_group(folio); loff_t pos = folio_pos(folio); - kenter(""); + _enter(""); if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) return NETFS_FLUSH_CONTENT; @@ -272,12 +272,12 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, */ howto = netfs_how_to_modify(ctx, file, folio, netfs_group, flen, offset, part, maybe_trouble); - kdebug("howto %u", howto); + _debug("howto %u", howto); switch (howto) { case NETFS_JUST_PREFETCH: ret = netfs_prefetch_for_write(file, folio, offset, part); if (ret < 0) { - kdebug("prefetch = %zd", ret); + _debug("prefetch = %zd", ret); goto error_folio_unlock; } break; @@ -418,7 +418,7 @@ out: } iocb->ki_pos += written; - kleave(" = %zd [%zd]", written, ret); + _leave(" = %zd [%zd]", written, ret); return written ? written : ret; error_folio_unlock: @@ -491,7 +491,7 @@ ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct netfs_inode *ictx = netfs_inode(inode); ssize_t ret; - kenter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); if (!iov_iter_count(from)) return 0; @@ -529,7 +529,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr vm_fault_t ret = VM_FAULT_RETRY; int err; - kenter("%lx", folio->index); + _enter("%lx", folio->index); sb_start_pagefault(inode->i_sb); diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index b6debac6205f..10a1e4da6bda 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -33,7 +33,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i size_t orig_count = iov_iter_count(iter); bool async = !is_sync_kiocb(iocb); - kenter(""); + _enter(""); if (!orig_count) return 0; /* Don't update atime */ diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 792ef17bae21..88f2adfab75e 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -37,7 +37,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * size_t len = iov_iter_count(iter); bool async = !is_sync_kiocb(iocb); - kenter(""); + _enter(""); /* We're going to need a bounce buffer if what we transmit is going to * be different in some way to the source buffer, e.g. because it gets @@ -45,7 +45,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * */ // TODO - kdebug("uw %llx-%llx", start, end); + _debug("uw %llx-%llx", start, end); wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start, iocb->ki_flags & IOCB_DIRECT ? @@ -96,7 +96,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * wreq->cleanup = netfs_cleanup_dio_write; ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len); if (ret < 0) { - kdebug("begin = %zd", ret); + _debug("begin = %zd", ret); goto out; } @@ -143,7 +143,7 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) loff_t pos = iocb->ki_pos; unsigned long long end = pos + iov_iter_count(from) - 1; - kenter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode)); + _enter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode)); if (!iov_iter_count(from)) return 0; diff --git a/fs/netfs/fscache_cache.c b/fs/netfs/fscache_cache.c index 288a73c3072d..9397ed39b0b4 100644 --- a/fs/netfs/fscache_cache.c +++ b/fs/netfs/fscache_cache.c @@ -237,7 +237,7 @@ int fscache_add_cache(struct fscache_cache *cache, { int n_accesses; - kenter("{%s,%s}", ops->name, cache->name); + _enter("{%s,%s}", ops->name, cache->name); BUG_ON(fscache_cache_state(cache) != FSCACHE_CACHE_IS_PREPARING); @@ -257,7 +257,7 @@ int fscache_add_cache(struct fscache_cache *cache, up_write(&fscache_addremove_sem); pr_notice("Cache \"%s\" added (type %s)\n", cache->name, ops->name); - kleave(" = 0 [%s]", cache->name); + _leave(" = 0 [%s]", cache->name); return 0; } EXPORT_SYMBOL(fscache_add_cache); diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c index 4d1e8bf4c615..bce2492186d0 100644 --- a/fs/netfs/fscache_cookie.c +++ b/fs/netfs/fscache_cookie.c @@ -456,7 +456,7 @@ struct fscache_cookie *__fscache_acquire_cookie( { struct fscache_cookie *cookie; - kenter("V=%x", volume->debug_id); + _enter("V=%x", volume->debug_id); if (!index_key || !index_key_len || index_key_len > 255 || aux_data_len > 255) return NULL; @@ -484,7 +484,7 @@ struct fscache_cookie *__fscache_acquire_cookie( trace_fscache_acquire(cookie); fscache_stat(&fscache_n_acquires_ok); - kleave(" = c=%08x", cookie->debug_id); + _leave(" = c=%08x", cookie->debug_id); return cookie; } EXPORT_SYMBOL(__fscache_acquire_cookie); @@ -505,7 +505,7 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie) enum fscache_access_trace trace = fscache_access_lookup_cookie_end_failed; bool need_withdraw = false; - kenter(""); + _enter(""); if (!cookie->volume->cache_priv) { fscache_create_volume(cookie->volume, true); @@ -519,7 +519,7 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie) if (cookie->state != FSCACHE_COOKIE_STATE_FAILED) fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT); need_withdraw = true; - kleave(" [fail]"); + _leave(" [fail]"); goto out; } @@ -572,7 +572,7 @@ void __fscache_use_cookie(struct fscache_cookie *cookie, bool will_modify) bool queue = false; int n_active; - kenter("c=%08x", cookie->debug_id); + _enter("c=%08x", cookie->debug_id); if (WARN(test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags), "Trying to use relinquished cookie\n")) @@ -636,7 +636,7 @@ again: spin_unlock(&cookie->lock); if (queue) fscache_queue_cookie(cookie, fscache_cookie_get_use_work); - kleave(""); + _leave(""); } EXPORT_SYMBOL(__fscache_use_cookie); @@ -702,7 +702,7 @@ static void fscache_cookie_state_machine(struct fscache_cookie *cookie) enum fscache_cookie_state state; bool wake = false; - kenter("c=%x", cookie->debug_id); + _enter("c=%x", cookie->debug_id); again: spin_lock(&cookie->lock); @@ -820,7 +820,7 @@ out: spin_unlock(&cookie->lock); if (wake) wake_up_cookie_state(cookie); - kleave(""); + _leave(""); } static void fscache_cookie_worker(struct work_struct *work) @@ -867,7 +867,7 @@ static void fscache_cookie_lru_do_one(struct fscache_cookie *cookie) set_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags); spin_unlock(&cookie->lock); fscache_stat(&fscache_n_cookies_lru_expired); - kdebug("lru c=%x", cookie->debug_id); + _debug("lru c=%x", cookie->debug_id); __fscache_withdraw_cookie(cookie); } @@ -971,7 +971,7 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) if (retire) fscache_stat(&fscache_n_relinquishes_retire); - kenter("c=%08x{%d},%d", + _enter("c=%08x{%d},%d", cookie->debug_id, atomic_read(&cookie->n_active), retire); if (WARN(test_and_set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags), @@ -1050,7 +1050,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, { bool is_caching; - kenter("c=%x", cookie->debug_id); + _enter("c=%x", cookie->debug_id); fscache_stat(&fscache_n_invalidates); @@ -1072,7 +1072,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, case FSCACHE_COOKIE_STATE_INVALIDATING: /* is_still_valid will catch it */ default: spin_unlock(&cookie->lock); - kleave(" [no %u]", cookie->state); + _leave(" [no %u]", cookie->state); return; case FSCACHE_COOKIE_STATE_LOOKING_UP: @@ -1081,7 +1081,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, fallthrough; case FSCACHE_COOKIE_STATE_CREATING: spin_unlock(&cookie->lock); - kleave(" [look %x]", cookie->inval_counter); + _leave(" [look %x]", cookie->inval_counter); return; case FSCACHE_COOKIE_STATE_ACTIVE: @@ -1094,7 +1094,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, if (is_caching) fscache_queue_cookie(cookie, fscache_cookie_get_inval_work); - kleave(" [inv]"); + _leave(" [inv]"); return; } } diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c index bf4eaeec44fb..38637e5c9b57 100644 --- a/fs/netfs/fscache_io.c +++ b/fs/netfs/fscache_io.c @@ -28,12 +28,12 @@ bool fscache_wait_for_operation(struct netfs_cache_resources *cres, again: if (!fscache_cache_is_live(cookie->volume->cache)) { - kleave(" [broken]"); + _leave(" [broken]"); return false; } state = fscache_cookie_state(cookie); - kenter("c=%08x{%u},%x", cookie->debug_id, state, want_state); + _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state); switch (state) { case FSCACHE_COOKIE_STATE_CREATING: @@ -52,7 +52,7 @@ again: case FSCACHE_COOKIE_STATE_DROPPED: case FSCACHE_COOKIE_STATE_RELINQUISHING: default: - kleave(" [not live]"); + _leave(" [not live]"); return false; } @@ -92,7 +92,7 @@ again: spin_lock(&cookie->lock); state = fscache_cookie_state(cookie); - kenter("c=%08x{%u},%x", cookie->debug_id, state, want_state); + _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state); switch (state) { case FSCACHE_COOKIE_STATE_LOOKING_UP: @@ -140,7 +140,7 @@ failed: cres->cache_priv = NULL; cres->ops = NULL; fscache_end_cookie_access(cookie, fscache_access_io_not_live); - kleave(" = -ENOBUFS"); + _leave(" = -ENOBUFS"); return -ENOBUFS; } @@ -224,7 +224,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, if (len == 0) goto abandon; - kenter("%llx,%zx", start, len); + _enter("%llx,%zx", start, len); wreq = kzalloc(sizeof(struct fscache_write_request), GFP_NOFS); if (!wreq) diff --git a/fs/netfs/fscache_main.c b/fs/netfs/fscache_main.c index bf9b33d26e31..42e98bb523e3 100644 --- a/fs/netfs/fscache_main.c +++ b/fs/netfs/fscache_main.c @@ -99,7 +99,7 @@ error_wq: */ void __exit fscache_exit(void) { - kenter(""); + _enter(""); kmem_cache_destroy(fscache_cookie_jar); fscache_proc_cleanup(); diff --git a/fs/netfs/fscache_volume.c b/fs/netfs/fscache_volume.c index 2e2a405ca9b0..cb75c07b5281 100644 --- a/fs/netfs/fscache_volume.c +++ b/fs/netfs/fscache_volume.c @@ -264,7 +264,7 @@ static struct fscache_volume *fscache_alloc_volume(const char *volume_key, fscache_see_volume(volume, fscache_volume_new_acquire); fscache_stat(&fscache_n_volumes); up_write(&fscache_addremove_sem); - kleave(" = v=%x", volume->debug_id); + _leave(" = v=%x", volume->debug_id); return volume; err_vol: @@ -466,7 +466,7 @@ void fscache_withdraw_volume(struct fscache_volume *volume) { int n_accesses; - kdebug("withdraw V=%x", volume->debug_id); + _debug("withdraw V=%x", volume->debug_id); /* Allow wakeups on dec-to-0 */ n_accesses = atomic_dec_return(&volume->n_accesses); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 21e46bc9aa49..7773f3d855a9 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -34,6 +34,7 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync); /* * main.c */ +extern unsigned int netfs_debug; extern struct list_head netfs_io_requests; extern spinlock_t netfs_proc_lock; extern mempool_t netfs_request_pool; @@ -353,12 +354,42 @@ void fscache_create_volume(struct fscache_volume *volume, bool wait); * debug tracing */ #define dbgprintk(FMT, ...) \ - pr_debug("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) + printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) +#ifdef __KDEBUG +#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) +#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) +#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) + +#elif defined(CONFIG_NETFS_DEBUG) +#define _enter(FMT, ...) \ +do { \ + if (netfs_debug) \ + kenter(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _leave(FMT, ...) \ +do { \ + if (netfs_debug) \ + kleave(FMT, ##__VA_ARGS__); \ +} while (0) + +#define _debug(FMT, ...) \ +do { \ + if (netfs_debug) \ + kdebug(FMT, ##__VA_ARGS__); \ +} while (0) + +#else +#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) +#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) +#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__) +#endif + /* * assertions */ diff --git a/fs/netfs/io.c b/fs/netfs/io.c index c7576481c321..c93851b98368 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -130,7 +130,7 @@ static void netfs_reset_subreq_iter(struct netfs_io_request *rreq, if (count == remaining) return; - kdebug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n", + _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n", rreq->debug_id, subreq->debug_index, iov_iter_count(&subreq->io_iter), subreq->transferred, subreq->len, rreq->i_size, @@ -326,7 +326,7 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, struct netfs_io_request *rreq = subreq->rreq; int u; - kenter("R=%x[%x]{%llx,%lx},%zd", + _enter("R=%x[%x]{%llx,%lx},%zd", rreq->debug_id, subreq->debug_index, subreq->start, subreq->flags, transferred_or_error); @@ -435,7 +435,7 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, struct netfs_inode *ictx = netfs_inode(rreq->inode); size_t lsize; - kenter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); + _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); if (rreq->origin != NETFS_DIO_READ) { source = netfs_cache_prepare_read(subreq, rreq->i_size); @@ -518,7 +518,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, subreq->start = rreq->start + rreq->submitted; subreq->len = io_iter->count; - kdebug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted); + _debug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted); list_add_tail(&subreq->rreq_link, &rreq->subrequests); /* Call out to the cache to find out what it can do with the remaining @@ -570,7 +570,7 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) struct iov_iter io_iter; int ret; - kenter("R=%x %llx-%llx", + _enter("R=%x %llx-%llx", rreq->debug_id, rreq->start, rreq->start + rreq->len - 1); if (rreq->len == 0) { @@ -593,7 +593,7 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) atomic_set(&rreq->nr_outstanding, 1); io_iter = rreq->io_iter; do { - kdebug("submit %llx + %llx >= %llx", + _debug("submit %llx + %llx >= %llx", rreq->start, rreq->submitted, rreq->i_size); if (rreq->origin == NETFS_DIO_READ && rreq->start + rreq->submitted >= rreq->i_size) diff --git a/fs/netfs/main.c b/fs/netfs/main.c index db824c372842..5f0f438e5d21 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -20,6 +20,10 @@ MODULE_LICENSE("GPL"); EXPORT_TRACEPOINT_SYMBOL(netfs_sreq); +unsigned netfs_debug; +module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); + static struct kmem_cache *netfs_request_slab; static struct kmem_cache *netfs_subrequest_slab; mempool_t netfs_request_pool; diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 172808e83ca8..83e644bd518f 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -26,7 +26,7 @@ bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio) struct fscache_cookie *cookie = netfs_i_cookie(ictx); bool need_use = false; - kenter(""); + _enter(""); if (!filemap_dirty_folio(mapping, folio)) return false; @@ -99,7 +99,7 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) struct netfs_folio *finfo; size_t flen = folio_size(folio); - kenter("{%lx},%zx,%zx", folio->index, offset, length); + _enter("{%lx},%zx,%zx", folio->index, offset, length); if (!folio_test_private(folio)) return; diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 488147439fe0..426cf87aaf2e 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -161,7 +161,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, { struct list_head *next; - kenter("R=%x[%x:]", wreq->debug_id, stream->stream_nr); + _enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr); if (list_empty(&stream->subrequests)) return; @@ -374,7 +374,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq) unsigned int notes; int s; - kenter("%llx-%llx", wreq->start, wreq->start + wreq->len); + _enter("%llx-%llx", wreq->start, wreq->start + wreq->len); trace_netfs_collect(wreq); trace_netfs_rreq(wreq, netfs_rreq_trace_collect); @@ -409,7 +409,7 @@ reassess_streams: front = stream->front; while (front) { trace_netfs_collect_sreq(wreq, front); - //kdebug("sreq [%x] %llx %zx/%zx", + //_debug("sreq [%x] %llx %zx/%zx", // front->debug_index, front->start, front->transferred, front->len); /* Stall if there may be a discontinuity. */ @@ -598,7 +598,7 @@ reassess_streams: out: netfs_put_group_many(wreq->group, wreq->nr_group_rel); wreq->nr_group_rel = 0; - kleave(" = %x", notes); + _leave(" = %x", notes); return; need_retry: @@ -606,7 +606,7 @@ need_retry: * that any partially completed op will have had any wholly transferred * folios removed from it. */ - kdebug("retry"); + _debug("retry"); netfs_retry_writes(wreq); goto out; } @@ -621,7 +621,7 @@ void netfs_write_collection_worker(struct work_struct *work) size_t transferred; int s; - kenter("R=%x", wreq->debug_id); + _enter("R=%x", wreq->debug_id); netfs_see_request(wreq, netfs_rreq_trace_see_work); if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) { @@ -684,7 +684,7 @@ void netfs_write_collection_worker(struct work_struct *work) if (wreq->origin == NETFS_DIO_WRITE) inode_dio_end(wreq->inode); - kdebug("finished"); + _debug("finished"); trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags); wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); @@ -744,7 +744,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, struct netfs_io_request *wreq = subreq->rreq; struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr]; - kenter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error); + _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error); switch (subreq->source) { case NETFS_UPLOAD_TO_SERVER: diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index d7c971df8866..9258d30cffe3 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -99,7 +99,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, if (IS_ERR(wreq)) return wreq; - kenter("R=%x", wreq->debug_id); + _enter("R=%x", wreq->debug_id); ictx = netfs_inode(wreq->inode); if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags)) @@ -122,6 +122,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, wreq->io_streams[1].transferred = LONG_MAX; if (fscache_resources_valid(&wreq->cache_resources)) { wreq->io_streams[1].avail = true; + wreq->io_streams[1].active = true; wreq->io_streams[1].prepare_write = wreq->cache_resources.ops->prepare_write_subreq; wreq->io_streams[1].issue_write = wreq->cache_resources.ops->issue_write; } @@ -159,7 +160,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, subreq->max_nr_segs = INT_MAX; subreq->stream_nr = stream->stream_nr; - kenter("R=%x[%x]", wreq->debug_id, subreq->debug_index); + _enter("R=%x[%x]", wreq->debug_id, subreq->debug_index); trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, refcount_read(&subreq->ref), @@ -215,7 +216,7 @@ static void netfs_do_issue_write(struct netfs_io_stream *stream, { struct netfs_io_request *wreq = subreq->rreq; - kenter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len); + _enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len); if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) return netfs_write_subrequest_terminated(subreq, subreq->error, false); @@ -272,11 +273,11 @@ int netfs_advance_write(struct netfs_io_request *wreq, size_t part; if (!stream->avail) { - kleave("no write"); + _leave("no write"); return len; } - kenter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0); + _enter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0); if (subreq && start != subreq->start + subreq->len) { netfs_issue_write(wreq, stream); @@ -288,7 +289,7 @@ int netfs_advance_write(struct netfs_io_request *wreq, subreq = stream->construct; part = min(subreq->max_len - subreq->len, len); - kdebug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len); + _debug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len); subreq->len += part; subreq->nr_segs++; @@ -319,7 +320,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, bool to_eof = false, streamw = false; bool debug = false; - kenter(""); + _enter(""); /* netfs_perform_write() may shift i_size around the page or from out * of the page to beyond it, but cannot move i_size into or through the @@ -329,7 +330,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, if (fpos >= i_size) { /* mmap beyond eof. */ - kdebug("beyond eof"); + _debug("beyond eof"); folio_start_writeback(folio); folio_unlock(folio); wreq->nr_group_rel += netfs_folio_written_back(folio); @@ -363,7 +364,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, } flen -= foff; - kdebug("folio %zx %zx %zx", foff, flen, fsize); + _debug("folio %zx %zx %zx", foff, flen, fsize); /* Deal with discontinuities in the stream of dirty pages. These can * arise from a number of sources: @@ -487,7 +488,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, for (int s = 0; s < NR_IO_STREAMS; s++) netfs_issue_write(wreq, &wreq->io_streams[s]); - kleave(" = 0"); + _leave(" = 0"); return 0; } @@ -522,7 +523,7 @@ int netfs_writepages(struct address_space *mapping, netfs_stat(&netfs_n_wh_writepages); do { - kdebug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted); + _debug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted); /* It appears we don't have to handle cyclic writeback wrapping. */ WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted); @@ -546,14 +547,14 @@ int netfs_writepages(struct address_space *mapping, mutex_unlock(&ictx->wb_lock); netfs_put_request(wreq, false, netfs_rreq_trace_put_return); - kleave(" = %d", error); + _leave(" = %d", error); return error; couldnt_start: netfs_kill_dirty_pages(mapping, wbc, folio); out: mutex_unlock(&ictx->wb_lock); - kleave(" = %d", error); + _leave(" = %d", error); return error; } EXPORT_SYMBOL(netfs_writepages); @@ -590,7 +591,7 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c struct folio *folio, size_t copied, bool to_page_end, struct folio **writethrough_cache) { - kenter("R=%x ic=%zu ws=%u cp=%zu tp=%u", + _enter("R=%x ic=%zu ws=%u cp=%zu tp=%u", wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end); if (!*writethrough_cache) { @@ -624,7 +625,7 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr struct netfs_inode *ictx = netfs_inode(wreq->inode); int ret; - kenter("R=%x", wreq->debug_id); + _enter("R=%x", wreq->debug_id); if (writethrough_cache) netfs_write_folio(wreq, wbc, writethrough_cache); @@ -657,7 +658,7 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t loff_t start = wreq->start; int error = 0; - kenter("%zx", len); + _enter("%zx", len); if (wreq->origin == NETFS_DIO_WRITE) inode_dio_begin(wreq->inode); @@ -665,7 +666,7 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t while (len) { // TODO: Prepare content encryption - kdebug("unbuffered %zx", len); + _debug("unbuffered %zx", len); part = netfs_advance_write(wreq, upload, start, len, false); start += part; len -= part; @@ -684,6 +685,6 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t if (list_empty(&upload->subrequests)) netfs_wake_write_collector(wreq, false); - kleave(" = %d", error); + _leave(" = %d", error); return error; } diff --git a/fs/pidfs.c b/fs/pidfs.c index c9cb14181def..7ffdc88dfb52 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -119,7 +119,7 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct task_struct *task __free(put_task) = NULL; struct nsproxy *nsp __free(put_nsproxy) = NULL; struct pid *pid = pidfd_pid(file); - struct ns_common *ns_common; + struct ns_common *ns_common = NULL; if (arg) return -EINVAL; @@ -146,52 +146,73 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { /* Namespaces that hang of nsproxy. */ case PIDFD_GET_CGROUP_NAMESPACE: - get_cgroup_ns(nsp->cgroup_ns); - ns_common = to_ns_common(nsp->cgroup_ns); + if (IS_ENABLED(CONFIG_CGROUPS)) { + get_cgroup_ns(nsp->cgroup_ns); + ns_common = to_ns_common(nsp->cgroup_ns); + } break; case PIDFD_GET_IPC_NAMESPACE: - get_ipc_ns(nsp->ipc_ns); - ns_common = to_ns_common(nsp->ipc_ns); + if (IS_ENABLED(CONFIG_IPC_NS)) { + get_ipc_ns(nsp->ipc_ns); + ns_common = to_ns_common(nsp->ipc_ns); + } break; case PIDFD_GET_MNT_NAMESPACE: get_mnt_ns(nsp->mnt_ns); ns_common = to_ns_common(nsp->mnt_ns); break; case PIDFD_GET_NET_NAMESPACE: - ns_common = to_ns_common(nsp->net_ns); - get_net_ns(ns_common); + if (IS_ENABLED(CONFIG_NET_NS)) { + ns_common = to_ns_common(nsp->net_ns); + get_net_ns(ns_common); + } break; case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: - get_pid_ns(nsp->pid_ns_for_children); - ns_common = to_ns_common(nsp->pid_ns_for_children); + if (IS_ENABLED(CONFIG_PID_NS)) { + get_pid_ns(nsp->pid_ns_for_children); + ns_common = to_ns_common(nsp->pid_ns_for_children); + } break; case PIDFD_GET_TIME_NAMESPACE: - get_time_ns(nsp->time_ns); - ns_common = to_ns_common(nsp->time_ns); + if (IS_ENABLED(CONFIG_TIME_NS)) { + get_time_ns(nsp->time_ns); + ns_common = to_ns_common(nsp->time_ns); + } break; case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: - get_time_ns(nsp->time_ns_for_children); - ns_common = to_ns_common(nsp->time_ns_for_children); + if (IS_ENABLED(CONFIG_TIME_NS)) { + get_time_ns(nsp->time_ns_for_children); + ns_common = to_ns_common(nsp->time_ns_for_children); + } break; case PIDFD_GET_UTS_NAMESPACE: - get_uts_ns(nsp->uts_ns); - ns_common = to_ns_common(nsp->uts_ns); + if (IS_ENABLED(CONFIG_UTS_NS)) { + get_uts_ns(nsp->uts_ns); + ns_common = to_ns_common(nsp->uts_ns); + } break; /* Namespaces that don't hang of nsproxy. */ case PIDFD_GET_USER_NAMESPACE: - rcu_read_lock(); - ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns))); - rcu_read_unlock(); + if (IS_ENABLED(CONFIG_USER_NS)) { + rcu_read_lock(); + ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns))); + rcu_read_unlock(); + } break; case PIDFD_GET_PID_NAMESPACE: - rcu_read_lock(); - ns_common = to_ns_common(get_pid_ns(task_active_pid_ns(task))); - rcu_read_unlock(); + if (IS_ENABLED(CONFIG_PID_NS)) { + rcu_read_lock(); + ns_common = to_ns_common( get_pid_ns(task_active_pid_ns(task))); + rcu_read_unlock(); + } break; default: return -ENOIOCTLCMD; } + if (!ns_common) + return -EOPNOTSUPP; + /* open_namespace() unconditionally consumes the reference */ return open_namespace(ns_common); } diff --git a/fs/xattr.c b/fs/xattr.c index f8b643f91a98..7672ce5486c5 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -630,10 +630,9 @@ int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, ctx->kvalue, ctx->size, ctx->flags); } -static long -setxattr(struct mnt_idmap *idmap, struct dentry *d, - const char __user *name, const void __user *value, size_t size, - int flags) +static int path_setxattr(const char __user *pathname, + const char __user *name, const void __user *value, + size_t size, int flags, unsigned int lookup_flags) { struct xattr_name kname; struct xattr_ctx ctx = { @@ -643,33 +642,20 @@ setxattr(struct mnt_idmap *idmap, struct dentry *d, .kname = &kname, .flags = flags, }; + struct path path; int error; error = setxattr_copy(name, &ctx); if (error) return error; - error = do_setxattr(idmap, d, &ctx); - - kvfree(ctx.kvalue); - return error; -} - -static int path_setxattr(const char __user *pathname, - const char __user *name, const void __user *value, - size_t size, int flags, unsigned int lookup_flags) -{ - struct path path; - int error; - retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) - return error; + goto out; error = mnt_want_write(path.mnt); if (!error) { - error = setxattr(mnt_idmap(path.mnt), path.dentry, name, - value, size, flags); + error = do_setxattr(mnt_idmap(path.mnt), path.dentry, &ctx); mnt_drop_write(path.mnt); } path_put(&path); @@ -677,6 +663,9 @@ retry: lookup_flags |= LOOKUP_REVAL; goto retry; } + +out: + kvfree(ctx.kvalue); return error; } @@ -697,20 +686,32 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, const void __user *,value, size_t, size, int, flags) { - struct fd f = fdget(fd); - int error = -EBADF; + struct xattr_name kname; + struct xattr_ctx ctx = { + .cvalue = value, + .kvalue = NULL, + .size = size, + .kname = &kname, + .flags = flags, + }; + int error; + CLASS(fd, f)(fd); if (!f.file) - return error; + return -EBADF; + audit_file(f.file); + error = setxattr_copy(name, &ctx); + if (error) + return error; + error = mnt_want_write_file(f.file); if (!error) { - error = setxattr(file_mnt_idmap(f.file), - f.file->f_path.dentry, name, - value, size, flags); + error = do_setxattr(file_mnt_idmap(f.file), + f.file->f_path.dentry, &ctx); mnt_drop_write_file(f.file); } - fdput(f); + kvfree(ctx.kvalue); return error; } @@ -899,9 +900,17 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) * Extended attribute REMOVE operations */ static long -removexattr(struct mnt_idmap *idmap, struct dentry *d, - const char __user *name) +removexattr(struct mnt_idmap *idmap, struct dentry *d, const char *name) { + if (is_posix_acl_xattr(name)) + return vfs_remove_acl(idmap, d, name); + return vfs_removexattr(idmap, d, name); +} + +static int path_removexattr(const char __user *pathname, + const char __user *name, unsigned int lookup_flags) +{ + struct path path; int error; char kname[XATTR_NAME_MAX + 1]; @@ -910,25 +919,13 @@ removexattr(struct mnt_idmap *idmap, struct dentry *d, error = -ERANGE; if (error < 0) return error; - - if (is_posix_acl_xattr(kname)) - return vfs_remove_acl(idmap, d, kname); - - return vfs_removexattr(idmap, d, kname); -} - -static int path_removexattr(const char __user *pathname, - const char __user *name, unsigned int lookup_flags) -{ - struct path path; - int error; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = mnt_want_write(path.mnt); if (!error) { - error = removexattr(mnt_idmap(path.mnt), path.dentry, name); + error = removexattr(mnt_idmap(path.mnt), path.dentry, kname); mnt_drop_write(path.mnt); } path_put(&path); @@ -954,15 +951,23 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) { struct fd f = fdget(fd); + char kname[XATTR_NAME_MAX + 1]; int error = -EBADF; if (!f.file) return error; audit_file(f.file); + + error = strncpy_from_user(kname, name, sizeof(kname)); + if (error == 0 || error == sizeof(kname)) + error = -ERANGE; + if (error < 0) + return error; + error = mnt_want_write_file(f.file); if (!error) { error = removexattr(file_mnt_idmap(f.file), - f.file->f_path.dentry, name); + f.file->f_path.dentry, kname); mnt_drop_write_file(f.file); } fdput(f); diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c index 47746b0c6acd..7c2a4349170a 100644 --- a/tools/testing/selftests/pidfd/pidfd_setns_test.c +++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c @@ -16,11 +16,56 @@ #include #include #include +#include #include "pidfd.h" #include "../clone3/clone3_selftests.h" #include "../kselftest_harness.h" +#ifndef PIDFS_IOCTL_MAGIC +#define PIDFS_IOCTL_MAGIC 0xFF +#endif + +#ifndef PIDFD_GET_CGROUP_NAMESPACE +#define PIDFD_GET_CGROUP_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 1) +#endif + +#ifndef PIDFD_GET_IPC_NAMESPACE +#define PIDFD_GET_IPC_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 2) +#endif + +#ifndef PIDFD_GET_MNT_NAMESPACE +#define PIDFD_GET_MNT_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 3) +#endif + +#ifndef PIDFD_GET_NET_NAMESPACE +#define PIDFD_GET_NET_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 4) +#endif + +#ifndef PIDFD_GET_PID_NAMESPACE +#define PIDFD_GET_PID_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 5) +#endif + +#ifndef PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE +#define PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 6) +#endif + +#ifndef PIDFD_GET_TIME_NAMESPACE +#define PIDFD_GET_TIME_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 7) +#endif + +#ifndef PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE +#define PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 8) +#endif + +#ifndef PIDFD_GET_USER_NAMESPACE +#define PIDFD_GET_USER_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 9) +#endif + +#ifndef PIDFD_GET_UTS_NAMESPACE +#define PIDFD_GET_UTS_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 10) +#endif + enum { PIDFD_NS_USER, PIDFD_NS_MNT, @@ -31,22 +76,25 @@ enum { PIDFD_NS_CGROUP, PIDFD_NS_PIDCLD, PIDFD_NS_TIME, + PIDFD_NS_TIMECLD, PIDFD_NS_MAX }; const struct ns_info { const char *name; int flag; + unsigned int pidfd_ioctl; } ns_info[] = { - [PIDFD_NS_USER] = { "user", CLONE_NEWUSER, }, - [PIDFD_NS_MNT] = { "mnt", CLONE_NEWNS, }, - [PIDFD_NS_PID] = { "pid", CLONE_NEWPID, }, - [PIDFD_NS_UTS] = { "uts", CLONE_NEWUTS, }, - [PIDFD_NS_IPC] = { "ipc", CLONE_NEWIPC, }, - [PIDFD_NS_NET] = { "net", CLONE_NEWNET, }, - [PIDFD_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, }, - [PIDFD_NS_PIDCLD] = { "pid_for_children", 0, }, - [PIDFD_NS_TIME] = { "time", CLONE_NEWTIME, }, + [PIDFD_NS_USER] = { "user", CLONE_NEWUSER, PIDFD_GET_USER_NAMESPACE, }, + [PIDFD_NS_MNT] = { "mnt", CLONE_NEWNS, PIDFD_GET_MNT_NAMESPACE, }, + [PIDFD_NS_PID] = { "pid", CLONE_NEWPID, PIDFD_GET_PID_NAMESPACE, }, + [PIDFD_NS_UTS] = { "uts", CLONE_NEWUTS, PIDFD_GET_UTS_NAMESPACE, }, + [PIDFD_NS_IPC] = { "ipc", CLONE_NEWIPC, PIDFD_GET_IPC_NAMESPACE, }, + [PIDFD_NS_NET] = { "net", CLONE_NEWNET, PIDFD_GET_NET_NAMESPACE, }, + [PIDFD_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, PIDFD_GET_CGROUP_NAMESPACE, }, + [PIDFD_NS_TIME] = { "time", CLONE_NEWTIME, PIDFD_GET_TIME_NAMESPACE, }, + [PIDFD_NS_PIDCLD] = { "pid_for_children", 0, PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE, }, + [PIDFD_NS_TIMECLD] = { "time_for_children", 0, PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE, }, }; FIXTURE(current_nsset) @@ -54,6 +102,7 @@ FIXTURE(current_nsset) pid_t pid; int pidfd; int nsfds[PIDFD_NS_MAX]; + int child_pidfd_derived_nsfds[PIDFD_NS_MAX]; pid_t child_pid_exited; int child_pidfd_exited; @@ -61,10 +110,12 @@ FIXTURE(current_nsset) pid_t child_pid1; int child_pidfd1; int child_nsfds1[PIDFD_NS_MAX]; + int child_pidfd_derived_nsfds1[PIDFD_NS_MAX]; pid_t child_pid2; int child_pidfd2; int child_nsfds2[PIDFD_NS_MAX]; + int child_pidfd_derived_nsfds2[PIDFD_NS_MAX]; }; static int sys_waitid(int which, pid_t pid, int options) @@ -128,9 +179,12 @@ FIXTURE_SETUP(current_nsset) char c; for (i = 0; i < PIDFD_NS_MAX; i++) { - self->nsfds[i] = -EBADF; - self->child_nsfds1[i] = -EBADF; - self->child_nsfds2[i] = -EBADF; + self->nsfds[i] = -EBADF; + self->child_nsfds1[i] = -EBADF; + self->child_nsfds2[i] = -EBADF; + self->child_pidfd_derived_nsfds[i] = -EBADF; + self->child_pidfd_derived_nsfds1[i] = -EBADF; + self->child_pidfd_derived_nsfds2[i] = -EBADF; } proc_fd = open("/proc/self/ns", O_DIRECTORY | O_CLOEXEC); @@ -139,6 +193,11 @@ FIXTURE_SETUP(current_nsset) } self->pid = getpid(); + self->pidfd = sys_pidfd_open(self->pid, 0); + EXPECT_GT(self->pidfd, 0) { + TH_LOG("%m - Failed to open pidfd for process %d", self->pid); + } + for (i = 0; i < PIDFD_NS_MAX; i++) { const struct ns_info *info = &ns_info[i]; self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC); @@ -148,20 +207,27 @@ FIXTURE_SETUP(current_nsset) info->name, self->pid); } } - } - self->pidfd = sys_pidfd_open(self->pid, 0); - EXPECT_GT(self->pidfd, 0) { - TH_LOG("%m - Failed to open pidfd for process %d", self->pid); + self->child_pidfd_derived_nsfds[i] = ioctl(self->pidfd, info->pidfd_ioctl, 0); + if (self->child_pidfd_derived_nsfds[i] < 0) { + EXPECT_EQ(errno, EOPNOTSUPP) { + TH_LOG("%m - Failed to derive %s namespace from pidfd of process %d", + info->name, self->pid); + } + } } /* Create task that exits right away. */ - self->child_pid_exited = create_child(&self->child_pidfd_exited, - CLONE_NEWUSER | CLONE_NEWNET); + self->child_pid_exited = create_child(&self->child_pidfd_exited, 0); EXPECT_GE(self->child_pid_exited, 0); - if (self->child_pid_exited == 0) + if (self->child_pid_exited == 0) { + if (self->nsfds[PIDFD_NS_USER] >= 0 && unshare(CLONE_NEWUSER) < 0) + _exit(EXIT_FAILURE); + if (self->nsfds[PIDFD_NS_NET] >= 0 && unshare(CLONE_NEWNET) < 0) + _exit(EXIT_FAILURE); _exit(EXIT_SUCCESS); + } ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0); @@ -174,18 +240,43 @@ FIXTURE_SETUP(current_nsset) EXPECT_EQ(ret, 0); /* Create tasks that will be stopped. */ - self->child_pid1 = create_child(&self->child_pidfd1, - CLONE_NEWUSER | CLONE_NEWNS | - CLONE_NEWCGROUP | CLONE_NEWIPC | - CLONE_NEWUTS | CLONE_NEWPID | - CLONE_NEWNET); + if (self->nsfds[PIDFD_NS_USER] >= 0 && self->nsfds[PIDFD_NS_PID] >= 0) + self->child_pid1 = create_child(&self->child_pidfd1, CLONE_NEWUSER | CLONE_NEWPID); + else if (self->nsfds[PIDFD_NS_PID] >= 0) + self->child_pid1 = create_child(&self->child_pidfd1, CLONE_NEWPID); + else if (self->nsfds[PIDFD_NS_USER] >= 0) + self->child_pid1 = create_child(&self->child_pidfd1, CLONE_NEWUSER); + else + self->child_pid1 = create_child(&self->child_pidfd1, 0); EXPECT_GE(self->child_pid1, 0); if (self->child_pid1 == 0) { close(ipc_sockets[0]); - if (!switch_timens()) + if (self->nsfds[PIDFD_NS_MNT] >= 0 && unshare(CLONE_NEWNS) < 0) { + TH_LOG("%m - Failed to unshare mount namespace for process %d", self->pid); _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_CGROUP] >= 0 && unshare(CLONE_NEWCGROUP) < 0) { + TH_LOG("%m - Failed to unshare cgroup namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_IPC] >= 0 && unshare(CLONE_NEWIPC) < 0) { + TH_LOG("%m - Failed to unshare ipc namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_UTS] >= 0 && unshare(CLONE_NEWUTS) < 0) { + TH_LOG("%m - Failed to unshare uts namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_NET] >= 0 && unshare(CLONE_NEWNET) < 0) { + TH_LOG("%m - Failed to unshare net namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_TIME] >= 0 && !switch_timens()) { + TH_LOG("%m - Failed to unshare time namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } if (write_nointr(ipc_sockets[1], "1", 1) < 0) _exit(EXIT_FAILURE); @@ -203,18 +294,43 @@ FIXTURE_SETUP(current_nsset) ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); EXPECT_EQ(ret, 0); - self->child_pid2 = create_child(&self->child_pidfd2, - CLONE_NEWUSER | CLONE_NEWNS | - CLONE_NEWCGROUP | CLONE_NEWIPC | - CLONE_NEWUTS | CLONE_NEWPID | - CLONE_NEWNET); + if (self->nsfds[PIDFD_NS_USER] >= 0 && self->nsfds[PIDFD_NS_PID] >= 0) + self->child_pid2 = create_child(&self->child_pidfd2, CLONE_NEWUSER | CLONE_NEWPID); + else if (self->nsfds[PIDFD_NS_PID] >= 0) + self->child_pid2 = create_child(&self->child_pidfd2, CLONE_NEWPID); + else if (self->nsfds[PIDFD_NS_USER] >= 0) + self->child_pid2 = create_child(&self->child_pidfd2, CLONE_NEWUSER); + else + self->child_pid2 = create_child(&self->child_pidfd2, 0); EXPECT_GE(self->child_pid2, 0); if (self->child_pid2 == 0) { close(ipc_sockets[0]); - if (!switch_timens()) + if (self->nsfds[PIDFD_NS_MNT] >= 0 && unshare(CLONE_NEWNS) < 0) { + TH_LOG("%m - Failed to unshare mount namespace for process %d", self->pid); _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_CGROUP] >= 0 && unshare(CLONE_NEWCGROUP) < 0) { + TH_LOG("%m - Failed to unshare cgroup namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_IPC] >= 0 && unshare(CLONE_NEWIPC) < 0) { + TH_LOG("%m - Failed to unshare ipc namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_UTS] >= 0 && unshare(CLONE_NEWUTS) < 0) { + TH_LOG("%m - Failed to unshare uts namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_NET] >= 0 && unshare(CLONE_NEWNET) < 0) { + TH_LOG("%m - Failed to unshare net namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } + if (self->nsfds[PIDFD_NS_TIME] >= 0 && !switch_timens()) { + TH_LOG("%m - Failed to unshare time namespace for process %d", self->pid); + _exit(EXIT_FAILURE); + } if (write_nointr(ipc_sockets[1], "1", 1) < 0) _exit(EXIT_FAILURE); @@ -267,6 +383,22 @@ FIXTURE_SETUP(current_nsset) info->name, self->child_pid1); } } + + self->child_pidfd_derived_nsfds1[i] = ioctl(self->child_pidfd1, info->pidfd_ioctl, 0); + if (self->child_pidfd_derived_nsfds1[i] < 0) { + EXPECT_EQ(errno, EOPNOTSUPP) { + TH_LOG("%m - Failed to derive %s namespace from pidfd of process %d", + info->name, self->child_pid1); + } + } + + self->child_pidfd_derived_nsfds2[i] = ioctl(self->child_pidfd2, info->pidfd_ioctl, 0); + if (self->child_pidfd_derived_nsfds2[i] < 0) { + EXPECT_EQ(errno, EOPNOTSUPP) { + TH_LOG("%m - Failed to derive %s namespace from pidfd of process %d", + info->name, self->child_pid2); + } + } } close(proc_fd); @@ -288,6 +420,12 @@ FIXTURE_TEARDOWN(current_nsset) close(self->child_nsfds1[i]); if (self->child_nsfds2[i] >= 0) close(self->child_nsfds2[i]); + if (self->child_pidfd_derived_nsfds[i] >= 0) + close(self->child_pidfd_derived_nsfds[i]); + if (self->child_pidfd_derived_nsfds1[i] >= 0) + close(self->child_pidfd_derived_nsfds1[i]); + if (self->child_pidfd_derived_nsfds2[i] >= 0) + close(self->child_pidfd_derived_nsfds2[i]); } if (self->child_pidfd1 >= 0) @@ -446,6 +584,42 @@ TEST_F(current_nsset, nsfd_incremental_setns) } } +TEST_F(current_nsset, pidfd_derived_nsfd_incremental_setns) +{ + int i; + pid_t pid; + + pid = getpid(); + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + int nsfd; + + if (self->child_pidfd_derived_nsfds1[i] < 0) + continue; + + if (info->flag) { + ASSERT_EQ(setns(self->child_pidfd_derived_nsfds1[i], info->flag), 0) { + TH_LOG("%m - Failed to setns to %s namespace of %d via nsfd %d", + info->name, self->child_pid1, + self->child_pidfd_derived_nsfds1[i]); + } + } + + /* Verify that we have changed to the correct namespaces. */ + if (info->flag == CLONE_NEWPID) + nsfd = self->child_pidfd_derived_nsfds[i]; + else + nsfd = self->child_pidfd_derived_nsfds1[i]; + ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) { + TH_LOG("setns failed to place us correctly into %s namespace of %d via nsfd %d", + info->name, self->child_pid1, + self->child_pidfd_derived_nsfds1[i]); + } + TH_LOG("Managed to correctly setns to %s namespace of %d via nsfd %d", + info->name, self->child_pid1, self->child_pidfd_derived_nsfds1[i]); + } +} + TEST_F(current_nsset, pidfd_one_shot_setns) { unsigned flags = 0; @@ -542,6 +716,28 @@ TEST_F(current_nsset, no_foul_play) info->name, self->child_pid2, self->child_nsfds2[i]); } + + /* + * Can't setns to a user namespace outside of our hierarchy since we + * don't have caps in there and didn't create it. That means that under + * no circumstances should we be able to setns to any of the other + * ones since they aren't owned by our user namespace. + */ + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + + if (self->child_pidfd_derived_nsfds2[i] < 0 || !info->flag) + continue; + + ASSERT_NE(setns(self->child_pidfd_derived_nsfds2[i], info->flag), 0) { + TH_LOG("Managed to setns to %s namespace of %d via nsfd %d", + info->name, self->child_pid2, + self->child_pidfd_derived_nsfds2[i]); + } + TH_LOG("%m - Correctly failed to setns to %s namespace of %d via nsfd %d", + info->name, self->child_pid2, + self->child_pidfd_derived_nsfds2[i]); + } } TEST(setns_einval)