diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 88be26e5866b..2bca7edfc9f6 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -392,6 +392,7 @@ struct fuse_writepage_args {
 	struct list_head queue_entry;
 	struct fuse_writepage_args *next;
 	struct inode *inode;
+	struct fuse_sync_bucket *bucket;
 };
 
 static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
@@ -1611,6 +1612,9 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa)
 	struct fuse_args_pages *ap = &wpa->ia.ap;
 	int i;
 
+	if (wpa->bucket)
+		fuse_sync_bucket_dec(wpa->bucket);
+
 	for (i = 0; i < ap->num_pages; i++)
 		__free_page(ap->pages[i]);
 
@@ -1874,6 +1878,20 @@ static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
 
 }
 
+static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
+					 struct fuse_writepage_args *wpa)
+{
+	if (!fc->sync_fs)
+		return;
+
+	rcu_read_lock();
+	/* Prevent resurrection of dead bucket in unlikely race with syncfs */
+	do {
+		wpa->bucket = rcu_dereference(fc->curr_bucket);
+	} while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
+	rcu_read_unlock();
+}
+
 static int fuse_writepage_locked(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
@@ -1901,6 +1919,7 @@ static int fuse_writepage_locked(struct page *page)
 	if (!wpa->ia.ff)
 		goto err_nofile;
 
+	fuse_writepage_add_to_bucket(fc, wpa);
 	fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
 	copy_highpage(tmp_page, page);
 	wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
@@ -2151,6 +2170,8 @@ static int fuse_writepages_fill(struct page *page,
 			__free_page(tmp_page);
 			goto out_unlock;
 		}
+		fuse_writepage_add_to_bucket(fc, wpa);
+
 		data->max_pages = 1;
 
 		ap = &wpa->ia.ap;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index a78480933ebe..f166e24dd48b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -516,6 +516,13 @@ struct fuse_fs_context {
 	void **fudptr;
 };
 
+struct fuse_sync_bucket {
+	/* count is a possible scalability bottleneck */
+	atomic_t count;
+	wait_queue_head_t waitq;
+	struct rcu_head rcu;
+};
+
 /**
  * A Fuse connection.
  *
@@ -808,6 +815,9 @@ struct fuse_conn {
 
 	/** List of filesystems using this connection */
 	struct list_head mounts;
+
+	/* New writepages go into this bucket */
+	struct fuse_sync_bucket __rcu *curr_bucket;
 };
 
 /*
@@ -911,6 +921,15 @@ static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
 		descs[i].length = PAGE_SIZE - descs[i].offset;
 }
 
+static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket)
+{
+	/* Need RCU protection to prevent use after free after the decrement */
+	rcu_read_lock();
+	if (atomic_dec_and_test(&bucket->count))
+		wake_up(&bucket->waitq);
+	rcu_read_unlock();
+}
+
 /** Device operations */
 extern const struct file_operations fuse_dev_operations;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a3e7fb484938..2187211893ff 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -506,6 +506,57 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return err;
 }
 
+static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void)
+{
+	struct fuse_sync_bucket *bucket;
+
+	bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL);
+	if (bucket) {
+		init_waitqueue_head(&bucket->waitq);
+		/* Initial active count */
+		atomic_set(&bucket->count, 1);
+	}
+	return bucket;
+}
+
+static void fuse_sync_fs_writes(struct fuse_conn *fc)
+{
+	struct fuse_sync_bucket *bucket, *new_bucket;
+	int count;
+
+	new_bucket = fuse_sync_bucket_alloc();
+	spin_lock(&fc->lock);
+	bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+	count = atomic_read(&bucket->count);
+	WARN_ON(count < 1);
+	/* No outstanding writes? */
+	if (count == 1) {
+		spin_unlock(&fc->lock);
+		kfree(new_bucket);
+		return;
+	}
+
+	/*
+	 * Completion of new bucket depends on completion of this bucket, so add
+	 * one more count.
+	 */
+	atomic_inc(&new_bucket->count);
+	rcu_assign_pointer(fc->curr_bucket, new_bucket);
+	spin_unlock(&fc->lock);
+	/*
+	 * Drop initial active count. At this point if all writes in this and
+	 * ancestor buckets complete, the count will go to zero and this task
+	 * will be woken up.
+	 */
+	atomic_dec(&bucket->count);
+
+	wait_event(bucket->waitq, atomic_read(&bucket->count) == 0);
+
+	/* Drop temp count on descendant bucket */
+	fuse_sync_bucket_dec(new_bucket);
+	kfree_rcu(bucket, rcu);
+}
+
 static int fuse_sync_fs(struct super_block *sb, int wait)
 {
 	struct fuse_mount *fm = get_fuse_mount_super(sb);
@@ -528,6 +579,8 @@ static int fuse_sync_fs(struct super_block *sb, int wait)
 	if (!fc->sync_fs)
 		return 0;
 
+	fuse_sync_fs_writes(fc);
+
 	memset(&inarg, 0, sizeof(inarg));
 	args.in_numargs = 1;
 	args.in_args[0].size = sizeof(inarg);
@@ -763,6 +816,7 @@ void fuse_conn_put(struct fuse_conn *fc)
 {
 	if (refcount_dec_and_test(&fc->count)) {
 		struct fuse_iqueue *fiq = &fc->iq;
+		struct fuse_sync_bucket *bucket;
 
 		if (IS_ENABLED(CONFIG_FUSE_DAX))
 			fuse_dax_conn_free(fc);
@@ -770,6 +824,11 @@ void fuse_conn_put(struct fuse_conn *fc)
 		fiq->ops->release(fiq);
 		put_pid_ns(fc->pid_ns);
 		put_user_ns(fc->user_ns);
+		bucket = rcu_dereference_protected(fc->curr_bucket, 1);
+		if (bucket) {
+			WARN_ON(atomic_read(&bucket->count) != 1);
+			kfree(bucket);
+		}
 		fc->release(fc);
 	}
 }
@@ -1418,6 +1477,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
 	if (sb->s_flags & SB_MANDLOCK)
 		goto err;
 
+	rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc());
 	fuse_sb_defaults(sb);
 
 	if (ctx->is_bdev) {
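
The bucket scheme is easiest to see in isolation. Below is a minimal, self-contained user-space model of the bucket lifecycle (all names hypothetical, not kernel code): C11 atomics model the bucket count, a pthread mutex stands in for fc->lock, and a condition variable stands in for the waitqueue. Because the mutex makes the lookup-and-increment in write_begin() atomic, the sketch does not need the atomic_inc_not_zero() retry loop that the lock-free kernel path uses in fuse_writepage_add_to_bucket(); likewise, performing the decrement and wakeup under the bucket's own lock plays the use-after-free-prevention role that rcu_read_lock() plays in fuse_sync_bucket_dec(). Build with: cc -pthread bucket_model.c

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Models struct fuse_sync_bucket: a count plus a wait queue */
struct bucket {
	atomic_int count;		/* 1 initial ref + 1 per in-flight write */
	pthread_mutex_t lock;
	pthread_cond_t waitq;
};

static struct bucket *curr_bucket;	/* models fc->curr_bucket */
static pthread_mutex_t conn_lock = PTHREAD_MUTEX_INITIALIZER;	/* fc->lock */

static struct bucket *bucket_alloc(void)
{
	struct bucket *b = calloc(1, sizeof(*b));

	atomic_init(&b->count, 1);	/* initial active count */
	pthread_mutex_init(&b->lock, NULL);
	pthread_cond_init(&b->waitq, NULL);
	return b;
}

/*
 * Models fuse_sync_bucket_dec(): drop a reference, wake the syncfs
 * waiter on the last one.  Decrement and wakeup happen under b->lock,
 * so the waiter cannot free the bucket while a writer is still in
 * here -- the role rcu_read_lock() plays in the kernel version.
 */
static void bucket_dec(struct bucket *b)
{
	pthread_mutex_lock(&b->lock);
	if (atomic_fetch_sub(&b->count, 1) == 1)
		pthread_cond_broadcast(&b->waitq);
	pthread_mutex_unlock(&b->lock);
}

/* Models fuse_writepage_add_to_bucket(): a write joins the current bucket */
static struct bucket *write_begin(void)
{
	struct bucket *b;

	pthread_mutex_lock(&conn_lock);
	b = curr_bucket;
	atomic_fetch_add(&b->count, 1);
	pthread_mutex_unlock(&conn_lock);
	return b;
}

/* Models fuse_sync_fs_writes(): rotate buckets, drain the old one */
static void sync_writes(void)
{
	struct bucket *b, *new_b = bucket_alloc();

	pthread_mutex_lock(&conn_lock);
	b = curr_bucket;
	if (atomic_load(&b->count) == 1) {	/* no outstanding writes */
		pthread_mutex_unlock(&conn_lock);
		free(new_b);
		return;
	}
	atomic_fetch_add(&new_b->count, 1);	/* new depends on old */
	curr_bucket = new_b;
	pthread_mutex_unlock(&conn_lock);

	pthread_mutex_lock(&b->lock);
	atomic_fetch_sub(&b->count, 1);		/* drop initial active count */
	while (atomic_load(&b->count) != 0)
		pthread_cond_wait(&b->waitq, &b->lock);
	pthread_mutex_unlock(&b->lock);

	bucket_dec(new_b);			/* drop temp count on descendant */
	free(b);
}

static void *writer(void *arg)
{
	struct bucket *b = write_begin();

	(void)arg;
	usleep(10000);		/* the write is "in flight" */
	bucket_dec(b);		/* as fuse_writepage_free() would */
	return NULL;
}

int main(void)
{
	pthread_t t[4];
	int i;

	curr_bucket = bucket_alloc();
	for (i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, writer, NULL);
	usleep(1000);		/* give the writers a chance to submit */
	sync_writes();		/* drains every write submitted so far */
	puts("submitted writes drained");
	for (i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	free(curr_bucket);
	return 0;
}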
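
Two design points in the patch are worth calling out. First, syncfs never tracks individual write requests: it rotates the curr_bucket pointer and waits on a single counter, so the writepage path stays O(1) and lock-free no matter how many writes are in flight (the "count is a possible scalability bottleneck" comment flags the one shared atomic this costs). Second, the temporary reference that fuse_sync_fs_writes() takes on the new bucket is what makes concurrent syncfs calls compose: a second syncfs that rotates again cannot see its descendant bucket drain until the ancestor bucket has drained and dropped that reference, which is exactly the ordering the "completion of new bucket depends on completion of this bucket" comment describes.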