zonefs changes for 5.19-rc1

This set of patches improves zonefs open sequential file accounting and
 adds accounting for active sequential files to allow the user to handle
 the maximum number of active zones of an NVMe ZNS drive. sysfs
 attributes for both open and active sequential files are also added to
 facilitate access to this information from applications without
 resorting to inspecting the block device limits.
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQSRPv8tYSvhwAzJdzjdoc3SxdoYdgUCYosTQQAKCRDdoc3SxdoY
 dqUWAQDGKoSkyRAPJAmuQXYOuOJTLu0b8DSfvyPopFLfKXpPHAEAg995JNTLUs0G
 R3m7lH6GK+OSBWhZ/Z5HOND3QS9BhgM=
 =hvqx
 -----END PGP SIGNATURE-----

Merge tag 'zonefs-5.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs

Pull zonefs updates from Damien Le Moal:
 "This improves zonefs open sequential file accounting and adds
  accounting for active sequential files to allow the user to handle the
  maximum number of active zones of an NVMe ZNS drive.

  sysfs attributes for both open and active sequential files are also
  added to facilitate access to this information from applications
  without resorting to inspecting the block device limits"

* tag 'zonefs-5.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs:
  documentation: zonefs: Document sysfs attributes
  documentation: zonefs: Cleanup the mount options section
  zonefs: Add active seq file accounting
  zonefs: Export open zone resource information through sysfs
  zonefs: Always do seq file write open accounting
  zonefs: Rename super block information fields
  zonefs: Fix management of open zones
  zonefs: Clear inode information flags on inode creation
This commit is contained in:
Linus Torvalds 2022-05-23 14:36:45 -07:00
commit 140e40e39a
5 changed files with 344 additions and 53 deletions

View File

@ -306,8 +306,15 @@ Further notes:
Mount options
-------------
zonefs define the "errors=<behavior>" mount option to allow the user to specify
zonefs behavior in response to I/O errors, inode size inconsistencies or zone
zonefs defines several mount options:
* errors=<behavior>
* explicit-open
"errors=<behavior>" option
~~~~~~~~~~~~~~~~~~~~~~~~~~
The "errors=<behavior>" mount option allows the user to specify zonefs
behavior in response to I/O errors, inode size inconsistencies or zone
condition changes. The defined behaviors are as follows:
* remount-ro (default)
@ -326,6 +333,9 @@ discover the amount of data that has been written to the zone. In the case of a
read-only zone discovered at run-time, as indicated in the previous section.
The size of the zone file is left unchanged from its last updated value.
"explicit-open" option
~~~~~~~~~~~~~~~~~~~~~~
A zoned block device (e.g. an NVMe Zoned Namespace device) may have limits on
the number of zones that can be active, that is, zones that are in the
implicit open, explicit open or closed conditions. This potential limitation
@ -341,6 +351,44 @@ guaranteed that write requests can be processed. Conversely, the
to the device on the last close() of a zone file if the zone is not full nor
empty.
Runtime sysfs attributes
------------------------
zonefs defines several sysfs attributes for mounted devices. All attributes
are user readable and can be found in the directory /sys/fs/zonefs/<dev>/,
where <dev> is the name of the mounted zoned block device.
The attributes defined are as follows.
* **max_wro_seq_files**: This attribute reports the maximum number of
sequential zone files that can be open for writing. This number corresponds
to the maximum number of explicitly or implicitly open zones that the device
supports. A value of 0 means that the device has no limit and that any zone
(any file) can be open for writing and written at any time, regardless of the
state of other zones. When the *explicit-open* mount option is used, zonefs
will fail any open() system call requesting to open a sequential zone file for
writing when the number of sequential zone files already open for writing has
reached the *max_wro_seq_files* limit.
* **nr_wro_seq_files**: This attribute reports the current number of sequential
zone files open for writing. When the "explicit-open" mount option is used,
this number can never exceed *max_wro_seq_files*. If the *explicit-open*
mount option is not used, the reported number can be greater than
*max_wro_seq_files*. In such a case, it is the responsibility of the
application to not write simultaneously more than *max_wro_seq_files*
sequential zone files. Failure to do so can result in write errors.
* **max_active_seq_files**: This attribute reports the maximum number of
sequential zone files that are in an active state, that is, sequential zone
files that are partially written (not empty nor full) or that have a zone that
is explicitly open (which happens only if the *explicit-open* mount option is
used). This number is always equal to the maximum number of active zones that
the device supports. A value of 0 means that the mounted device has no limit
on the number of sequential zone files that can be active.
* **nr_active_seq_files**: This attribute reports the current number of
sequential zone files that are active. If *max_active_seq_files* is not 0,
then the value of *nr_active_seq_files* can never exceed the value of
*max_active_seq_files*, regardless of the use of the *explicit-open* mount
option.
Zonefs User Space Tools
=======================

View File

@ -3,4 +3,4 @@ ccflags-y += -I$(src)
obj-$(CONFIG_ZONEFS_FS) += zonefs.o
zonefs-y := super.o
zonefs-y := super.o sysfs.o

View File

@ -27,6 +27,39 @@
#define CREATE_TRACE_POINTS
#include "trace.h"
/*
* Manage the active zone count. Called with zi->i_truncate_mutex held.
*/
static void zonefs_account_active(struct inode *inode)
{
struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
struct zonefs_inode_info *zi = ZONEFS_I(inode);
lockdep_assert_held(&zi->i_truncate_mutex);
if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
return;
/*
* If the zone is active, that is, if it is explicitly open or
* partially written, check if it was already accounted as active.
*/
if ((zi->i_flags & ZONEFS_ZONE_OPEN) ||
(zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) {
if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) {
zi->i_flags |= ZONEFS_ZONE_ACTIVE;
atomic_inc(&sbi->s_active_seq_files);
}
return;
}
/* The zone is not active. If it was, update the active count */
if (zi->i_flags & ZONEFS_ZONE_ACTIVE) {
zi->i_flags &= ~ZONEFS_ZONE_ACTIVE;
atomic_dec(&sbi->s_active_seq_files);
}
}
static inline int zonefs_zone_mgmt(struct inode *inode,
enum req_opf op)
{
@ -68,8 +101,13 @@ static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)
* A full zone is no longer open/active and does not need
* explicit closing.
*/
if (isize >= zi->i_max_size)
zi->i_flags &= ~ZONEFS_ZONE_OPEN;
if (isize >= zi->i_max_size) {
struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
if (zi->i_flags & ZONEFS_ZONE_ACTIVE)
atomic_dec(&sbi->s_active_seq_files);
zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE);
}
}
static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
@ -397,6 +435,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
zonefs_update_stats(inode, data_size);
zonefs_i_size_write(inode, data_size);
zi->i_wpoffset = data_size;
zonefs_account_active(inode);
return 0;
}
@ -508,6 +547,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize)
zonefs_update_stats(inode, isize);
truncate_setsize(inode, isize);
zi->i_wpoffset = isize;
zonefs_account_active(inode);
unlock:
mutex_unlock(&zi->i_truncate_mutex);
@ -865,8 +905,15 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
(ret > 0 || ret == -EIOCBQUEUED)) {
if (ret > 0)
count = ret;
/*
* Update the zone write pointer offset assuming the write
* operation succeeded. If it did not, the error recovery path
* will correct it. Also do active seq file accounting.
*/
mutex_lock(&zi->i_truncate_mutex);
zi->i_wpoffset += count;
zonefs_account_active(inode);
mutex_unlock(&zi->i_truncate_mutex);
}
@ -1008,13 +1055,13 @@ inode_unlock:
return ret;
}
static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file)
/*
* Write open accounting is done only for sequential files.
*/
static inline bool zonefs_seq_file_need_wro(struct inode *inode,
struct file *file)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN))
return false;
if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
return false;
@ -1025,28 +1072,34 @@ static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *fi
return true;
}
static int zonefs_open_zone(struct inode *inode)
static int zonefs_seq_file_write_open(struct inode *inode)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
int ret = 0;
mutex_lock(&zi->i_truncate_mutex);
if (!zi->i_wr_refcnt) {
if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) {
atomic_dec(&sbi->s_open_zones);
ret = -EBUSY;
goto unlock;
}
struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
if (i_size_read(inode) < zi->i_max_size) {
ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
if (ret) {
atomic_dec(&sbi->s_open_zones);
if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
if (wro > sbi->s_max_wro_seq_files) {
atomic_dec(&sbi->s_wro_seq_files);
ret = -EBUSY;
goto unlock;
}
zi->i_flags |= ZONEFS_ZONE_OPEN;
if (i_size_read(inode) < zi->i_max_size) {
ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
if (ret) {
atomic_dec(&sbi->s_wro_seq_files);
goto unlock;
}
zi->i_flags |= ZONEFS_ZONE_OPEN;
zonefs_account_active(inode);
}
}
}
@ -1066,30 +1119,31 @@ static int zonefs_file_open(struct inode *inode, struct file *file)
if (ret)
return ret;
if (zonefs_file_use_exp_open(inode, file))
return zonefs_open_zone(inode);
if (zonefs_seq_file_need_wro(inode, file))
return zonefs_seq_file_write_open(inode);
return 0;
}
static void zonefs_close_zone(struct inode *inode)
static void zonefs_seq_file_write_close(struct inode *inode)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
int ret = 0;
mutex_lock(&zi->i_truncate_mutex);
zi->i_wr_refcnt--;
if (!zi->i_wr_refcnt) {
struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
struct super_block *sb = inode->i_sb;
/*
* If the file zone is full, it is not open anymore and we only
* need to decrement the open count.
*/
if (!(zi->i_flags & ZONEFS_ZONE_OPEN))
goto dec;
if (zi->i_wr_refcnt)
goto unlock;
/*
* The file zone may not be open anymore (e.g. the file was truncated to
* its maximum size or it was fully written). For this case, we only
* need to decrement the write open count.
*/
if (zi->i_flags & ZONEFS_ZONE_OPEN) {
ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
if (ret) {
__zonefs_io_error(inode, false);
@ -1101,14 +1155,23 @@ static void zonefs_close_zone(struct inode *inode)
*/
if (zi->i_flags & ZONEFS_ZONE_OPEN &&
!(sb->s_flags & SB_RDONLY)) {
zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n");
zonefs_warn(sb,
"closing zone at %llu failed %d\n",
zi->i_zsector, ret);
zonefs_warn(sb,
"remounting filesystem read-only\n");
sb->s_flags |= SB_RDONLY;
}
goto unlock;
}
zi->i_flags &= ~ZONEFS_ZONE_OPEN;
dec:
atomic_dec(&sbi->s_open_zones);
zonefs_account_active(inode);
}
atomic_dec(&sbi->s_wro_seq_files);
unlock:
mutex_unlock(&zi->i_truncate_mutex);
}
@ -1120,8 +1183,8 @@ static int zonefs_file_release(struct inode *inode, struct file *file)
* the zone has gone offline or read-only). Make sure we don't fail the
* close(2) for user-space.
*/
if (zonefs_file_use_exp_open(inode, file))
zonefs_close_zone(inode);
if (zonefs_seq_file_need_wro(inode, file))
zonefs_seq_file_write_close(inode);
return 0;
}
@ -1310,7 +1373,7 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
struct zonefs_inode_info *zi = ZONEFS_I(inode);
int ret = 0;
int ret;
inode->i_ino = zone->start >> sbi->s_zone_sectors_shift;
inode->i_mode = S_IFREG | sbi->s_perm;
@ -1336,6 +1399,8 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits;
sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits;
mutex_lock(&zi->i_truncate_mutex);
/*
* For sequential zones, make sure that any open zone is closed first
* to ensure that the initial number of open zones is 0, in sync with
@ -1345,12 +1410,17 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
if (type == ZONEFS_ZTYPE_SEQ &&
(zone->cond == BLK_ZONE_COND_IMP_OPEN ||
zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
mutex_lock(&zi->i_truncate_mutex);
ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
mutex_unlock(&zi->i_truncate_mutex);
if (ret)
goto unlock;
}
return ret;
zonefs_account_active(inode);
unlock:
mutex_unlock(&zi->i_truncate_mutex);
return 0;
}
static struct dentry *zonefs_create_inode(struct dentry *parent,
@ -1687,14 +1757,18 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_gid = GLOBAL_ROOT_GID;
sbi->s_perm = 0640;
sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev);
atomic_set(&sbi->s_open_zones, 0);
if (!sbi->s_max_open_zones &&
atomic_set(&sbi->s_wro_seq_files, 0);
sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev);
if (!sbi->s_max_wro_seq_files &&
sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n");
sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
}
atomic_set(&sbi->s_active_seq_files, 0);
sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev);
ret = zonefs_read_super(sb);
if (ret)
return ret;
@ -1709,6 +1783,10 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
if (ret)
goto cleanup;
ret = zonefs_sysfs_register(sb);
if (ret)
goto cleanup;
zonefs_info(sb, "Mounting %u zones",
blkdev_nr_zones(sb->s_bdev->bd_disk));
@ -1754,6 +1832,8 @@ static void zonefs_kill_super(struct super_block *sb)
if (sb->s_root)
d_genocide(sb->s_root);
zonefs_sysfs_unregister(sb);
kill_block_super(sb);
kfree(sbi);
}
@ -1801,16 +1881,26 @@ static int __init zonefs_init(void)
return ret;
ret = register_filesystem(&zonefs_type);
if (ret) {
zonefs_destroy_inodecache();
return ret;
}
if (ret)
goto destroy_inodecache;
ret = zonefs_sysfs_init();
if (ret)
goto unregister_fs;
return 0;
unregister_fs:
unregister_filesystem(&zonefs_type);
destroy_inodecache:
zonefs_destroy_inodecache();
return ret;
}
static void __exit zonefs_exit(void)
{
zonefs_sysfs_exit();
zonefs_destroy_inodecache();
unregister_filesystem(&zonefs_type);
}

139
fs/zonefs/sysfs.c Normal file
View File

@ -0,0 +1,139 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Simple file system for zoned block devices exposing zones as files.
*
* Copyright (C) 2022 Western Digital Corporation or its affiliates.
*/
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/blkdev.h>
#include "zonefs.h"
/*
 * A zonefs sysfs attribute: the generic attribute descriptor paired with a
 * zonefs specific read handler that receives the super block information
 * instead of a kobject.
 */
struct zonefs_sysfs_attr {
/* Generic attribute; used with container_of() to get back to this struct. */
struct attribute attr;
/* Formats the attribute value into buf and returns a ssize_t result. */
ssize_t (*show)(struct zonefs_sb_info *sbi, char *buf);
};
/*
 * Convert a pointer to the embedded generic attribute into a pointer to the
 * zonefs_sysfs_attr that contains it.
 */
static inline struct zonefs_sysfs_attr *to_attr(struct attribute *attr)
{
	struct zonefs_sysfs_attr *zattr =
		container_of(attr, struct zonefs_sysfs_attr, attr);

	return zattr;
}
/*
 * Define the static attribute object zonefs_sysfs_attr_<name>, initialized
 * with __ATTR_RO(name). Each use in this file follows the definition of the
 * matching <name>_show() handler.
 */
#define ZONEFS_SYSFS_ATTR_RO(name) \
static struct zonefs_sysfs_attr zonefs_sysfs_attr_##name = __ATTR_RO(name)
/*
 * Expand to the address of the generic attribute embedded in
 * zonefs_sysfs_attr_<name>, for use as an attribute table entry.
 */
#define ATTR_LIST(name) &zonefs_sysfs_attr_##name.attr
/*
 * Generic sysfs show entry point: map the attribute back to its zonefs
 * wrapper and the kobject back to the super block information embedding it,
 * then hand over to the attribute specific handler. Returns 0 if the
 * attribute has no handler, otherwise whatever the handler returns.
 */
static ssize_t zonefs_sysfs_attr_show(struct kobject *kobj,
				      struct attribute *attr, char *buf)
{
	struct zonefs_sysfs_attr *zattr = to_attr(attr);
	struct zonefs_sb_info *sbi;

	/* Without a handler there is nothing to report. */
	if (!zattr->show)
		return 0;

	sbi = container_of(kobj, struct zonefs_sb_info, s_kobj);

	return zattr->show(sbi, buf);
}
/*
 * Show handler for the max_wro_seq_files attribute: print the value of
 * sbi->s_max_wro_seq_files as an unsigned decimal followed by a newline.
 */
static ssize_t max_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf)
{
	unsigned int max_wro = sbi->s_max_wro_seq_files;

	return sysfs_emit(buf, "%u\n", max_wro);
}
ZONEFS_SYSFS_ATTR_RO(max_wro_seq_files);
/*
 * Show handler for the nr_wro_seq_files attribute: print the current value
 * of the sbi->s_wro_seq_files atomic counter followed by a newline.
 */
static ssize_t nr_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf)
{
	int nr_wro = atomic_read(&sbi->s_wro_seq_files);

	return sysfs_emit(buf, "%d\n", nr_wro);
}
ZONEFS_SYSFS_ATTR_RO(nr_wro_seq_files);
/*
 * Show handler for the max_active_seq_files attribute: print the value of
 * sbi->s_max_active_seq_files as an unsigned decimal followed by a newline.
 */
static ssize_t max_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf)
{
	unsigned int max_active = sbi->s_max_active_seq_files;

	return sysfs_emit(buf, "%u\n", max_active);
}
ZONEFS_SYSFS_ATTR_RO(max_active_seq_files);
/*
 * Show handler for the nr_active_seq_files attribute: print the current
 * value of the sbi->s_active_seq_files atomic counter followed by a newline.
 */
static ssize_t nr_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf)
{
	int nr_active = atomic_read(&sbi->s_active_seq_files);

	return sysfs_emit(buf, "%d\n", nr_active);
}
ZONEFS_SYSFS_ATTR_RO(nr_active_seq_files);
/*
 * NULL terminated table of the attributes defined above, one entry per
 * ZONEFS_SYSFS_ATTR_RO() definition.
 */
static struct attribute *zonefs_sysfs_attrs[] = {
ATTR_LIST(max_wro_seq_files),
ATTR_LIST(nr_wro_seq_files),
ATTR_LIST(max_active_seq_files),
ATTR_LIST(nr_active_seq_files),
NULL,
};
/*
 * NOTE(review): presumably this is what provides the zonefs_sysfs_groups
 * symbol referenced by zonefs_sb_ktype - confirm against ATTRIBUTE_GROUPS().
 */
ATTRIBUTE_GROUPS(zonefs_sysfs);
/*
 * kobject release callback: signal s_kobj_unregister of the super block
 * information embedding the kobject, which is what the register error path
 * and zonefs_sysfs_unregister() wait on after dropping their reference.
 */
static void zonefs_sysfs_sb_release(struct kobject *kobj)
{
	complete(&container_of(kobj, struct zonefs_sb_info,
			       s_kobj)->s_kobj_unregister);
}
/*
 * sysfs operations for zonefs attributes. Only .show is set: no store
 * handler is provided.
 */
static const struct sysfs_ops zonefs_sysfs_attr_ops = {
.show = zonefs_sysfs_attr_show,
};
/*
 * kobject type used for the s_kobj embedded in struct zonefs_sb_info: it
 * ties together the attribute groups, the show dispatcher and the release
 * callback defined in this file.
 */
static struct kobj_type zonefs_sb_ktype = {
.default_groups = zonefs_sysfs_groups,
.sysfs_ops = &zonefs_sysfs_attr_ops,
.release = zonefs_sysfs_sb_release,
};
/*
 * Parent kobject of the per super block kobjects. Created with the name
 * "zonefs" under fs_kobj by zonefs_sysfs_init() and released and reset to
 * NULL by zonefs_sysfs_exit().
 */
static struct kobject *zonefs_sysfs_root;
/*
 * Create the sysfs entry of a super block: initialize and add the kobject
 * embedded in the super block information under zonefs_sysfs_root, named
 * after sb->s_id. On success, s_sysfs_registered is set and 0 is returned.
 * On failure, the kobject reference is dropped, the release callback is
 * waited for and the error from kobject_init_and_add() is returned.
 */
int zonefs_sysfs_register(struct super_block *sb)
{
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	int err;

	init_completion(&sbi->s_kobj_unregister);

	err = kobject_init_and_add(&sbi->s_kobj, &zonefs_sb_ktype,
				   zonefs_sysfs_root, "%s", sb->s_id);
	if (!err) {
		sbi->s_sysfs_registered = true;
		return 0;
	}

	/* Drop the reference and wait for the release callback to run. */
	kobject_put(&sbi->s_kobj);
	wait_for_completion(&sbi->s_kobj_unregister);

	return err;
}
/*
 * Remove the sysfs entry of a super block. Does nothing if the super block
 * has no zonefs information or if its kobject was never registered.
 * Otherwise, delete the kobject, drop its reference and wait for the release
 * callback to signal completion.
 */
void zonefs_sysfs_unregister(struct super_block *sb)
{
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);

	if (!sbi)
		return;

	if (!sbi->s_sysfs_registered)
		return;

	kobject_del(&sbi->s_kobj);
	kobject_put(&sbi->s_kobj);
	wait_for_completion(&sbi->s_kobj_unregister);
}
/*
 * Create the "zonefs" kobject under fs_kobj and keep it in zonefs_sysfs_root.
 * Returns 0 on success and -ENOMEM if the kobject could not be created.
 */
int __init zonefs_sysfs_init(void)
{
	zonefs_sysfs_root = kobject_create_and_add("zonefs", fs_kobj);

	return zonefs_sysfs_root ? 0 : -ENOMEM;
}
/*
 * Drop the reference on the kobject created by zonefs_sysfs_init() and
 * reset zonefs_sysfs_root to NULL.
 */
void zonefs_sysfs_exit(void)
{
	struct kobject *root = zonefs_sysfs_root;

	kobject_put(root);
	zonefs_sysfs_root = NULL;
}

View File

@ -12,6 +12,7 @@
#include <linux/uuid.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/kobject.h>
/*
* Maximum length of file names: this only needs to be large enough to fit
@ -39,6 +40,7 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone)
}
#define ZONEFS_ZONE_OPEN (1 << 0)
#define ZONEFS_ZONE_ACTIVE (1 << 1)
/*
* In-memory inode data.
@ -182,8 +184,15 @@ struct zonefs_sb_info {
loff_t s_blocks;
loff_t s_used_blocks;
unsigned int s_max_open_zones;
atomic_t s_open_zones;
unsigned int s_max_wro_seq_files;
atomic_t s_wro_seq_files;
unsigned int s_max_active_seq_files;
atomic_t s_active_seq_files;
bool s_sysfs_registered;
struct kobject s_kobj;
struct completion s_kobj_unregister;
};
static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)
@ -198,4 +207,9 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)
#define zonefs_warn(sb, format, args...) \
pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args)
int zonefs_sysfs_register(struct super_block *sb);
void zonefs_sysfs_unregister(struct super_block *sb);
int zonefs_sysfs_init(void);
void zonefs_sysfs_exit(void);
#endif