xfs: introduce new file range commit ioctls

This patch introduces two more new ioctls to manage atomic updates to
file contents -- XFS_IOC_START_COMMIT and XFS_IOC_COMMIT_RANGE.  The
commit mechanism here is exactly the same as what XFS_IOC_EXCHANGE_RANGE
does, but with the additional requirement that file2 cannot have changed
since some sampling point.  The start-commit ioctl performs the sampling
of file attributes.

Note: This patch currently samples i_ctime during START_COMMIT and
checks that it hasn't changed during COMMIT_RANGE.  This isn't entirely
safe in kernels prior to 6.12 because ctime only had coarse grained
granularity and very fast updates could collide with a COMMIT_RANGE.
With the multi-granularity ctime introduced by Jeff Layton, it's now
possible to update ctime such that this does not happen.

It is critical, then, that this patch must not be backported to any
kernel that does not support fine-grained file change timestamps.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Acked-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
This commit is contained in:
Darrick J. Wong 2024-08-30 15:36:47 -07:00
parent 431c1646e1
commit 398597c3ef
5 changed files with 243 additions and 3 deletions

View File

@ -825,6 +825,30 @@ struct xfs_exchange_range {
__u64 flags; /* see XFS_EXCHANGE_RANGE_* below */
};
/*
* Using the same definition of file2 as struct xfs_exchange_range, commit the
* contents of file1 into file2 if file2 has the same inode number, mtime, and
* ctime as the arguments provided to the call. The old contents of file2 will
* be moved to file1.
*
* Returns -EBUSY if there isn't an exact match for the file2 fields.
*
* Filesystems must be able to restart and complete the operation even after
* the system goes down.
*/
struct xfs_commit_range {
__s32 file1_fd;
__u32 pad; /* must be zeroes */
__u64 file1_offset; /* file1 offset, bytes */
__u64 file2_offset; /* file2 offset, bytes */
__u64 length; /* bytes to exchange */
__u64 flags; /* see XFS_EXCHANGE_RANGE_* below */
/* opaque file2 metadata for freshness checks */
__u64 file2_freshness[6];
};
/*
* Exchange file data all the way to the ends of both files, and then exchange
* the file sizes. This flag can be used to replace a file's contents with a
@ -997,6 +1021,8 @@ struct xfs_getparents_by_handle {
#define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req)
#define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req)
#define XFS_IOC_EXCHANGE_RANGE _IOW ('X', 129, struct xfs_exchange_range)
#define XFS_IOC_START_COMMIT _IOR ('X', 130, struct xfs_commit_range)
#define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */

View File

@ -72,6 +72,34 @@ xfs_exchrange_estimate(
return error;
}
/*
* Check that file2's metadata agree with the snapshot that we took for the
* range commit request.
*
* This should be called after the filesystem has locked /all/ inode metadata
* against modification.
*/
STATIC int
xfs_exchrange_check_freshness(
const struct xfs_exchrange *fxr,
struct xfs_inode *ip2)
{
struct inode *inode2 = VFS_I(ip2);
struct timespec64 ctime = inode_get_ctime(inode2);
struct timespec64 mtime = inode_get_mtime(inode2);
trace_xfs_exchrange_freshness(fxr, ip2);
/* Check that file2 hasn't otherwise been modified. */
if (fxr->file2_ino != ip2->i_ino ||
fxr->file2_gen != inode2->i_generation ||
!timespec64_equal(&fxr->file2_ctime, &ctime) ||
!timespec64_equal(&fxr->file2_mtime, &mtime))
return -EBUSY;
return 0;
}
#define QRETRY_IP1 (0x1)
#define QRETRY_IP2 (0x2)
@ -607,6 +635,12 @@ xfs_exchrange_prep(
if (error || fxr->length == 0)
return error;
if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) {
error = xfs_exchrange_check_freshness(fxr, ip2);
if (error)
return error;
}
/* Attach dquots to both inodes before changing block maps. */
error = xfs_qm_dqattach(ip2);
if (error)
@ -719,7 +753,8 @@ xfs_exchange_range(
if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
return -EXDEV;
if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
__XFS_EXCHANGE_RANGE_CHECK_FRESH2))
return -EINVAL;
/* Userspace requests only honored for regular files. */
@ -802,3 +837,109 @@ xfs_ioc_exchange_range(
fdput(file1);
return error;
}
/* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */
struct xfs_commit_range_fresh {
xfs_fsid_t fsid; /* m_fixedfsid */
__u64 file2_ino; /* inode number */
__s64 file2_mtime; /* modification time */
__s64 file2_ctime; /* change time */
__s32 file2_mtime_nsec; /* mod time, nsec */
__s32 file2_ctime_nsec; /* change time, nsec */
__u32 file2_gen; /* inode generation */
__u32 magic; /* zero */
};
#define XCR_FRESH_MAGIC 0x444F524B /* DORK */
/* Set up a commitrange operation by sampling file2's write-related attrs */
long
xfs_ioc_start_commit(
struct file *file,
struct xfs_commit_range __user *argp)
{
struct xfs_commit_range args = { };
struct timespec64 ts;
struct xfs_commit_range_fresh *kern_f;
struct xfs_commit_range_fresh __user *user_f;
struct inode *inode2 = file_inode(file);
struct xfs_inode *ip2 = XFS_I(inode2);
const unsigned int lockflags = XFS_IOLOCK_SHARED |
XFS_MMAPLOCK_SHARED |
XFS_ILOCK_SHARED;
BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) !=
sizeof(args.file2_freshness));
kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
xfs_ilock(ip2, lockflags);
ts = inode_get_ctime(inode2);
kern_f->file2_ctime = ts.tv_sec;
kern_f->file2_ctime_nsec = ts.tv_nsec;
ts = inode_get_mtime(inode2);
kern_f->file2_mtime = ts.tv_sec;
kern_f->file2_mtime_nsec = ts.tv_nsec;
kern_f->file2_ino = ip2->i_ino;
kern_f->file2_gen = inode2->i_generation;
kern_f->magic = XCR_FRESH_MAGIC;
xfs_iunlock(ip2, lockflags);
user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness;
if (copy_to_user(user_f, kern_f, sizeof(*kern_f)))
return -EFAULT;
return 0;
}
/*
* Exchange file1 and file2 contents if file2 has not been written since the
* start commit operation.
*/
long
xfs_ioc_commit_range(
struct file *file,
struct xfs_commit_range __user *argp)
{
struct xfs_exchrange fxr = {
.file2 = file,
};
struct xfs_commit_range args;
struct xfs_commit_range_fresh *kern_f;
struct xfs_inode *ip2 = XFS_I(file_inode(file));
struct xfs_mount *mp = ip2->i_mount;
struct fd file1;
int error;
kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
if (copy_from_user(&args, argp, sizeof(args)))
return -EFAULT;
if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
return -EINVAL;
if (kern_f->magic != XCR_FRESH_MAGIC)
return -EBUSY;
if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)))
return -EBUSY;
fxr.file1_offset = args.file1_offset;
fxr.file2_offset = args.file2_offset;
fxr.length = args.length;
fxr.flags = args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2;
fxr.file2_ino = kern_f->file2_ino;
fxr.file2_gen = kern_f->file2_gen;
fxr.file2_mtime.tv_sec = kern_f->file2_mtime;
fxr.file2_mtime.tv_nsec = kern_f->file2_mtime_nsec;
fxr.file2_ctime.tv_sec = kern_f->file2_ctime;
fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec;
file1 = fdget(args.file1_fd);
if (!file1.file)
return -EBADF;
fxr.file1 = file1.file;
error = xfs_exchange_range(&fxr);
fdput(file1);
return error;
}

View File

@ -10,8 +10,12 @@
#define __XFS_EXCHANGE_RANGE_UPD_CMTIME1 (1ULL << 63)
#define __XFS_EXCHANGE_RANGE_UPD_CMTIME2 (1ULL << 62)
/* Freshness check required */
#define __XFS_EXCHANGE_RANGE_CHECK_FRESH2 (1ULL << 61)
#define XFS_EXCHANGE_RANGE_PRIV_FLAGS (__XFS_EXCHANGE_RANGE_UPD_CMTIME1 | \
__XFS_EXCHANGE_RANGE_UPD_CMTIME2)
__XFS_EXCHANGE_RANGE_UPD_CMTIME2 | \
__XFS_EXCHANGE_RANGE_CHECK_FRESH2)
struct xfs_exchrange {
struct file *file1;
@ -22,10 +26,20 @@ struct xfs_exchrange {
u64 length;
u64 flags; /* XFS_EXCHANGE_RANGE flags */
/* file2 metadata for freshness checks */
u64 file2_ino;
struct timespec64 file2_mtime;
struct timespec64 file2_ctime;
u32 file2_gen;
};
long xfs_ioc_exchange_range(struct file *file,
struct xfs_exchange_range __user *argp);
long xfs_ioc_start_commit(struct file *file,
struct xfs_commit_range __user *argp);
long xfs_ioc_commit_range(struct file *file,
struct xfs_commit_range __user *argp);
struct xfs_exchmaps_req;

View File

@ -1518,6 +1518,10 @@ xfs_file_ioctl(
case XFS_IOC_EXCHANGE_RANGE:
return xfs_ioc_exchange_range(filp, arg);
case XFS_IOC_START_COMMIT:
return xfs_ioc_start_commit(filp, arg);
case XFS_IOC_COMMIT_RANGE:
return xfs_ioc_commit_range(filp, arg);
default:
return -ENOTTY;

View File

@ -4926,7 +4926,8 @@ DEFINE_INODE_ERROR_EVENT(xfs_exchrange_error);
{ XFS_EXCHANGE_RANGE_DRY_RUN, "DRY_RUN" }, \
{ XFS_EXCHANGE_RANGE_FILE1_WRITTEN, "F1_WRITTEN" }, \
{ __XFS_EXCHANGE_RANGE_UPD_CMTIME1, "CMTIME1" }, \
{ __XFS_EXCHANGE_RANGE_UPD_CMTIME2, "CMTIME2" }
{ __XFS_EXCHANGE_RANGE_UPD_CMTIME2, "CMTIME2" }, \
{ __XFS_EXCHANGE_RANGE_CHECK_FRESH2, "FRESH2" }
/* file exchange-range tracepoint class */
DECLARE_EVENT_CLASS(xfs_exchrange_class,
@ -4986,6 +4987,60 @@ DEFINE_EXCHRANGE_EVENT(xfs_exchrange_prep);
DEFINE_EXCHRANGE_EVENT(xfs_exchrange_flush);
DEFINE_EXCHRANGE_EVENT(xfs_exchrange_mappings);
TRACE_EVENT(xfs_exchrange_freshness,
TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip2),
TP_ARGS(fxr, ip2),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ip2_ino)
__field(long long, ip2_mtime)
__field(long long, ip2_ctime)
__field(int, ip2_mtime_nsec)
__field(int, ip2_ctime_nsec)
__field(xfs_ino_t, file2_ino)
__field(long long, file2_mtime)
__field(long long, file2_ctime)
__field(int, file2_mtime_nsec)
__field(int, file2_ctime_nsec)
),
TP_fast_assign(
struct timespec64 ts64;
struct inode *inode2 = VFS_I(ip2);
__entry->dev = inode2->i_sb->s_dev;
__entry->ip2_ino = ip2->i_ino;
ts64 = inode_get_ctime(inode2);
__entry->ip2_ctime = ts64.tv_sec;
__entry->ip2_ctime_nsec = ts64.tv_nsec;
ts64 = inode_get_mtime(inode2);
__entry->ip2_mtime = ts64.tv_sec;
__entry->ip2_mtime_nsec = ts64.tv_nsec;
__entry->file2_ino = fxr->file2_ino;
__entry->file2_mtime = fxr->file2_mtime.tv_sec;
__entry->file2_ctime = fxr->file2_ctime.tv_sec;
__entry->file2_mtime_nsec = fxr->file2_mtime.tv_nsec;
__entry->file2_ctime_nsec = fxr->file2_ctime.tv_nsec;
),
TP_printk("dev %d:%d "
"ino 0x%llx mtime %lld:%d ctime %lld:%d -> "
"file 0x%llx mtime %lld:%d ctime %lld:%d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ip2_ino,
__entry->ip2_mtime,
__entry->ip2_mtime_nsec,
__entry->ip2_ctime,
__entry->ip2_ctime_nsec,
__entry->file2_ino,
__entry->file2_mtime,
__entry->file2_mtime_nsec,
__entry->file2_ctime,
__entry->file2_ctime_nsec)
);
TRACE_EVENT(xfs_exchmaps_overhead,
TP_PROTO(struct xfs_mount *mp, unsigned long long bmbt_blocks,
unsigned long long rmapbt_blocks),