mirror of https://github.com/torvalds/linux.git
Fixes for 5.14-rc4:
* Fix a number of coordination bugs relating to cache flushes for metadata
  writeback, cache flushes for multi-buffer log writes, and FUA writes for
  single-buffer log writes.
* Fix a bug with incorrect replay of attr3 blocks.
* Fix unnecessary stalls when flushing logs to disk.
* Fix spoofing problems when recovering realtime bitmap blocks.

Merge tag 'xfs-5.14-fixes-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs fixes from Darrick Wong:
 "This contains a bunch of bug fixes in XFS. Dave and I have been busy
  the last couple of weeks finding and fixing as many log recovery bugs
  as we can; here are the results so far. Go fstests -g recoveryloop! ;)

   - Fix a number of coordination bugs relating to cache flushes for
     metadata writeback, cache flushes for multi-buffer log writes, and
     FUA writes for single-buffer log writes

   - Fix a bug with incorrect replay of attr3 blocks

   - Fix unnecessary stalls when flushing logs to disk

   - Fix spoofing problems when recovering realtime bitmap blocks"

* tag 'xfs-5.14-fixes-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  xfs: prevent spoofing of rtbitmap blocks when recovering buffers
  xfs: limit iclog tail updates
  xfs: need to see iclog flags in tracing
  xfs: Enforce attr3 buffer recovery order
  xfs: logging the on disk inode LSN can make it go backwards
  xfs: avoid unnecessary waits in xfs_log_force_lsn()
  xfs: log forces imply data device cache flushes
  xfs: factor out forced iclog flushes
  xfs: fix ordering violation between cache flushes and tail updates
  xfs: fold __xlog_state_release_iclog into xlog_state_release_iclog
  xfs: external logs need to flush data device
  xfs: flush data dev on external log write
This commit is contained in: commit aa6603266c
fs/xfs/libxfs/xfs_log_format.h
@@ -411,7 +411,16 @@ struct xfs_log_dinode {
 	/* start of the extended dinode, writable fields */
 	uint32_t	di_crc;		/* CRC of the inode */
 	uint64_t	di_changecount;	/* number of attribute changes */
-	xfs_lsn_t	di_lsn;		/* flush sequence */
+
+	/*
+	 * The LSN we write to this field during formatting is not a reflection
+	 * of the current on-disk LSN. It should never be used for recovery
+	 * sequencing, nor should it be recovered into the on-disk inode at all.
+	 * See xlog_recover_inode_commit_pass2() and xfs_log_dinode_to_disk()
+	 * for details.
+	 */
+	xfs_lsn_t	di_lsn;
+
 	uint64_t	di_flags2;	/* more random flags */
 	uint32_t	di_cowextsize;	/* basic cow extent size for file */
 	uint8_t		di_pad2[12];	/* more padding for future expansion */
fs/xfs/xfs_buf_item_recover.c
@@ -698,7 +698,8 @@ xlog_recover_do_inode_buffer(
 static xfs_lsn_t
 xlog_recover_get_buf_lsn(
 	struct xfs_mount	*mp,
-	struct xfs_buf		*bp)
+	struct xfs_buf		*bp,
+	struct xfs_buf_log_format *buf_f)
 {
 	uint32_t		magic32;
 	uint16_t		magic16;
@@ -706,11 +707,20 @@ xlog_recover_get_buf_lsn(
 	void			*blk = bp->b_addr;
 	uuid_t			*uuid;
 	xfs_lsn_t		lsn = -1;
+	uint16_t		blft;
 
 	/* v4 filesystems always recover immediately */
 	if (!xfs_sb_version_hascrc(&mp->m_sb))
 		goto recover_immediately;
 
+	/*
+	 * realtime bitmap and summary file blocks do not have magic numbers or
+	 * UUIDs, so we must recover them immediately.
+	 */
+	blft = xfs_blft_from_flags(buf_f);
+	if (blft == XFS_BLFT_RTBITMAP_BUF || blft == XFS_BLFT_RTSUMMARY_BUF)
+		goto recover_immediately;
+
 	magic32 = be32_to_cpu(*(__be32 *)blk);
 	switch (magic32) {
 	case XFS_ABTB_CRC_MAGIC:
@@ -796,6 +806,7 @@ xlog_recover_get_buf_lsn(
 	switch (magicda) {
 	case XFS_DIR3_LEAF1_MAGIC:
 	case XFS_DIR3_LEAFN_MAGIC:
+	case XFS_ATTR3_LEAF_MAGIC:
 	case XFS_DA3_NODE_MAGIC:
 		lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
 		uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
@@ -919,7 +930,7 @@ xlog_recover_buf_commit_pass2(
 	 * the verifier will be reset to match whatever recover turns that
 	 * buffer into.
 	 */
-	lsn = xlog_recover_get_buf_lsn(mp, bp);
+	lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f);
 	if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
 		trace_xfs_log_recover_buf_skip(log, buf_f);
 		xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
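The skip-or-replay decision above hinges on cracking an LSN into its cycle and block halves and comparing them against the LSN of the transaction being replayed; blocks with no trustworthy self-describing LSN (such as rtbitmap blocks) must always be replayed, otherwise a crafted "future" LSN could spoof recovery into skipping them. Below is a minimal userspace C sketch of that logic; names like lsn_cmp and should_replay are illustrative, not the kernel's, and the comparison is simplified from the kernel's XFS_LSN_CMP.

#include <stdint.h>
#include <stdio.h>

typedef int64_t xfs_lsn_t;

/* An LSN packs a log cycle number in the high 32 bits and a block
 * number in the low 32 bits. */
static uint32_t cycle_lsn(xfs_lsn_t lsn) { return (uint32_t)(lsn >> 32); }
static uint32_t block_lsn(xfs_lsn_t lsn) { return (uint32_t)lsn; }

/* Compare cycle first, then block: <0 if lsn1 is older than lsn2. */
static int lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
{
        if (cycle_lsn(lsn1) != cycle_lsn(lsn2))
                return cycle_lsn(lsn1) < cycle_lsn(lsn2) ? -1 : 1;
        if (block_lsn(lsn1) != block_lsn(lsn2))
                return block_lsn(lsn1) < block_lsn(lsn2) ? -1 : 1;
        return 0;
}

/*
 * Replay a buffer only if the LSN stamped in the on-disk block is older
 * than the LSN of the transaction being replayed.  A block with no
 * trustworthy LSN (modeled as 0 or -1, e.g. a rtbitmap block with no
 * self-describing header) must always be replayed.
 */
static int should_replay(xfs_lsn_t disk_lsn, xfs_lsn_t current_lsn)
{
        if (disk_lsn == 0 || disk_lsn == -1)
                return 1;               /* no LSN: recover immediately */
        return lsn_cmp(disk_lsn, current_lsn) < 0;
}

int main(void)
{
        xfs_lsn_t current_lsn = ((xfs_lsn_t)7 << 32) | 100;

        printf("older block:          %d\n",
               should_replay(((xfs_lsn_t)7 << 32) | 50, current_lsn));
        printf("spoofed future block: %d\n",
               should_replay(((xfs_lsn_t)9 << 32) | 1, current_lsn));
        printf("unstamped block:      %d\n",
               should_replay(-1, current_lsn));
        return 0;
}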
fs/xfs/xfs_inode_item_recover.c
@@ -145,7 +145,8 @@ xfs_log_dinode_to_disk_ts(
 STATIC void
 xfs_log_dinode_to_disk(
 	struct xfs_log_dinode	*from,
-	struct xfs_dinode	*to)
+	struct xfs_dinode	*to,
+	xfs_lsn_t		lsn)
 {
 	to->di_magic = cpu_to_be16(from->di_magic);
 	to->di_mode = cpu_to_be16(from->di_mode);
@@ -182,7 +183,7 @@ xfs_log_dinode_to_disk(
 	to->di_flags2 = cpu_to_be64(from->di_flags2);
 	to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
 	to->di_ino = cpu_to_be64(from->di_ino);
-	to->di_lsn = cpu_to_be64(from->di_lsn);
+	to->di_lsn = cpu_to_be64(lsn);
 	memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
 	uuid_copy(&to->di_uuid, &from->di_uuid);
 	to->di_flushiter = 0;
@@ -261,16 +262,25 @@ xlog_recover_inode_commit_pass2(
 	}
 
 	/*
-	 * If the inode has an LSN in it, recover the inode only if it's less
-	 * than the lsn of the transaction we are replaying. Note: we still
-	 * need to replay an owner change even though the inode is more recent
-	 * than the transaction as there is no guarantee that all the btree
-	 * blocks are more recent than this transaction, too.
+	 * If the inode has an LSN in it, recover the inode only if the on-disk
+	 * inode's LSN is older than the lsn of the transaction we are
+	 * replaying. We can have multiple checkpoints with the same start LSN,
+	 * so the current LSN being equal to the on-disk LSN doesn't necessarily
+	 * mean that the on-disk inode is more recent than the change being
+	 * replayed.
+	 *
+	 * We must check the current_lsn against the on-disk inode here because
+	 * we can't trust the log dinode to contain a valid LSN (see comment
+	 * below before replaying the log dinode for details).
+	 *
+	 * Note: we still need to replay an owner change even though the inode
+	 * is more recent than the transaction as there is no guarantee that all
+	 * the btree blocks are more recent than this transaction, too.
 	 */
 	if (dip->di_version >= 3) {
 		xfs_lsn_t	lsn = be64_to_cpu(dip->di_lsn);
 
-		if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+		if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) {
 			trace_xfs_log_recover_inode_skip(log, in_f);
 			error = 0;
 			goto out_owner_change;
@@ -368,8 +378,17 @@ xlog_recover_inode_commit_pass2(
 		goto out_release;
 	}
 
-	/* recover the log dinode inode into the on disk inode */
-	xfs_log_dinode_to_disk(ldip, dip);
+	/*
+	 * Recover the log dinode into the on disk inode.
+	 *
+	 * The LSN in the log dinode is garbage - it can be zero or reflect
+	 * stale in-memory runtime state that isn't coherent with the changes
+	 * logged in this transaction or the changes written to the on-disk
+	 * inode. Hence we write the current LSN into the inode because that
+	 * matches what xfs_iflush() would write into the inode when flushing
+	 * the changes in this transaction.
+	 */
+	xfs_log_dinode_to_disk(ldip, dip, current_lsn);
 
 	fields = in_f->ilf_fields;
 	if (fields & XFS_ILOG_DEV)
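The change from ">= 0" to "> 0" above is subtle: two checkpoints can share a start LSN, so equality no longer proves the on-disk inode is newer than the change being replayed. A small sketch of the two policies (illustrative userspace C with a flattened comparison, not kernel code):

#include <stdint.h>
#include <stdio.h>

typedef int64_t xfs_lsn_t;

/* 0 if equal, <0 if a is older, >0 if a is newer (flattened compare) */
static int lsn_cmp(xfs_lsn_t a, xfs_lsn_t b) { return (a > b) - (a < b); }

int main(void)
{
        /* Two checkpoints committed with the same start LSN. */
        xfs_lsn_t disk_lsn = 100;    /* stamped by replay of checkpoint A */
        xfs_lsn_t current_lsn = 100; /* now replaying checkpoint B */

        /* Old policy: equality skips replay, losing checkpoint B's changes. */
        printf("skip with >=: %d\n", lsn_cmp(disk_lsn, current_lsn) >= 0);

        /* New policy: only a strictly newer on-disk LSN skips replay. */
        printf("skip with >:  %d\n", lsn_cmp(disk_lsn, current_lsn) > 0);
        return 0;
}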
fs/xfs/xfs_log.c
@@ -78,13 +78,12 @@ xlog_verify_iclog(
 STATIC void
 xlog_verify_tail_lsn(
 	struct xlog		*log,
-	struct xlog_in_core	*iclog,
-	xfs_lsn_t		tail_lsn);
+	struct xlog_in_core	*iclog);
 #else
 #define xlog_verify_dest_ptr(a,b)
 #define xlog_verify_grant_tail(a)
 #define xlog_verify_iclog(a,b,c)
-#define xlog_verify_tail_lsn(a,b,c)
+#define xlog_verify_tail_lsn(a,b)
 #endif
 
 STATIC int
@@ -487,51 +486,80 @@ out_error:
 	return error;
 }
 
-static bool
-__xlog_state_release_iclog(
-	struct xlog		*log,
-	struct xlog_in_core	*iclog)
-{
-	lockdep_assert_held(&log->l_icloglock);
-
-	if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
-		/* update tail before writing to iclog */
-		xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
-
-		iclog->ic_state = XLOG_STATE_SYNCING;
-		iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
-		xlog_verify_tail_lsn(log, iclog, tail_lsn);
-		/* cycle incremented when incrementing curr_block */
-		trace_xlog_iclog_syncing(iclog, _RET_IP_);
-		return true;
-	}
-
-	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
-	return false;
-}
-
 /*
  * Flush iclog to disk if this is the last reference to the given iclog and
  * it is in the WANT_SYNC state.
+ *
+ * If the caller passes in a non-zero @old_tail_lsn and the current log tail
+ * does not match, there may be metadata on disk that must be persisted before
+ * this iclog is written. To satisfy that requirement, set the
+ * XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new
+ * log tail value.
+ *
+ * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the
+ * log tail is updated correctly. NEED_FUA indicates that the iclog will be
+ * written to stable storage, and implies that a commit record is contained
+ * within the iclog. We need to ensure that the log tail does not move beyond
+ * the tail that the first commit record in the iclog ordered against, otherwise
+ * correct recovery of that checkpoint becomes dependent on future operations
+ * performed on this iclog.
+ *
+ * Hence if NEED_FUA is set and the current iclog tail lsn is empty, write the
+ * current tail into iclog. Once the iclog tail is set, future operations must
+ * not modify it, otherwise they potentially violate ordering constraints for
+ * the checkpoint commit that wrote the initial tail lsn value. The tail lsn in
+ * the iclog will get zeroed on activation of the iclog after sync, so we
+ * always capture the tail lsn on the iclog on the first NEED_FUA release
+ * regardless of the number of active reference counts on this iclog.
  */
 int
 xlog_state_release_iclog(
 	struct xlog		*log,
-	struct xlog_in_core	*iclog)
+	struct xlog_in_core	*iclog,
+	xfs_lsn_t		old_tail_lsn)
 {
+	xfs_lsn_t		tail_lsn;
 	lockdep_assert_held(&log->l_icloglock);
 
 	trace_xlog_iclog_release(iclog, _RET_IP_);
 	if (iclog->ic_state == XLOG_STATE_IOERROR)
 		return -EIO;
 
-	if (atomic_dec_and_test(&iclog->ic_refcnt) &&
-	    __xlog_state_release_iclog(log, iclog)) {
-		spin_unlock(&log->l_icloglock);
-		xlog_sync(log, iclog);
-		spin_lock(&log->l_icloglock);
+	/*
+	 * Grabbing the current log tail needs to be atomic w.r.t. the writing
+	 * of the tail LSN into the iclog so we guarantee that the log tail does
+	 * not move between deciding if a cache flush is required and writing
+	 * the LSN into the iclog below.
+	 */
+	if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+		tail_lsn = xlog_assign_tail_lsn(log->l_mp);
+
+		if (old_tail_lsn && tail_lsn != old_tail_lsn)
+			iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+
+		if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) &&
+		    !iclog->ic_header.h_tail_lsn)
+			iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
 	}
 
+	if (!atomic_dec_and_test(&iclog->ic_refcnt))
+		return 0;
+
+	if (iclog->ic_state != XLOG_STATE_WANT_SYNC) {
+		ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+		return 0;
+	}
+
+	iclog->ic_state = XLOG_STATE_SYNCING;
+	if (!iclog->ic_header.h_tail_lsn)
+		iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+	xlog_verify_tail_lsn(log, iclog);
+	trace_xlog_iclog_syncing(iclog, _RET_IP_);
+
+	spin_unlock(&log->l_icloglock);
+	xlog_sync(log, iclog);
+	spin_lock(&log->l_icloglock);
	return 0;
 }
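The comment above describes a small protocol: the caller samples the log tail before issuing its pre-write cache flush, and the release path re-samples under l_icloglock; if the tail moved in between, the metadata that moved it may not be covered by the earlier flush, so this iclog must carry its own flush. A lock-free userspace sketch of the release-side check (the struct and function names here are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef int64_t xfs_lsn_t;

struct iclog_model {
        bool need_flush;        /* models XLOG_ICL_NEED_FLUSH */
};

/*
 * Called with the (modeled) icloglock held: compare the tail the caller
 * saw before its pre-write cache flush with the tail now.  If they
 * differ, metadata writeback completed in the window and the earlier
 * flush no longer covers it, so force a cache flush on this log write.
 */
static void release_check_tail(struct iclog_model *iclog,
                               xfs_lsn_t old_tail, xfs_lsn_t cur_tail)
{
        if (old_tail && cur_tail != old_tail)
                iclog->need_flush = true;
}

int main(void)
{
        struct iclog_model iclog = { .need_flush = false };

        /* Tail moved from 100 to 120 after the pre-flush was issued. */
        release_check_tail(&iclog, 100, 120);
        printf("need_flush after tail move: %d\n", iclog.need_flush);
        return 0;
}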
@@ -773,6 +801,21 @@ xfs_log_mount_cancel(
 	xfs_log_unmount(mp);
 }
 
+/*
+ * Flush out the iclog to disk ensuring that device caches are flushed and
+ * the iclog hits stable storage before any completion waiters are woken.
+ */
+static inline int
+xlog_force_iclog(
+	struct xlog_in_core	*iclog)
+{
+	atomic_inc(&iclog->ic_refcnt);
+	iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+	if (iclog->ic_state == XLOG_STATE_ACTIVE)
+		xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
+	return xlog_state_release_iclog(iclog->ic_log, iclog, 0);
+}
+
 /*
  * Wait for the iclog and all prior iclogs to be written to disk as required
  * by the log force state machine. Waiting on ic_force_wait ensures iclog
@@ -827,13 +870,6 @@ xlog_write_unmount_record(
 	/* account for space used by record data */
 	ticket->t_curr_res -= sizeof(ulf);
 
-	/*
-	 * For external log devices, we need to flush the data device cache
-	 * first to ensure all metadata writeback is on stable storage before we
-	 * stamp the tail LSN into the unmount record.
-	 */
-	if (log->l_targ != log->l_mp->m_ddev_targp)
-		blkdev_issue_flush(log->l_targ->bt_bdev);
 	return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS);
 }
 
@@ -865,18 +901,7 @@ out_err:
 
 	spin_lock(&log->l_icloglock);
 	iclog = log->l_iclog;
-	atomic_inc(&iclog->ic_refcnt);
-	if (iclog->ic_state == XLOG_STATE_ACTIVE)
-		xlog_state_switch_iclogs(log, iclog, 0);
-	else
-		ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
-		       iclog->ic_state == XLOG_STATE_IOERROR);
-	/*
-	 * Ensure the journal is fully flushed and on stable storage once the
-	 * iclog containing the unmount record is written.
-	 */
-	iclog->ic_flags |= (XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
-	error = xlog_state_release_iclog(log, iclog);
+	error = xlog_force_iclog(iclog);
 	xlog_wait_on_iclog(iclog);
 
 	if (tic) {
@@ -1796,10 +1821,20 @@ xlog_write_iclog(
 	 * metadata writeback and causing priority inversions.
 	 */
 	iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE;
-	if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH)
+	if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) {
 		iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
+		/*
+		 * For external log devices, we also need to flush the data
+		 * device cache first to ensure all metadata writeback covered
+		 * by the LSN in this iclog is on stable storage. This is slow,
+		 * but it *must* complete before we issue the external log IO.
+		 */
+		if (log->l_targ != log->l_mp->m_ddev_targp)
+			blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev);
+	}
 	if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
 		iclog->ic_bio.bi_opf |= REQ_FUA;
 
 	iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
 
 	if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
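The flag handling above composes the log write in three steps: PREFLUSH pushes prior device cache contents, an extra synchronous data-device flush covers the case where the log lives on a separate device (a PREFLUSH there only flushes the log device's cache), and FUA makes the log write itself durable. A hedged sketch of that decision table, modeling the block-layer flags as a plain C bitmask rather than using the kernel's bio API (all names here are illustrative):

#include <stdbool.h>
#include <stdio.h>

enum {
        OP_WRITE    = 1 << 0,
        OP_PREFLUSH = 1 << 1,   /* flush device cache before the write */
        OP_FUA      = 1 << 2,   /* write through to stable storage */
};

/*
 * Decide the op flags for a log write.  If the log sits on an external
 * device, a PREFLUSH there only covers the log device's cache, so the
 * data device must be flushed separately (and synchronously) first.
 */
static unsigned log_write_opf(bool need_flush, bool need_fua,
                              bool external_log, bool *flush_datadev_first)
{
        unsigned opf = OP_WRITE;

        *flush_datadev_first = false;
        if (need_flush) {
                opf |= OP_PREFLUSH;
                if (external_log)
                        *flush_datadev_first = true;  /* slow but required */
        }
        if (need_fua)
                opf |= OP_FUA;
        return opf;
}

int main(void)
{
        bool flush_data;
        unsigned opf = log_write_opf(true, true, true, &flush_data);

        printf("opf=%#x flush data device first: %d\n", opf, flush_data);
        return 0;
}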
@@ -2310,7 +2345,7 @@ xlog_write_copy_finish(
 		return 0;
 
 release_iclog:
-	error = xlog_state_release_iclog(log, iclog);
+	error = xlog_state_release_iclog(log, iclog, 0);
 	spin_unlock(&log->l_icloglock);
 	return error;
 }
@@ -2529,7 +2564,7 @@ next_lv:
 		ASSERT(optype & XLOG_COMMIT_TRANS);
 		*commit_iclog = iclog;
 	} else {
-		error = xlog_state_release_iclog(log, iclog);
+		error = xlog_state_release_iclog(log, iclog, 0);
 	}
 	spin_unlock(&log->l_icloglock);
 
@@ -2567,6 +2602,7 @@ xlog_state_activate_iclog(
 		memset(iclog->ic_header.h_cycle_data, 0,
 			sizeof(iclog->ic_header.h_cycle_data));
 		iclog->ic_header.h_lsn = 0;
+		iclog->ic_header.h_tail_lsn = 0;
 	}
 
 /*
@@ -2967,7 +3003,7 @@ restart:
 	 * reference to the iclog.
 	 */
 	if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
-		error = xlog_state_release_iclog(log, iclog);
+		error = xlog_state_release_iclog(log, iclog, 0);
 	spin_unlock(&log->l_icloglock);
 	if (error)
 		return error;
@@ -3131,6 +3167,35 @@ xlog_state_switch_iclogs(
 	log->l_iclog = iclog->ic_next;
 }
 
+/*
+ * Force the iclog to disk and check if the iclog has been completed before
+ * xlog_force_iclog() returns. This can happen on synchronous (e.g. pmem)
+ * or fast async storage because we drop the icloglock to issue the IO.
+ * If completion has already occurred, tell the caller so that it can avoid an
+ * unnecessary wait on the iclog.
+ */
+static int
+xlog_force_and_check_iclog(
+	struct xlog_in_core	*iclog,
+	bool			*completed)
+{
+	xfs_lsn_t		lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+	int			error;
+
+	*completed = false;
+	error = xlog_force_iclog(iclog);
+	if (error)
+		return error;
+
+	/*
+	 * If the iclog has already been completed and reused, the header LSN
+	 * will have been rewritten by completion.
+	 */
+	if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
+		*completed = true;
+	return 0;
+}
+
 /*
  * Write out all data in the in-core log as of this exact moment in time.
 *
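xlog_force_and_check_iclog() detects a completed-and-recycled iclog by re-reading the header LSN after the force: reactivation rewrites h_lsn, so a changed value means the write already finished and the caller can skip waiting. A compact userspace model of that detection (the names and the simulated completion are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef int64_t xfs_lsn_t;

struct iclog_model {
        xfs_lsn_t h_lsn;        /* rewritten when the iclog is recycled */
};

/* Models iclog completion plus reactivation for the next log cycle. */
static void complete_and_recycle(struct iclog_model *iclog, xfs_lsn_t next)
{
        iclog->h_lsn = next;
}

/*
 * Snapshot the header LSN, kick off the force (elided here), then
 * compare: a different LSN means completion already ran and the caller
 * can avoid an unnecessary wait on the iclog.
 */
static bool force_and_check(struct iclog_model *iclog)
{
        xfs_lsn_t lsn = iclog->h_lsn;

        /* ... issue the iclog write; the lock is dropped for the IO ... */
        complete_and_recycle(iclog, lsn + 1);   /* simulate fast completion */

        return iclog->h_lsn != lsn;             /* true: already completed */
}

int main(void)
{
        struct iclog_model iclog = { .h_lsn = 42 };

        printf("completed before return: %d\n", force_and_check(&iclog));
        return 0;
}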
@@ -3165,7 +3230,6 @@ xfs_log_force(
 {
 	struct xlog		*log = mp->m_log;
 	struct xlog_in_core	*iclog;
-	xfs_lsn_t		lsn;
 
 	XFS_STATS_INC(mp, xs_log_force);
 	trace_xfs_log_force(mp, 0, _RET_IP_);
@@ -3193,39 +3257,33 @@ xfs_log_force(
 		iclog = iclog->ic_prev;
 	} else if (iclog->ic_state == XLOG_STATE_ACTIVE) {
 		if (atomic_read(&iclog->ic_refcnt) == 0) {
-			/*
-			 * We are the only one with access to this iclog.
-			 *
-			 * Flush it out now. There should be a roundoff of zero
-			 * to show that someone has already taken care of the
-			 * roundoff from the previous sync.
-			 */
-			atomic_inc(&iclog->ic_refcnt);
-			lsn = be64_to_cpu(iclog->ic_header.h_lsn);
-			xlog_state_switch_iclogs(log, iclog, 0);
-			if (xlog_state_release_iclog(log, iclog))
+			/* We have exclusive access to this iclog. */
+			bool	completed;
+
+			if (xlog_force_and_check_iclog(iclog, &completed))
 				goto out_error;
 
-			if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
+			if (completed)
 				goto out_unlock;
 		} else {
 			/*
-			 * Someone else is writing to this iclog.
-			 *
-			 * Use its call to flush out the data. However, the
-			 * other thread may not force out this LR, so we mark
-			 * it WANT_SYNC.
+			 * Someone else is still writing to this iclog, so we
+			 * need to ensure that when they release the iclog it
+			 * gets synced immediately as we may be waiting on it.
 			 */
 			xlog_state_switch_iclogs(log, iclog, 0);
 		}
-	} else {
-		/*
-		 * If the head iclog is not active nor dirty, we just attach
-		 * ourselves to the head and go to sleep if necessary.
-		 */
-		;
 	}
 
+	/*
+	 * The iclog we are about to wait on may contain the checkpoint pushed
+	 * by the above xlog_cil_force() call, but it may not have been pushed
+	 * to disk yet. Like the ACTIVE case above, we need to make sure caches
+	 * are flushed when this iclog is written.
+	 */
+	if (iclog->ic_state == XLOG_STATE_WANT_SYNC)
+		iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+
 	if (flags & XFS_LOG_SYNC)
 		return xlog_wait_on_iclog(iclog);
 out_unlock:
@@ -3245,6 +3303,7 @@ xlog_force_lsn(
 	bool			already_slept)
 {
 	struct xlog_in_core	*iclog;
+	bool			completed;
 
 	spin_lock(&log->l_icloglock);
 	iclog = log->l_iclog;
@@ -3258,7 +3317,8 @@ xlog_force_lsn(
 		goto out_unlock;
 	}
 
-	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+	switch (iclog->ic_state) {
+	case XLOG_STATE_ACTIVE:
 		/*
 		 * We sleep here if we haven't already slept (e.g. this is the
 		 * first time we've looked at the correct iclog buf) and the
@@ -3281,12 +3341,31 @@ xlog_force_lsn(
 					&log->l_icloglock);
 			return -EAGAIN;
 		}
-		atomic_inc(&iclog->ic_refcnt);
-		xlog_state_switch_iclogs(log, iclog, 0);
-		if (xlog_state_release_iclog(log, iclog))
+		if (xlog_force_and_check_iclog(iclog, &completed))
 			goto out_error;
 		if (log_flushed)
 			*log_flushed = 1;
+		if (completed)
+			goto out_unlock;
+		break;
+	case XLOG_STATE_WANT_SYNC:
+		/*
+		 * This iclog may contain the checkpoint pushed by the
+		 * xlog_cil_force_seq() call, but there are other writers still
+		 * accessing it so it hasn't been pushed to disk yet. Like the
+		 * ACTIVE case above, we need to make sure caches are flushed
+		 * when this iclog is written.
+		 */
+		iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+		break;
+	default:
+		/*
+		 * The entire checkpoint was written by the CIL force and is on
+		 * its way to disk already. It will be stable when it
+		 * completes, so we don't need to manipulate caches here at all.
+		 * We just need to wait for completion if necessary.
+		 */
+		break;
+	}
 
 	if (flags & XFS_LOG_SYNC)
@@ -3559,10 +3638,10 @@ xlog_verify_grant_tail(
 STATIC void
 xlog_verify_tail_lsn(
 	struct xlog		*log,
-	struct xlog_in_core	*iclog,
-	xfs_lsn_t		tail_lsn)
+	struct xlog_in_core	*iclog)
 {
-	int	blocks;
+	xfs_lsn_t	tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn);
+	int		blocks;
 
 	if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
 		blocks =
fs/xfs/xfs_log_cil.c
@@ -654,8 +654,9 @@ xlog_cil_push_work(
 	struct xfs_trans_header thdr;
 	struct xfs_log_iovec	lhdr;
 	struct xfs_log_vec	lvhdr = { NULL };
+	xfs_lsn_t		preflush_tail_lsn;
 	xfs_lsn_t		commit_lsn;
-	xfs_lsn_t		push_seq;
+	xfs_csn_t		push_seq;
 	struct bio		bio;
 	DECLARE_COMPLETION_ONSTACK(bdev_flush);
 
@@ -730,7 +731,15 @@ xlog_cil_push_work(
 	 * because we hold the flush lock exclusively. Hence we can now issue
 	 * a cache flush to ensure all the completed metadata in the journal we
 	 * are about to overwrite is on stable storage.
+	 *
+	 * Because we are issuing this cache flush before we've written the
+	 * tail lsn to the iclog, we can have metadata IO completions move the
+	 * tail forwards between the completion of this flush and the iclog
+	 * being written. In this case, we need to re-issue the cache flush
+	 * before the iclog write. To detect whether the log tail moves, sample
+	 * the tail LSN *before* we issue the flush.
 	 */
+	preflush_tail_lsn = atomic64_read(&log->l_tail_lsn);
 	xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev,
 				&bdev_flush);
 
@@ -941,7 +950,7 @@ restart:
 	 * storage.
 	 */
 	commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA;
-	xlog_state_release_iclog(log, commit_iclog);
+	xlog_state_release_iclog(log, commit_iclog, preflush_tail_lsn);
 	spin_unlock(&log->l_icloglock);
 	return;
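This is the caller side of the protocol sketched after the xlog_state_release_iclog() hunk: the ordering requirement is that the tail is sampled before the async cache flush is issued, so a later mismatch reliably signals that the flush did not cover everything. A tiny sketch of that ordering, assuming a simplified single-threaded model where the shared tail is a plain variable rather than the kernel's atomic (names are illustrative):

#include <stdint.h>
#include <stdio.h>

typedef int64_t xfs_lsn_t;

/* Models the shared, concurrently-updated log tail (atomic in the kernel). */
static xfs_lsn_t log_tail_lsn = 100;

static void flush_data_device(void) { /* models xfs_flush_bdev_async() */ }

/*
 * Order matters: sample the tail *before* issuing the cache flush.  If
 * we sampled after, a tail move during the flush would be invisible and
 * the iclog write could rely on a flush that did not cover it.
 */
static xfs_lsn_t cil_push_preflush(void)
{
        xfs_lsn_t preflush_tail_lsn = log_tail_lsn;

        flush_data_device();
        return preflush_tail_lsn;       /* later passed to the release path */
}

int main(void)
{
        xfs_lsn_t sampled = cil_push_preflush();

        log_tail_lsn = 120;     /* metadata IO completion moves the tail */
        printf("sampled=%lld current=%lld -> reflush needed: %d\n",
               (long long)sampled, (long long)log_tail_lsn,
               sampled != log_tail_lsn);
        return 0;
}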
fs/xfs/xfs_log_priv.h
@@ -59,6 +59,16 @@ enum xlog_iclog_state {
 	{ XLOG_STATE_DIRTY,	"XLOG_STATE_DIRTY" }, \
 	{ XLOG_STATE_IOERROR,	"XLOG_STATE_IOERROR" }
 
+/*
+ * In core log flags
+ */
+#define XLOG_ICL_NEED_FLUSH	(1 << 0)	/* iclog needs REQ_PREFLUSH */
+#define XLOG_ICL_NEED_FUA	(1 << 1)	/* iclog needs REQ_FUA */
+
+#define XLOG_ICL_STRINGS \
+	{ XLOG_ICL_NEED_FLUSH,	"XLOG_ICL_NEED_FLUSH" }, \
+	{ XLOG_ICL_NEED_FUA,	"XLOG_ICL_NEED_FUA" }
+
 
 /*
  * Log ticket flags
@@ -143,9 +153,6 @@ enum xlog_iclog_state {
 
 #define XLOG_COVER_OPS		5
 
-#define XLOG_ICL_NEED_FLUSH	(1 << 0)	/* iclog needs REQ_PREFLUSH */
-#define XLOG_ICL_NEED_FUA	(1 << 1)	/* iclog needs REQ_FUA */
-
 /* Ticket reservation region accounting */
 #define XLOG_TIC_LEN_MAX	15
 
@@ -497,7 +504,8 @@ int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket,
 void	xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
 void	xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
 
-int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog);
+int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
+		xfs_lsn_t log_tail_lsn);
 
 /*
 * When we crack an atomic LSN, we sample it first so that the value will not
fs/xfs/xfs_trace.h
@@ -3944,6 +3944,7 @@ DECLARE_EVENT_CLASS(xlog_iclog_class,
 		__field(uint32_t, state)
 		__field(int32_t, refcount)
 		__field(uint32_t, offset)
+		__field(uint32_t, flags)
 		__field(unsigned long long, lsn)
 		__field(unsigned long, caller_ip)
 	),
@@ -3952,15 +3953,17 @@ DECLARE_EVENT_CLASS(xlog_iclog_class,
 		__entry->state = iclog->ic_state;
 		__entry->refcount = atomic_read(&iclog->ic_refcnt);
 		__entry->offset = iclog->ic_offset;
+		__entry->flags = iclog->ic_flags;
 		__entry->lsn = be64_to_cpu(iclog->ic_header.h_lsn);
 		__entry->caller_ip = caller_ip;
 	),
-	TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx caller %pS",
+	TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx flags %s caller %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __print_symbolic(__entry->state, XLOG_STATE_STRINGS),
 		  __entry->refcount,
 		  __entry->offset,
 		  __entry->lsn,
+		  __print_flags(__entry->flags, "|", XLOG_ICL_STRINGS),
 		  (char *)__entry->caller_ip)
 );