From 33c7a2bc48a81fa714572f8ce29f29bc17e6faf0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:55:59 +1100 Subject: [PATCH 01/78] xfs: xfs_syncd_stop must die xfs_syncd_init and xfs_syncd_stop tie a bunch of unrelated functionality together that actually have different start and stop requirements. Kill these functions and open code the start/stop methods for each of the background functions. Subsequent patches will move the start/stop functions around to the correct places to avoid races and shutdown issues. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 25 ++++++++++++++++++------- fs/xfs/xfs_sync.c | 30 ++++-------------------------- fs/xfs/xfs_sync.h | 6 ++++-- 3 files changed, 26 insertions(+), 35 deletions(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 26a09bd7f975..37d1bbce047d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1008,7 +1008,11 @@ xfs_fs_put_super( xfs_filestream_unmount(mp); cancel_delayed_work_sync(&mp->m_sync_work); xfs_unmountfs(mp); - xfs_syncd_stop(mp); + + cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_reclaim_work); + cancel_work_sync(&mp->m_flush_work); + xfs_freesb(mp); xfs_icsb_destroy_counters(mp); xfs_destroy_mount_workqueues(mp); @@ -1384,9 +1388,11 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); - error = xfs_syncd_init(mp); - if (error) - goto out_filestream_unmount; + INIT_WORK(&mp->m_flush_work, xfs_flush_worker); + INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); + INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + + xfs_syncd_queue_sync(mp); error = xfs_mountfs(mp); if (error) @@ -1409,8 +1415,10 @@ xfs_fs_fill_super( return 0; out_syncd_stop: - xfs_syncd_stop(mp); - out_filestream_unmount: + cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_reclaim_work); + cancel_work_sync(&mp->m_flush_work); + 
xfs_filestream_unmount(mp); out_free_sb: xfs_freesb(mp); @@ -1429,7 +1437,10 @@ out_destroy_workqueues: out_unmount: xfs_filestream_unmount(mp); xfs_unmountfs(mp); - xfs_syncd_stop(mp); + + cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_reclaim_work); + cancel_work_sync(&mp->m_flush_work); goto out_free_sb; } diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 9500caf15acf..7502f0621fb9 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -370,7 +370,7 @@ xfs_quiesce_attr( xfs_buf_unlock(mp->m_sb_bp); } -static void +void xfs_syncd_queue_sync( struct xfs_mount *mp) { @@ -383,7 +383,7 @@ xfs_syncd_queue_sync( * disk quotas. We might need to cover the log to indicate that the * filesystem is idle and not frozen. */ -STATIC void +void xfs_sync_worker( struct work_struct *work) { @@ -445,7 +445,7 @@ xfs_syncd_queue_reclaim( * goes low. It scans as quickly as possible avoiding locked inodes or those * already being flushed, and once done schedules a future pass. 
*/ -STATIC void +void xfs_reclaim_worker( struct work_struct *work) { @@ -478,7 +478,7 @@ xfs_flush_inodes( flush_work(&mp->m_flush_work); } -STATIC void +void xfs_flush_worker( struct work_struct *work) { @@ -489,28 +489,6 @@ xfs_flush_worker( xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); } -int -xfs_syncd_init( - struct xfs_mount *mp) -{ - INIT_WORK(&mp->m_flush_work, xfs_flush_worker); - INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); - INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - - xfs_syncd_queue_sync(mp); - - return 0; -} - -void -xfs_syncd_stop( - struct xfs_mount *mp) -{ - cancel_delayed_work_sync(&mp->m_sync_work); - cancel_delayed_work_sync(&mp->m_reclaim_work); - cancel_work_sync(&mp->m_flush_work); -} - void __xfs_inode_set_reclaim_tag( struct xfs_perag *pag, diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 941202e7ac6e..3f59e5bed66b 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -26,8 +26,10 @@ struct xfs_perag; extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ -int xfs_syncd_init(struct xfs_mount *mp); -void xfs_syncd_stop(struct xfs_mount *mp); +void xfs_syncd_queue_sync(struct xfs_mount *mp); +void xfs_sync_worker(struct work_struct *work); +void xfs_flush_worker(struct work_struct *work); +void xfs_reclaim_worker(struct work_struct *work); int xfs_quiesce_data(struct xfs_mount *mp); void xfs_quiesce_attr(struct xfs_mount *mp); From 7e18530bef6a18a5479690ae7e8256319ecf1300 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:00 +1100 Subject: [PATCH 02/78] xfs: rationalise xfs_mount_wq users Instead of starting and stopping background work on the xfs_mount_wq all at the same time, separate them to where they really are needed to start and stop. The xfs_sync_worker, only needs to be started after all the mount processing has completed successfully, while it needs to be stopped before the log is unmounted. 
The xfs_reclaim_worker is started on demand, and can be stopped before the unmount process does its own inode reclaim pass. The xfs_flush_inodes work is run on demand, and so we really only need to ensure that it has stopped running before we start processing an unmount, freeze or remount,ro. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_mount.c | 6 ++++-- fs/xfs/xfs_super.c | 36 +++++++++++++++--------------------- fs/xfs/xfs_sync.c | 21 +++++---------------- 3 files changed, 24 insertions(+), 39 deletions(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b2bd3a0e6376..d9a31c6a0c53 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1450,9 +1450,11 @@ xfs_unmountfs( /* * And reclaim all inodes. At this point there should be no dirty - * inode, and none should be pinned or locked, but use synchronous - * reclaim just to be sure. + * inodes and none should be pinned or locked, but use synchronous + * reclaim just to be sure. We can stop background inode reclaim + * here as well if it is still running. 
*/ + cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_qm_unmount(mp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 37d1bbce047d..9805cac81fc9 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1005,14 +1005,12 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - xfs_filestream_unmount(mp); cancel_delayed_work_sync(&mp->m_sync_work); - xfs_unmountfs(mp); - - cancel_delayed_work_sync(&mp->m_sync_work); - cancel_delayed_work_sync(&mp->m_reclaim_work); cancel_work_sync(&mp->m_flush_work); + xfs_filestream_unmount(mp); + xfs_unmountfs(mp); + xfs_freesb(mp); xfs_icsb_destroy_counters(mp); xfs_destroy_mount_workqueues(mp); @@ -1325,6 +1323,9 @@ xfs_fs_fill_super( spin_lock_init(&mp->m_sb_lock); mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); + INIT_WORK(&mp->m_flush_work, xfs_flush_worker); + INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); + INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); mp->m_super = sb; sb->s_fs_info = mp; @@ -1388,15 +1389,9 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); - INIT_WORK(&mp->m_flush_work, xfs_flush_worker); - INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); - INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - - xfs_syncd_queue_sync(mp); - error = xfs_mountfs(mp); if (error) - goto out_syncd_stop; + goto out_filestream_unmount; root = igrab(VFS_I(mp->m_rootip)); if (!root) { @@ -1413,12 +1408,15 @@ xfs_fs_fill_super( goto out_unmount; } - return 0; - out_syncd_stop: - cancel_delayed_work_sync(&mp->m_sync_work); - cancel_delayed_work_sync(&mp->m_reclaim_work); - cancel_work_sync(&mp->m_flush_work); + /* + * The filesystem is successfully mounted, so we can start background + * sync work now. 
+ */ + xfs_syncd_queue_sync(mp); + return 0; + + out_filestream_unmount: xfs_filestream_unmount(mp); out_free_sb: xfs_freesb(mp); @@ -1437,10 +1435,6 @@ out_destroy_workqueues: out_unmount: xfs_filestream_unmount(mp); xfs_unmountfs(mp); - - cancel_delayed_work_sync(&mp->m_sync_work); - cancel_delayed_work_sync(&mp->m_reclaim_work); - cancel_work_sync(&mp->m_flush_work); goto out_free_sb; } diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 7502f0621fb9..a68761696ab5 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -379,9 +379,9 @@ xfs_syncd_queue_sync( } /* - * Every sync period we need to unpin all items, reclaim inodes and sync - * disk quotas. We might need to cover the log to indicate that the - * filesystem is idle and not frozen. + * Every sync period we need to unpin all items in the AIL and push them to + * disk. If there is nothing dirty, then we might need to cover the log to + * indicate that the filesystem is idle and not frozen. */ void xfs_sync_worker( @@ -391,17 +391,7 @@ xfs_sync_worker( struct xfs_mount, m_sync_work); int error; - /* - * We shouldn't write/force the log if we are in the mount/unmount - * process or on a read only filesystem. The workqueue still needs to be - * active in both cases, however, because it is used for inode reclaim - * during these times. Use the MS_ACTIVE flag to avoid doing anything - * during mount. Doing work during unmount is avoided by calling - * cancel_delayed_work_sync on this work queue before tearing down - * the ail and the log in xfs_log_unmount. 
- */ - if (!(mp->m_super->s_flags & MS_ACTIVE) && - !(mp->m_flags & XFS_MOUNT_RDONLY)) { + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { /* dgc: errors ignored here */ if (mp->m_super->s_writers.frozen == SB_UNFROZEN && xfs_log_need_covered(mp)) @@ -409,8 +399,7 @@ xfs_sync_worker( else xfs_log_force(mp, 0); - /* start pushing all the metadata that is currently - * dirty */ + /* start pushing all the metadata that is currently dirty */ xfs_ail_push_all(mp->m_ail); } From 7f7bebefba152c5bdfe961cd2e97e8695a32998c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:01 +1100 Subject: [PATCH 03/78] xfs: don't run the sync work if the filesystem is read-only If the filesystem is mounted or remounted read-only, stop the sync worker that tries to flush or cover the log if the filesystem is dirty. It's read-only, so it isn't dirty. Restart it on a remount,rw as necessary. This avoids the need for RO checks in the work. Similarly, stop the sync work when the filesystem is frozen, and start it again when the filesystem is thawed. This avoids the need for special freeze checks in the work. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 2 ++ fs/xfs/xfs_sync.c | 29 ++++++++++++++++------------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 9805cac81fc9..20fa955d80d1 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1200,6 +1200,7 @@ xfs_fs_remount( * value if it is non-zero, otherwise go with the default. 
*/ xfs_restore_resvblks(mp); + xfs_syncd_queue_sync(mp); } /* rw -> ro */ @@ -1245,6 +1246,7 @@ xfs_fs_unfreeze( struct xfs_mount *mp = XFS_M(sb); xfs_restore_resvblks(mp); + xfs_syncd_queue_sync(mp); return 0; } diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index a68761696ab5..e898d1807044 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -323,6 +323,9 @@ xfs_quiesce_data( * Second stage of a quiesce. The data is already synced, now we have to take * care of the metadata. New transactions are already blocked, so we need to * wait for any remaining transactions to drain out before proceeding. + * + * Note: this stops background sync work - the callers must ensure it is started + * again when appropriate. */ void xfs_quiesce_attr( @@ -341,6 +344,9 @@ xfs_quiesce_attr( /* flush all pending changes from the AIL */ xfs_ail_push_all_sync(mp->m_ail); + /* stop background sync work */ + cancel_delayed_work_sync(&mp->m_sync_work); + /* * Just warn here till VFS can correctly support * read-only remount without racing. @@ -379,9 +385,8 @@ xfs_syncd_queue_sync( } /* - * Every sync period we need to unpin all items in the AIL and push them to - * disk. If there is nothing dirty, then we might need to cover the log to - * indicate that the filesystem is idle and not frozen. + * Every sync period we need to push dirty metadata and try to cover the log + * to indicate the filesystem is idle and not frozen. 
*/ void xfs_sync_worker( @@ -391,17 +396,15 @@ xfs_sync_worker( struct xfs_mount, m_sync_work); int error; - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - /* dgc: errors ignored here */ - if (mp->m_super->s_writers.frozen == SB_UNFROZEN && - xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp); - else - xfs_log_force(mp, 0); + /* dgc: errors ignored here */ + if (mp->m_super->s_writers.frozen == SB_UNFROZEN && + xfs_log_need_covered(mp)) + error = xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); - /* start pushing all the metadata that is currently dirty */ - xfs_ail_push_all(mp->m_ail); - } + /* start pushing all the metadata that is currently dirty */ + xfs_ail_push_all(mp->m_ail); /* queue us up again */ xfs_syncd_queue_sync(mp); From f661f1e0bf5002bdcc8b5810ad0a184a1841537f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:02 +1100 Subject: [PATCH 04/78] xfs: sync work is now only periodic log work The only thing the periodic sync work does now is flush the AIL and idle the log. These are really functions of the log code, so move the work to xfs_log.c and rename it appropriately. The only wart that this leaves behind is the xfssyncd_centisecs sysctl, otherwise the xfssyncd is dead. Clean up any comments that relate to xfssyncd to reflect its passing. 
Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 61 ++++++++++++++++++++++++++++++++++++------- fs/xfs/xfs_log.h | 3 +++ fs/xfs/xfs_log_priv.h | 1 + fs/xfs/xfs_mount.h | 1 - fs/xfs/xfs_super.c | 16 +++--------- fs/xfs/xfs_sync.c | 39 +++------------------------ fs/xfs/xfs_sync.h | 2 -- 7 files changed, 62 insertions(+), 61 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7f4f9370d0e7..efea12bfbd6b 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -34,6 +34,7 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_trace.h" +#include "xfs_fsops.h" kmem_zone_t *xfs_log_ticket_zone; @@ -679,25 +680,29 @@ out: } /* - * Finish the recovery of the file system. This is separate from - * the xfs_log_mount() call, because it depends on the code in - * xfs_mountfs() to read in the root and real-time bitmap inodes - * between calling xfs_log_mount() and here. + * Finish the recovery of the file system. This is separate from the + * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read + * in the root and real-time bitmap inodes between calling xfs_log_mount() and + * here. * - * mp - ubiquitous xfs mount point structure + * If we finish recovery successfully, start the background log work. If we are + * not doing recovery, then we have a RO filesystem and we don't need to start + * it. 
*/ int xfs_log_mount_finish(xfs_mount_t *mp) { - int error; + int error = 0; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { error = xlog_recover_finish(mp->m_log); - else { - error = 0; + if (!error) + xfs_log_work_queue(mp); + } else { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } + return error; } @@ -858,7 +863,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) void xfs_log_unmount(xfs_mount_t *mp) { - cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_log->l_work); xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); } @@ -1161,6 +1166,40 @@ done: } /* xlog_get_iclog_buffer_size */ +void +xfs_log_work_queue( + struct xfs_mount *mp) +{ + queue_delayed_work(xfs_syncd_wq, &mp->m_log->l_work, + msecs_to_jiffies(xfs_syncd_centisecs * 10)); +} + +/* + * Every sync period we need to unpin all items in the AIL and push them to + * disk. If there is nothing dirty, then we might need to cover the log to + * indicate that the filesystem is idle. + */ +void +xfs_log_worker( + struct work_struct *work) +{ + struct xlog *log = container_of(to_delayed_work(work), + struct xlog, l_work); + struct xfs_mount *mp = log->l_mp; + + /* dgc: errors ignored - not fatal and nowhere to report them */ + if (xfs_log_need_covered(mp)) + xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + + /* start pushing all the metadata that is currently dirty */ + xfs_ail_push_all(mp->m_ail); + + /* queue us up again */ + xfs_log_work_queue(mp); +} + /* * This routine initializes some of the log structure for a given mount point. * Its primary purpose is to fill in enough, so recovery can occur. 
However, @@ -1195,6 +1234,7 @@ xlog_alloc_log( log->l_logBBsize = num_bblks; log->l_covered_state = XLOG_STATE_COVER_IDLE; log->l_flags |= XLOG_ACTIVE_RECOVERY; + INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); log->l_prev_block = -1; /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ @@ -3700,3 +3740,4 @@ xlog_iclogs_empty( } while (iclog != log->l_iclog); return 1; } + diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 748d312850e2..26ed7de352d7 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -181,5 +181,8 @@ int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, int flags); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); +void xfs_log_work_queue(struct xfs_mount *mp); +void xfs_log_worker(struct work_struct *work); + #endif #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 18a801d76a42..9a4e0e5ec322 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -495,6 +495,7 @@ struct xlog { struct xfs_buf *l_xbuf; /* extra buffer for log * wrapping */ struct xfs_buftarg *l_targ; /* buftarg of log */ + struct delayed_work l_work; /* background flush work */ uint l_flags; uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ struct list_head *l_buf_cancel_table; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index deee09e534dc..26e46aeaa3f1 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -197,7 +197,6 @@ typedef struct xfs_mount { struct mutex m_icsb_mutex; /* balancer sync lock */ #endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ - struct delayed_work m_sync_work; /* background sync work */ struct delayed_work m_reclaim_work; /* background inode reclaim */ struct work_struct m_flush_work; /* background inode flush */ __int64_t m_update_flags; /* sb flags we need to update diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 20fa955d80d1..37c39a155a58 100644 --- a/fs/xfs/xfs_super.c +++ 
b/fs/xfs/xfs_super.c @@ -1005,7 +1005,6 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - cancel_delayed_work_sync(&mp->m_sync_work); cancel_work_sync(&mp->m_flush_work); xfs_filestream_unmount(mp); @@ -1040,10 +1039,10 @@ xfs_fs_sync_fs( if (laptop_mode) { /* * The disk must be active because we're syncing. - * We schedule xfssyncd now (now that the disk is + * We schedule log work now (now that the disk is * active) instead of later (when it might not be). */ - flush_delayed_work(&mp->m_sync_work); + flush_delayed_work(&mp->m_log->l_work); } return 0; @@ -1200,7 +1199,7 @@ xfs_fs_remount( * value if it is non-zero, otherwise go with the default. */ xfs_restore_resvblks(mp); - xfs_syncd_queue_sync(mp); + xfs_log_work_queue(mp); } /* rw -> ro */ @@ -1246,7 +1245,7 @@ xfs_fs_unfreeze( struct xfs_mount *mp = XFS_M(sb); xfs_restore_resvblks(mp); - xfs_syncd_queue_sync(mp); + xfs_log_work_queue(mp); return 0; } @@ -1326,7 +1325,6 @@ xfs_fs_fill_super( mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); INIT_WORK(&mp->m_flush_work, xfs_flush_worker); - INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); mp->m_super = sb; @@ -1410,12 +1408,6 @@ xfs_fs_fill_super( goto out_unmount; } - /* - * The filesystem is successfully mounted, so we can start background - * sync work now. 
- */ - xfs_syncd_queue_sync(mp); - return 0; out_filestream_unmount: diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index e898d1807044..2174555aebb2 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -19,6 +19,7 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" @@ -344,8 +345,8 @@ xfs_quiesce_attr( /* flush all pending changes from the AIL */ xfs_ail_push_all_sync(mp->m_ail); - /* stop background sync work */ - cancel_delayed_work_sync(&mp->m_sync_work); + /* stop background log work */ + cancel_delayed_work_sync(&mp->m_log->l_work); /* * Just warn here till VFS can correctly support @@ -376,40 +377,6 @@ xfs_quiesce_attr( xfs_buf_unlock(mp->m_sb_bp); } -void -xfs_syncd_queue_sync( - struct xfs_mount *mp) -{ - queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work, - msecs_to_jiffies(xfs_syncd_centisecs * 10)); -} - -/* - * Every sync period we need to push dirty metadata and try to cover the log - * to indicate the filesystem is idle and not frozen. - */ -void -xfs_sync_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_sync_work); - int error; - - /* dgc: errors ignored here */ - if (mp->m_super->s_writers.frozen == SB_UNFROZEN && - xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp); - else - xfs_log_force(mp, 0); - - /* start pushing all the metadata that is currently dirty */ - xfs_ail_push_all(mp->m_ail); - - /* queue us up again */ - xfs_syncd_queue_sync(mp); -} - /* * Queue a new inode reclaim pass if there are reclaimable inodes and there * isn't a reclaim pass already in progress. 
By default it runs every 5s based diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 3f59e5bed66b..8d58fab72a10 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -26,8 +26,6 @@ struct xfs_perag; extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ -void xfs_syncd_queue_sync(struct xfs_mount *mp); -void xfs_sync_worker(struct work_struct *work); void xfs_flush_worker(struct work_struct *work); void xfs_reclaim_worker(struct work_struct *work); From cf2931db2d189ce0583be7ae880d7e3f8c15f623 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:03 +1100 Subject: [PATCH 05/78] xfs: Bring some sanity to log unmounting When unmounting the filesystem, there are lots of operations that need to be done in a specific order, and they are spread across a couple of functions. We have to drain the AIL before we write the unmount record, and we have to shut down the background log work before we do either of them. But this is all split haphazardly across xfs_unmountfs() and xfs_log_unmount(). Move all the AIL flushing and log manipulations to xfs_log_unmount() so that the responsibilities of each function are clear and the operations they perform obvious. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 29 ++++++++++++++++++++++++++--- fs/xfs/xfs_mount.c | 24 ------------------------ 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index efea12bfbd6b..e788f39721e3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -855,15 +855,38 @@ xfs_log_unmount_write(xfs_mount_t *mp) } /* xfs_log_unmount_write */ /* - * Deallocate log structures for unmount/relocation. + * Shut down and release the AIL and Log. * - * We need to stop the aild from running before we destroy - * and deallocate the log as the aild references the log. 
+ * During unmount, we need to ensure we flush all the dirty metadata objects + * from the AIL so that the log is empty before we write the unmount record to + * the log. + * + * To do this, we first need to shut down the background log work so it is not + * trying to cover the log as we clean up. We then need to unpin all objects in + * the log so we can then flush them out. Once they have completed their IO and + * run the callbacks removing themselves from the AIL, we can write the unmount + * record, tear down the AIL and finally free the log. */ void xfs_log_unmount(xfs_mount_t *mp) { cancel_delayed_work_sync(&mp->m_log->l_work); + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * The superblock buffer is uncached and while xfs_ail_push_all_sync() + * will push it, xfs_wait_buftarg() will not wait for it. Further, + * xfs_buf_iowait() cannot be used because it was pushed with the + * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for + * the IO to complete. + */ + xfs_ail_push_all_sync(mp->m_ail); + xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_unlock(mp->m_sb_bp); + + xfs_log_unmount_write(mp); + xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d9a31c6a0c53..c195ec85c725 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1459,13 +1459,6 @@ xfs_unmountfs( xfs_qm_unmount(mp); - /* - * Flush out the log synchronously so that we know for sure - * that nothing is pinned. This is important because bflush() - * will skip pinned buffers. - */ - xfs_log_force(mp, XFS_LOG_SYNC); - /* * Unreserve any blocks we have so that when we unmount we don't account * the reserved free space as used. This is really only necessary for @@ -1491,23 +1484,6 @@ xfs_unmountfs( xfs_warn(mp, "Unable to update superblock counters. 
" "Freespace may not be correct on next mount."); - /* - * At this point we might have modified the superblock again and thus - * added an item to the AIL, thus flush it again. - */ - xfs_ail_push_all_sync(mp->m_ail); - xfs_wait_buftarg(mp->m_ddev_targp); - - /* - * The superblock buffer is uncached and xfsaild_push() will lock and - * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() - * here but a lock on the superblock buffer will block until iodone() - * has completed. - */ - xfs_buf_lock(mp->m_sb_bp); - xfs_buf_unlock(mp->m_sb_bp); - - xfs_log_unmount_write(mp); xfs_log_unmount(mp); xfs_uuid_unmount(mp); From 9aa05000f2b7cab4be582afba64af10b2d74727e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:04 +1100 Subject: [PATCH 06/78] xfs: xfs_sync_data is redundant. We don't do any data writeback from XFS any more - the VFS is completely responsible for that, including for freeze. We can replace the remaining caller with a VFS level function that achieves the same thing, but without conflicting with current writeback work. This means we can remove the flush_work and xfs_flush_inodes() - the VFS functionality completely replaces the internal flush queue for doing this writeback work in a separate context to avoid stack overruns. This does have one complication - it cannot be called with page locks held. Hence move the flushing of delalloc space when ENOSPC occurs back up into xfs_file_buffered_aio_write when we don't hold any locks that will stall writeback. Unfortunately, writeback_inodes_sb_if_idle() is not sufficient to trigger delalloc conversion fast enough to prevent spurious ENOSPC when there are hundreds of writers, thousands of small files and GBs of free RAM. Hence we need to use sync_inodes_sb() to block callers while we wait for writeback like the previous xfs_flush_inodes implementation did. 
That means we have to hold the s_umount lock here, but because this call can nest inside i_mutex (the parent directory in the create case, held by the VFS), we have to use down_read_trylock() to avoid potential deadlocks. In practice, this trylock will succeed on almost every attempt as unmount/remount type operations are exceedingly rare. Note: we always need to pass a count of zero to generic_file_buffered_write() as the previously written byte count. We only do this by accident before this patch by the virtue of ret always being zero when there are no errors. Make this explicit rather than needing to specifically zero ret in the ENOSPC retry case. Signed-off-by: Dave Chinner Tested-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_file.c | 13 ++++---- fs/xfs/xfs_iomap.c | 23 ++++--------- fs/xfs/xfs_mount.h | 1 - fs/xfs/xfs_super.c | 21 ++++++++++-- fs/xfs/xfs_super.h | 1 + fs/xfs/xfs_sync.c | 78 ------------------------------------------- fs/xfs/xfs_sync.h | 3 -- fs/xfs/xfs_vnodeops.c | 2 +- 8 files changed, 34 insertions(+), 108 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index aa473fa640a2..daf4066c24b2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -728,16 +728,17 @@ xfs_file_buffered_aio_write( write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); ret = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, ret); + pos, &iocb->ki_pos, count, 0); + /* - * if we just got an ENOSPC, flush the inode now we aren't holding any - * page locks and retry *once* + * If we just got an ENOSPC, try to write back all dirty inodes to + * convert delalloc space to free up some of the excess reserved + * metadata space. 
*/ if (ret == -ENOSPC && !enospc) { enospc = 1; - ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); - if (!ret) - goto write_retry; + xfs_flush_inodes(ip->i_mount); + goto write_retry; } current->backing_dev_info = NULL; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 973dff6ad935..f858b903678e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -373,7 +373,7 @@ xfs_iomap_write_delay( xfs_extlen_t extsz; int nimaps; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; - int prealloc, flushed = 0; + int prealloc; int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -434,26 +434,17 @@ retry: } /* - * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For - * ENOSPC, * flush all other inodes with delalloc blocks to free up - * some of the excess reserved metadata space. For both cases, retry + * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry * without EOF preallocation. */ if (nimaps == 0) { trace_xfs_delalloc_enospc(ip, offset, count); - if (flushed) - return XFS_ERROR(error ? error : ENOSPC); - - if (error == ENOSPC) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_inodes(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); + if (prealloc) { + prealloc = 0; + error = 0; + goto retry; } - - flushed = 1; - error = 0; - prealloc = 0; - goto retry; + return XFS_ERROR(error ? 
error : ENOSPC); } if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 26e46aeaa3f1..a54b5aa498d4 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -198,7 +198,6 @@ typedef struct xfs_mount { #endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct delayed_work m_reclaim_work; /* background inode reclaim */ - struct work_struct m_flush_work; /* background inode flush */ __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ struct shrinker m_inode_shrink; /* inode reclaim shrinker */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 37c39a155a58..9468c6878463 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -882,6 +882,24 @@ xfs_destroy_mount_workqueues( destroy_workqueue(mp->m_unwritten_workqueue); } +/* + * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK + * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting + * for IO to complete so that we effectively throttle multiple callers to the + * rate at which IO is completing. 
+ */ +void +xfs_flush_inodes( + struct xfs_mount *mp) +{ + struct super_block *sb = mp->m_super; + + if (down_read_trylock(&sb->s_umount)) { + sync_inodes_sb(sb); + up_read(&sb->s_umount); + } +} + /* Catch misguided souls that try to use this interface on XFS */ STATIC struct inode * xfs_fs_alloc_inode( @@ -1005,8 +1023,6 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - cancel_work_sync(&mp->m_flush_work); - xfs_filestream_unmount(mp); xfs_unmountfs(mp); @@ -1324,7 +1340,6 @@ xfs_fs_fill_super( spin_lock_init(&mp->m_sb_lock); mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); - INIT_WORK(&mp->m_flush_work, xfs_flush_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); mp->m_super = sb; diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 9de4a920ba05..bbe3d15a7904 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -74,6 +74,7 @@ struct block_device; extern __uint64_t xfs_max_file_offset(unsigned int); +extern void xfs_flush_inodes(struct xfs_mount *mp); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *); extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *); diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 2174555aebb2..6a2ada379166 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -216,51 +216,6 @@ xfs_inode_ag_iterator( return XFS_ERROR(last_error); } -STATIC int -xfs_sync_inode_data( - struct xfs_inode *ip, - struct xfs_perag *pag, - int flags) -{ - struct inode *inode = VFS_I(ip); - struct address_space *mapping = inode->i_mapping; - int error = 0; - - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - return 0; - - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { - if (flags & SYNC_TRYLOCK) - return 0; - xfs_ilock(ip, XFS_IOLOCK_SHARED); - } - - error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? 
- 0 : XBF_ASYNC, FI_NONE); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return error; -} - -/* - * Write out pagecache data for the whole filesystem. - */ -STATIC int -xfs_sync_data( - struct xfs_mount *mp, - int flags) -{ - int error; - - ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); - - error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags); - if (error) - return XFS_ERROR(error); - - xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); - return 0; -} - STATIC int xfs_sync_fsdata( struct xfs_mount *mp) @@ -415,39 +370,6 @@ xfs_reclaim_worker( xfs_syncd_queue_reclaim(mp); } -/* - * Flush delayed allocate data, attempting to free up reserved space - * from existing allocations. At this point a new allocation attempt - * has failed with ENOSPC and we are in the process of scratching our - * heads, looking about for more room. - * - * Queue a new data flush if there isn't one already in progress and - * wait for completion of the flush. This means that we only ever have one - * inode flush in progress no matter how many ENOSPC events are occurring and - * so will prevent the system from bogging down due to every concurrent - * ENOSPC event scanning all the active inodes in the system for writeback. 
- */ -void -xfs_flush_inodes( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - - queue_work(xfs_syncd_wq, &mp->m_flush_work); - flush_work(&mp->m_flush_work); -} - -void -xfs_flush_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(work, - struct xfs_mount, m_flush_work); - - xfs_sync_data(mp, SYNC_TRYLOCK); - xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); -} - void __xfs_inode_set_reclaim_tag( struct xfs_perag *pag, diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 8d58fab72a10..0018e846f0dc 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -26,14 +26,11 @@ struct xfs_perag; extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ -void xfs_flush_worker(struct work_struct *work); void xfs_reclaim_worker(struct work_struct *work); int xfs_quiesce_data(struct xfs_mount *mp); void xfs_quiesce_attr(struct xfs_mount *mp); -void xfs_flush_inodes(struct xfs_inode *ip); - int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 2a5c637344b4..14928564f106 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -777,7 +777,7 @@ xfs_create( XFS_TRANS_PERM_LOG_RES, log_count); if (error == ENOSPC) { /* flush outstanding delalloc blocks and retry */ - xfs_flush_inodes(dp); + xfs_flush_inodes(mp); error = xfs_trans_reserve(tp, resblks, log_res, 0, XFS_TRANS_PERM_LOG_RES, log_count); } From 5889608df35783590251cfd440fa5d48f1855179 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:05 +1100 Subject: [PATCH 07/78] xfs: syncd workqueue is no more With the syncd functions moved to the log and/or removed, the syncd workqueue is the only remaining bit left. It is used by the log covering/ail pushing work, as well as by the inode reclaim work. 
Given how cheap workqueues are these days, give the log and inode reclaim work their own work queues and kill the syncd work queue. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_mount.h | 2 ++ fs/xfs/xfs_super.c | 38 ++++++++++++++++++-------------------- fs/xfs/xfs_sync.c | 20 +++++++++----------- fs/xfs/xfs_sync.h | 2 -- 5 files changed, 30 insertions(+), 34 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e788f39721e3..b6ce4d4b6def 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1193,7 +1193,7 @@ void xfs_log_work_queue( struct xfs_mount *mp) { - queue_delayed_work(xfs_syncd_wq, &mp->m_log->l_work, + queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work, msecs_to_jiffies(xfs_syncd_centisecs * 10)); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a54b5aa498d4..7c417b6b99ee 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -207,6 +207,8 @@ typedef struct xfs_mount { struct workqueue_struct *m_data_workqueue; struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; + struct workqueue_struct *m_reclaim_workqueue; + struct workqueue_struct *m_log_workqueue; } xfs_mount_t; /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 9468c6878463..27d5a92e1210 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -863,8 +863,23 @@ xfs_init_mount_workqueues( WQ_MEM_RECLAIM, 0, mp->m_fsname); if (!mp->m_cil_workqueue) goto out_destroy_unwritten; + + mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", + WQ_NON_REENTRANT, 0, mp->m_fsname); + if (!mp->m_reclaim_workqueue) + goto out_destroy_cil; + + mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", + WQ_NON_REENTRANT, 0, mp->m_fsname); + if (!mp->m_log_workqueue) + goto out_destroy_reclaim; + return 0; +out_destroy_reclaim: + destroy_workqueue(mp->m_reclaim_workqueue); +out_destroy_cil: + 
destroy_workqueue(mp->m_cil_workqueue); out_destroy_unwritten: destroy_workqueue(mp->m_unwritten_workqueue); out_destroy_data_iodone_queue: @@ -877,6 +892,8 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + destroy_workqueue(mp->m_log_workqueue); + destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_data_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); @@ -1391,10 +1408,6 @@ xfs_fs_fill_super( /* * we must configure the block size in the superblock before we run the * full mount process as the mount process can lookup and cache inodes. - * For the same reason we must also initialise the syncd and register - * the inode cache shrinker so that inodes can be reclaimed during - * operations like a quotacheck that iterate all inodes in the - * filesystem. */ sb->s_magic = XFS_SB_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; @@ -1638,16 +1651,6 @@ xfs_destroy_zones(void) STATIC int __init xfs_init_workqueues(void) { - /* - * We never want to the same work item to run twice, reclaiming inodes - * or idling the log is not going to get any faster by multiple CPUs - * competing for ressources. Use the default large max_active value - * so that even lots of filesystems can perform these task in parallel. 
- */ - xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0); - if (!xfs_syncd_wq) - return -ENOMEM; - /* * The allocation workqueue can be used in memory reclaim situations * (writepage path), and parallelism is only limited by the number of @@ -1656,20 +1659,15 @@ xfs_init_workqueues(void) */ xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); if (!xfs_alloc_wq) - goto out_destroy_syncd; + return -ENOMEM; return 0; - -out_destroy_syncd: - destroy_workqueue(xfs_syncd_wq); - return -ENOMEM; } STATIC void xfs_destroy_workqueues(void) { destroy_workqueue(xfs_alloc_wq); - destroy_workqueue(xfs_syncd_wq); } STATIC int __init diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 6a2ada379166..15be21f074fd 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -40,8 +40,6 @@ #include #include -struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ - /* * The inode lookup is done in batches to keep the amount of lock traffic and * radix tree lookups to a minimum. The batch size is a trade off between @@ -335,18 +333,18 @@ xfs_quiesce_attr( /* * Queue a new inode reclaim pass if there are reclaimable inodes and there * isn't a reclaim pass already in progress. By default it runs every 5s based - * on the xfs syncd work default of 30s. Perhaps this should have it's own + * on the xfs periodic sync default of 30s. Perhaps this should have it's own * tunable, but that can be done if this method proves to be ineffective or too * aggressive. 
*/ static void -xfs_syncd_queue_reclaim( +xfs_reclaim_work_queue( struct xfs_mount *mp) { rcu_read_lock(); if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { - queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, + queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); } rcu_read_unlock(); @@ -367,7 +365,7 @@ xfs_reclaim_worker( struct xfs_mount, m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_TRYLOCK); - xfs_syncd_queue_reclaim(mp); + xfs_reclaim_work_queue(mp); } void @@ -388,7 +386,7 @@ __xfs_inode_set_reclaim_tag( spin_unlock(&ip->i_mount->m_perag_lock); /* schedule periodic background inode reclaim */ - xfs_syncd_queue_reclaim(ip->i_mount); + xfs_reclaim_work_queue(ip->i_mount); trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, -1, _RET_IP_); @@ -646,9 +644,9 @@ out: /* * We could return EAGAIN here to make reclaim rescan the inode tree in * a short while. However, this just burns CPU time scanning the tree - * waiting for IO to complete and xfssyncd never goes back to the idle - * state. Instead, return 0 to let the next scheduled background reclaim - * attempt to reclaim the inode again. + * waiting for IO to complete and the reclaim work never goes back to + * the idle state. Instead, return 0 to let the next scheduled + * background reclaim attempt to reclaim the inode again. 
*/ return 0; } @@ -804,7 +802,7 @@ xfs_reclaim_inodes_nr( int nr_to_scan) { /* kick background reclaimer and push the AIL */ - xfs_syncd_queue_reclaim(mp); + xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 0018e846f0dc..0beabea99e73 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -24,8 +24,6 @@ struct xfs_perag; #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ -extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ - void xfs_reclaim_worker(struct work_struct *work); int xfs_quiesce_data(struct xfs_mount *mp); From 34061f5c420561dd42addd252811a1fa4b0ac69b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:06 +1100 Subject: [PATCH 08/78] xfs: xfs_sync_fsdata is redundant Why do we need to write the superblock to disk once we've written all the data? We don't actually - the reasons for doing this are lost in the mists of time, and go back to the way Irix used to drive VFS flushing. On linux, this code is only called from two contexts: remount and .sync_fs. In the remount case, the call is followed by a metadata sync, which unpins and writes the superblock. In the sync_fs case, we only need to force the log to disk to ensure that the superblock is correctly on disk, so we don't actually need to write it. Hence the functionality is either redundant or superfluous and thus can be removed. Seeing as xfs_quiesce_data is essentially now just a log force, remove it as well and fold the code back into the two callers. Neither of them need the log covering check, either, as that is redundant for the remount case, and unnecessary for the .sync_fs case. 
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 19 +++++-------- fs/xfs/xfs_sync.c | 67 ++++++---------------------------------------- 2 files changed, 14 insertions(+), 72 deletions(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 27d5a92e1210..b5e445a13f7b 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1057,7 +1057,6 @@ xfs_fs_sync_fs( int wait) { struct xfs_mount *mp = XFS_M(sb); - int error; /* * Doing anything during the async pass would be counterproductive. @@ -1065,10 +1064,7 @@ xfs_fs_sync_fs( if (!wait) return 0; - error = xfs_quiesce_data(mp); - if (error) - return -error; - + xfs_log_force(mp, XFS_LOG_SYNC); if (laptop_mode) { /* * The disk must be active because we're syncing. @@ -1238,15 +1234,12 @@ xfs_fs_remount( /* rw -> ro */ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { /* - * After we have synced the data but before we sync the - * metadata, we need to free up the reserve block pool so that - * the used block count in the superblock on disk is correct at - * the end of the remount. Stash the current reserve pool size - * so that if we get remounted rw, we can return it to the same - * size. + * Before we sync the metadata, we need to free up the reserve + * block pool so that the used block count in the superblock on + * disk is correct at the end of the remount. Stash the current + * reserve pool size so that if we get remounted rw, we can + * return it to the same size. 
*/ - - xfs_quiesce_data(mp); xfs_save_resvblks(mp); xfs_quiesce_attr(mp); mp->m_flags |= XFS_MOUNT_RDONLY; diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 15be21f074fd..581eb59a85b5 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -214,70 +214,16 @@ xfs_inode_ag_iterator( return XFS_ERROR(last_error); } -STATIC int -xfs_sync_fsdata( - struct xfs_mount *mp) -{ - struct xfs_buf *bp; - int error; - - /* - * If the buffer is pinned then push on the log so we won't get stuck - * waiting in the write for someone, maybe ourselves, to flush the log. - * - * Even though we just pushed the log above, we did not have the - * superblock buffer locked at that point so it can become pinned in - * between there and here. - */ - bp = xfs_getsb(mp, 0); - if (xfs_buf_ispinned(bp)) - xfs_log_force(mp, 0); - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - return error; -} - -/* - * When remounting a filesystem read-only or freezing the filesystem, we have - * two phases to execute. This first phase is syncing the data before we - * quiesce the filesystem, and the second is flushing all the inodes out after - * we've waited for all the transactions created by the first phase to - * complete. The second phase ensures that the inodes are written to their - * location on disk rather than just existing in transactions in the log. This - * means after a quiesce there is no log replay required to write the inodes to - * disk (this is the main difference between a sync and a quiesce). - */ -/* - * First stage of freeze - no writers will make progress now we are here, - * so we flush delwri and delalloc buffers here, then wait for all I/O to - * complete. Data is frozen at that point. Metadata is not frozen, - * transactions can still occur here so don't bother emptying the AIL - * because it'll just get dirty again. 
- */ -int -xfs_quiesce_data( - struct xfs_mount *mp) -{ - int error, error2 = 0; - - /* force out the log */ - xfs_log_force(mp, XFS_LOG_SYNC); - - /* write superblock and hoover up shutdown errors */ - error = xfs_sync_fsdata(mp); - - /* mark the log as covered if needed */ - if (xfs_log_need_covered(mp)) - error2 = xfs_fs_log_dummy(mp); - - return error ? error : error2; -} - /* * Second stage of a quiesce. The data is already synced, now we have to take * care of the metadata. New transactions are already blocked, so we need to * wait for any remaining transactions to drain out before proceeding. * + * The second phase ensures that the inodes are written to their + * location on disk rather than just existing in transactions in the log. This + * means after a quiesce there is no log replay required to write the inodes to + * disk (this is the main difference between a sync and a quiesce). + * * Note: this stops background sync work - the callers must ensure it is started * again when appropriate. */ @@ -291,6 +237,9 @@ xfs_quiesce_attr( while (atomic_read(&mp->m_active_trans) > 0) delay(100); + /* force the log to unpin objects from the now complete transactions */ + xfs_log_force(mp, XFS_LOG_SYNC); + /* reclaim inodes to do any IO before the freeze completes */ xfs_reclaim_inodes(mp, 0); xfs_reclaim_inodes(mp, SYNC_WAIT); From c7eea6f7adca4501d2c2db7f0f7c9dc88efac95e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:07 +1100 Subject: [PATCH 09/78] xfs: move xfs_quiesce_attr() into xfs_super.c Both callers of xfs_quiesce_attr() are in xfs_super.c, and there's nothing really sync-specific about this functionality so it doesn't really matter where it lives. Move it next to its callers, so all the remount/sync_fs code is in the one place.
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_sync.c | 65 -------------------------------------------- fs/xfs/xfs_sync.h | 3 --- 3 files changed, 67 insertions(+), 68 deletions(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index b5e445a13f7b..3bafe66227fb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1148,6 +1148,73 @@ xfs_restore_resvblks(struct xfs_mount *mp) xfs_reserve_blocks(mp, &resblks, NULL); } +/* + * Trigger writeback of all the dirty metadata in the file system. + * + * This ensures that the metadata is written to their location on disk rather + * than just existing in transactions in the log. This means after a quiesce + * there is no log replay required to write the inodes to disk (this is the main + * difference between a sync and a quiesce). + * + * This shoul deffectively mimic the code in xfs_unmountfs() and + * xfs_log_umount() but without tearing down any structures. + * XXX: bug fixes needed! + * + * Note: this stops background log work - the callers must ensure it is started + * again when appropriate. + */ +void +xfs_quiesce_attr( + struct xfs_mount *mp) +{ + int error = 0; + + /* wait for all modifications to complete */ + while (atomic_read(&mp->m_active_trans) > 0) + delay(100); + + /* force the log to unpin objects from the now complete transactions */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* reclaim inodes to do any IO before the freeze completes */ + xfs_reclaim_inodes(mp, 0); + xfs_reclaim_inodes(mp, SYNC_WAIT); + + /* flush all pending changes from the AIL */ + xfs_ail_push_all_sync(mp->m_ail); + + /* stop background log work */ + cancel_delayed_work_sync(&mp->m_log->l_work); + + /* + * Just warn here till VFS can correctly support + * read-only remount without racing. 
+ */ + WARN_ON(atomic_read(&mp->m_active_trans) != 0); + + /* Push the superblock and write an unmount record */ + error = xfs_log_sbcount(mp); + if (error) + xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " + "Frozen image may not be consistent."); + xfs_log_unmount_write(mp); + + /* + * At this point we might have modified the superblock again and thus + * added an item to the AIL, thus flush it again. + */ + xfs_ail_push_all_sync(mp->m_ail); + + /* + * The superblock buffer is uncached and xfsaild_push() will lock and + * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() + * here but a lock on the superblock buffer will block until iodone() + * has completed. + */ + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_unlock(mp->m_sb_bp); +} + STATIC int xfs_fs_remount( struct super_block *sb, diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 581eb59a85b5..7b630288bab5 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -214,71 +214,6 @@ xfs_inode_ag_iterator( return XFS_ERROR(last_error); } -/* - * Second stage of a quiesce. The data is already synced, now we have to take - * care of the metadata. New transactions are already blocked, so we need to - * wait for any remaining transactions to drain out before proceeding. - * - * The second phase ensures that the inodes are written to their - * location on disk rather than just existing in transactions in the log. This - * means after a quiesce there is no log replay required to write the inodes to - * disk (this is the main difference between a sync and a quiesce). - * - * Note: this stops background sync work - the callers must ensure it is started - * again when appropriate. 
- */ -void -xfs_quiesce_attr( - struct xfs_mount *mp) -{ - int error = 0; - - /* wait for all modifications to complete */ - while (atomic_read(&mp->m_active_trans) > 0) - delay(100); - - /* force the log to unpin objects from the now complete transactions */ - xfs_log_force(mp, XFS_LOG_SYNC); - - /* reclaim inodes to do any IO before the freeze completes */ - xfs_reclaim_inodes(mp, 0); - xfs_reclaim_inodes(mp, SYNC_WAIT); - - /* flush all pending changes from the AIL */ - xfs_ail_push_all_sync(mp->m_ail); - - /* stop background log work */ - cancel_delayed_work_sync(&mp->m_log->l_work); - - /* - * Just warn here till VFS can correctly support - * read-only remount without racing. - */ - WARN_ON(atomic_read(&mp->m_active_trans) != 0); - - /* Push the superblock and write an unmount record */ - error = xfs_log_sbcount(mp); - if (error) - xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " - "Frozen image may not be consistent."); - xfs_log_unmount_write(mp); - - /* - * At this point we might have modified the superblock again and thus - * added an item to the AIL, thus flush it again. - */ - xfs_ail_push_all_sync(mp->m_ail); - - /* - * The superblock buffer is uncached and xfsaild_push() will lock and - * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() - * here but a lock on the superblock buffer will block until iodone() - * has completed. - */ - xfs_buf_lock(mp->m_sb_bp); - xfs_buf_unlock(mp->m_sb_bp); -} - /* * Queue a new inode reclaim pass if there are reclaimable inodes and there * isn't a reclaim pass already in progress. 
By default it runs every 5s based diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 0beabea99e73..0ba9c89c316e 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -26,9 +26,6 @@ struct xfs_perag; void xfs_reclaim_worker(struct work_struct *work); -int xfs_quiesce_data(struct xfs_mount *mp); -void xfs_quiesce_attr(struct xfs_mount *mp); - int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); From c75921a72a7c4bb73a5e09a697a672722e5543f1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:08 +1100 Subject: [PATCH 10/78] xfs: xfs_quiesce_attr() should quiesce the log like unmount xfs_quiesce_attr() is supposed to leave the log empty with an unmount record written. Right now it does not wait for the AIL to be emptied before writing the unmount record, nor does it wait for metadata IO completion, either. Fix it to use the same method and code as xfs_log_unmount(). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 25 ++++++++++++++++++------- fs/xfs/xfs_log.h | 1 + fs/xfs/xfs_super.c | 45 ++++++++++----------------------------------- 3 files changed, 29 insertions(+), 42 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b6ce4d4b6def..d2d59692739f 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -855,20 +855,17 @@ xfs_log_unmount_write(xfs_mount_t *mp) } /* xfs_log_unmount_write */ /* - * Shut down and release the AIL and Log. - * - * During unmount, we need to ensure we flush all the dirty metadata objects - * from the AIL so that the log is empty before we write the unmount record to - * the log. + * Empty the log for unmount/freeze. * * To do this, we first need to shut down the background log work so it is not * trying to cover the log as we clean up.
We then need to unpin all objects in * the log so we can then flush them out. Once they have completed their IO and * run the callbacks removing themselves from the AIL, we can write the unmount - * record, tear down the AIL and finally free the log. + * record. */ void -xfs_log_unmount(xfs_mount_t *mp) +xfs_log_quiesce( + struct xfs_mount *mp) { cancel_delayed_work_sync(&mp->m_log->l_work); xfs_log_force(mp, XFS_LOG_SYNC); @@ -886,6 +883,20 @@ xfs_log_unmount(xfs_mount_t *mp) xfs_buf_unlock(mp->m_sb_bp); xfs_log_unmount_write(mp); +} + +/* + * Shut down and release the AIL and Log. + * + * During unmount, we need to ensure we flush all the dirty metadata objects + * from the AIL so that the log is empty before we write the unmount record to + * the log. Once this is done, we can tear down the AIL and the log. + */ +void +xfs_log_unmount( + struct xfs_mount *mp) +{ + xfs_log_quiesce(mp); xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 26ed7de352d7..5caee96059df 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -183,6 +183,7 @@ bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); void xfs_log_worker(struct work_struct *work); +void xfs_log_quiesce(struct xfs_mount *mp); #endif #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 3bafe66227fb..fdedf2cabae3 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1153,15 +1153,11 @@ xfs_restore_resvblks(struct xfs_mount *mp) * * This ensures that the metadata is written to their location on disk rather * than just existing in transactions in the log. This means after a quiesce - * there is no log replay required to write the inodes to disk (this is the main - * difference between a sync and a quiesce). + * there is no log replay required to write the inodes to disk - this is the + * primary difference between a sync and a quiesce. 
* - * This shoul deffectively mimic the code in xfs_unmountfs() and - * xfs_log_umount() but without tearing down any structures. - * XXX: bug fixes needed! - * - * Note: this stops background log work - the callers must ensure it is started - * again when appropriate. + * Note: xfs_log_quiesce() stops background log work - the callers must ensure + * it is started again when appropriate. */ void xfs_quiesce_attr( @@ -1180,39 +1176,18 @@ xfs_quiesce_attr( xfs_reclaim_inodes(mp, 0); xfs_reclaim_inodes(mp, SYNC_WAIT); - /* flush all pending changes from the AIL */ - xfs_ail_push_all_sync(mp->m_ail); - - /* stop background log work */ - cancel_delayed_work_sync(&mp->m_log->l_work); - + /* Push the superblock and write an unmount record */ + error = xfs_log_sbcount(mp); + if (error) + xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " + "Frozen image may not be consistent."); /* * Just warn here till VFS can correctly support * read-only remount without racing. */ WARN_ON(atomic_read(&mp->m_active_trans) != 0); - /* Push the superblock and write an unmount record */ - error = xfs_log_sbcount(mp); - if (error) - xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " - "Frozen image may not be consistent."); - xfs_log_unmount_write(mp); - - /* - * At this point we might have modified the superblock again and thus - * added an item to the AIL, thus flush it again. - */ - xfs_ail_push_all_sync(mp->m_ail); - - /* - * The superblock buffer is uncached and xfsaild_push() will lock and - * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() - * here but a lock on the superblock buffer will block until iodone() - * has completed. 
- */ - xfs_buf_lock(mp->m_sb_bp); - xfs_buf_unlock(mp->m_sb_bp); + xfs_log_quiesce(mp); } STATIC int From 6d8b79cfca39399ef9115fb65dde85993455c9a3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:09 +1100 Subject: [PATCH 11/78] xfs: rename xfs_sync.[ch] to xfs_icache.[ch] xfs_sync.c now only contains inode reclaim functions and inode cache iteration functions. It is not related to sync operations anymore. Rename to xfs_icache.c to reflect its contents and prepare for consolidation with the other inode cache file that exists (xfs_iget.c). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/Makefile | 2 +- fs/xfs/{xfs_sync.c => xfs_icache.c} | 1 + fs/xfs/{xfs_sync.h => xfs_icache.h} | 0 fs/xfs/xfs_iget.c | 1 + fs/xfs/xfs_mount.c | 1 + fs/xfs/xfs_mount.h | 2 -- fs/xfs/xfs_qm_syscalls.c | 1 + fs/xfs/xfs_super.c | 2 +- 8 files changed, 6 insertions(+), 4 deletions(-) rename fs/xfs/{xfs_sync.c => xfs_icache.c} (99%) rename fs/xfs/{xfs_sync.h => xfs_icache.h} (100%) diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index d2bf974b1a2f..442f256dbcac 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -39,6 +39,7 @@ xfs-y += xfs_aops.o \ xfs_fsops.o \ xfs_fs_subr.o \ xfs_globals.o \ + xfs_icache.o \ xfs_iget.o \ xfs_ioctl.o \ xfs_iomap.o \ @@ -47,7 +48,6 @@ xfs-y += xfs_aops.o \ xfs_message.o \ xfs_mru_cache.o \ xfs_super.o \ - xfs_sync.o \ xfs_xattr.o \ xfs_rename.o \ xfs_utils.o \ diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_icache.c similarity index 99% rename from fs/xfs/xfs_sync.c rename to fs/xfs/xfs_icache.c index 7b630288bab5..eba216f11d5e 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_icache.c @@ -36,6 +36,7 @@ #include "xfs_quota.h" #include "xfs_trace.h" #include "xfs_fsops.h" +#include "xfs_icache.h" #include #include diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_icache.h similarity index 100% rename from fs/xfs/xfs_sync.h rename to fs/xfs/xfs_icache.h diff --git
a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 784a803383ec..069c5ceb9459 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -38,6 +38,7 @@ #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_trace.h" +#include "xfs_icache.h" /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c195ec85c725..6f1c997704cd 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -42,6 +42,7 @@ #include "xfs_fsops.h" #include "xfs_utils.h" #include "xfs_trace.h" +#include "xfs_icache.h" #ifdef HAVE_PERCPU_SB diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7c417b6b99ee..a631ca3b9065 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -51,8 +51,6 @@ typedef struct xfs_trans_reservations { #else /* __KERNEL__ */ -#include "xfs_sync.h" - struct xlog; struct xfs_inode; struct xfs_mru_cache; diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 858a3b186110..7a9071f8855f 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -40,6 +40,7 @@ #include "xfs_utils.h" #include "xfs_qm.h" #include "xfs_trace.h" +#include "xfs_icache.h" STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index fdedf2cabae3..3d9ea947e9f8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -49,7 +49,7 @@ #include "xfs_extfree_item.h" #include "xfs_mru_cache.h" #include "xfs_inode_item.h" -#include "xfs_sync.h" +#include "xfs_icache.h" #include "xfs_trace.h" #include From fa96acadf1eb712fca6d59922ad93787c87e44ec Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:10 +1100 Subject: [PATCH 12/78] xfs: move inode locking functions to xfs_inode.c xfs_ilock() and friends really aren't related to the inode cache in any way, so move them to xfs_inode.c with all the other inode related functionality. 
While doing this move, move the xfs_ilock() tracepoints to *before* the lock is taken so that when a hang on a lock occurs we have events to indicate which process and what inode we were trying to lock when the hang occurred. This is much better than the current silence we get on a hang... Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_iget.c | 251 --------------------------------------------- fs/xfs/xfs_inode.c | 250 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 250 insertions(+), 251 deletions(-) diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 069c5ceb9459..ea9a5fa49a48 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -453,254 +453,3 @@ out_error_or_again: return error; } -/* - * This is a wrapper routine around the xfs_ilock() routine - * used to centralize some grungy code. It is used in places - * that wish to lock the inode solely for reading the extents. - * The reason these places can't just call xfs_ilock(SHARED) - * is that the inode lock also guards to bringing in of the - * extents from disk for a file in b-tree format. If the inode - * is in b-tree format, then we need to lock the inode exclusively - * until the extents are read in. Locking it exclusively all - * the time would limit our parallelism unnecessarily, though. - * What we do instead is check to see if the extents have been - * read in yet, and only lock the inode exclusively if they - * have not. - * - * The function returns a value which should be given to the - * corresponding xfs_iunlock_map_shared(). This value is - * the mode in which the lock was actually taken. 
- */ -uint -xfs_ilock_map_shared( - xfs_inode_t *ip) -{ - uint lock_mode; - - if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && - ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { - lock_mode = XFS_ILOCK_EXCL; - } else { - lock_mode = XFS_ILOCK_SHARED; - } - - xfs_ilock(ip, lock_mode); - - return lock_mode; -} - -/* - * This is simply the unlock routine to go with xfs_ilock_map_shared(). - * All it does is call xfs_iunlock() with the given lock_mode. - */ -void -xfs_iunlock_map_shared( - xfs_inode_t *ip, - unsigned int lock_mode) -{ - xfs_iunlock(ip, lock_mode); -} - -/* - * The xfs inode contains 2 locks: a multi-reader lock called the - * i_iolock and a multi-reader lock called the i_lock. This routine - * allows either or both of the locks to be obtained. - * - * The 2 locks should always be ordered so that the IO lock is - * obtained first in order to prevent deadlock. - * - * ip -- the inode being locked - * lock_flags -- this parameter indicates the inode's locks - * to be locked. It can be: - * XFS_IOLOCK_SHARED, - * XFS_IOLOCK_EXCL, - * XFS_ILOCK_SHARED, - * XFS_ILOCK_EXCL, - * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, - * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, - * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, - * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL - */ -void -xfs_ilock( - xfs_inode_t *ip, - uint lock_flags) -{ - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. 
- */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - - if (lock_flags & XFS_IOLOCK_EXCL) - mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - else if (lock_flags & XFS_IOLOCK_SHARED) - mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - - if (lock_flags & XFS_ILOCK_EXCL) - mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); - else if (lock_flags & XFS_ILOCK_SHARED) - mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); - - trace_xfs_ilock(ip, lock_flags, _RET_IP_); -} - -/* - * This is just like xfs_ilock(), except that the caller - * is guaranteed not to sleep. It returns 1 if it gets - * the requested locks and 0 otherwise. If the IO lock is - * obtained but the inode lock cannot be, then the IO lock - * is dropped before returning. - * - * ip -- the inode being locked - * lock_flags -- this parameter indicates the inode's locks to be - * to be locked. See the comment for xfs_ilock() for a list - * of valid values. - */ -int -xfs_ilock_nowait( - xfs_inode_t *ip, - uint lock_flags) -{ - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. 
- */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - - if (lock_flags & XFS_IOLOCK_EXCL) { - if (!mrtryupdate(&ip->i_iolock)) - goto out; - } else if (lock_flags & XFS_IOLOCK_SHARED) { - if (!mrtryaccess(&ip->i_iolock)) - goto out; - } - if (lock_flags & XFS_ILOCK_EXCL) { - if (!mrtryupdate(&ip->i_lock)) - goto out_undo_iolock; - } else if (lock_flags & XFS_ILOCK_SHARED) { - if (!mrtryaccess(&ip->i_lock)) - goto out_undo_iolock; - } - trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); - return 1; - - out_undo_iolock: - if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); - else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); - out: - return 0; -} - -/* - * xfs_iunlock() is used to drop the inode locks acquired with - * xfs_ilock() and xfs_ilock_nowait(). The caller must pass - * in the flags given to xfs_ilock() or xfs_ilock_nowait() so - * that we know which locks to drop. - * - * ip -- the inode being unlocked - * lock_flags -- this parameter indicates the inode's locks to be - * to be unlocked. See the comment for xfs_ilock() for a list - * of valid values for this parameter. - * - */ -void -xfs_iunlock( - xfs_inode_t *ip, - uint lock_flags) -{ - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. 
- */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - ASSERT(lock_flags != 0); - - if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); - else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); - - if (lock_flags & XFS_ILOCK_EXCL) - mrunlock_excl(&ip->i_lock); - else if (lock_flags & XFS_ILOCK_SHARED) - mrunlock_shared(&ip->i_lock); - - trace_xfs_iunlock(ip, lock_flags, _RET_IP_); -} - -/* - * give up write locks. the i/o lock cannot be held nested - * if it is being demoted. - */ -void -xfs_ilock_demote( - xfs_inode_t *ip, - uint lock_flags) -{ - ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); - - if (lock_flags & XFS_ILOCK_EXCL) - mrdemote(&ip->i_lock); - if (lock_flags & XFS_IOLOCK_EXCL) - mrdemote(&ip->i_iolock); - - trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); -} - -#ifdef DEBUG -int -xfs_isilocked( - xfs_inode_t *ip, - uint lock_flags) -{ - if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { - if (!(lock_flags & XFS_ILOCK_SHARED)) - return !!ip->i_lock.mr_writer; - return rwsem_is_locked(&ip->i_lock.mr_lock); - } - - if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { - if (!(lock_flags & XFS_IOLOCK_SHARED)) - return !!ip->i_iolock.mr_writer; - return rwsem_is_locked(&ip->i_iolock.mr_lock); - } - - ASSERT(0); - return 0; -} -#endif - -void -__xfs_iflock( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); - - do { - prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); - if (xfs_isiflocked(ip)) - io_schedule(); - } while (!xfs_iflock_nowait(ip)); - - finish_wait(wq, &wait.wait); -} diff --git 
a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2778258fcfa2..ba404e4b9f0c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -74,6 +74,256 @@ xfs_get_extsz_hint( return 0; } +/* + * This is a wrapper routine around the xfs_ilock() routine used to centralize + * some grungy code. It is used in places that wish to lock the inode solely + * for reading the extents. The reason these places can't just call + * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the + * extents from disk for a file in b-tree format. If the inode is in b-tree + * format, then we need to lock the inode exclusively until the extents are read + * in. Locking it exclusively all the time would limit our parallelism + * unnecessarily, though. What we do instead is check to see if the extents + * have been read in yet, and only lock the inode exclusively if they have not. + * + * The function returns a value which should be given to the corresponding + * xfs_iunlock_map_shared(). This value is the mode in which the lock was + * actually taken. + */ +uint +xfs_ilock_map_shared( + xfs_inode_t *ip) +{ + uint lock_mode; + + if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && + ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { + lock_mode = XFS_ILOCK_EXCL; + } else { + lock_mode = XFS_ILOCK_SHARED; + } + + xfs_ilock(ip, lock_mode); + + return lock_mode; +} + +/* + * This is simply the unlock routine to go with xfs_ilock_map_shared(). + * All it does is call xfs_iunlock() with the given lock_mode. + */ +void +xfs_iunlock_map_shared( + xfs_inode_t *ip, + unsigned int lock_mode) +{ + xfs_iunlock(ip, lock_mode); +} + +/* + * The xfs inode contains 2 locks: a multi-reader lock called the + * i_iolock and a multi-reader lock called the i_lock. This routine + * allows either or both of the locks to be obtained. + * + * The 2 locks should always be ordered so that the IO lock is + * obtained first in order to prevent deadlock.
+ * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks + * to be locked. It can be: + * XFS_IOLOCK_SHARED, + * XFS_IOLOCK_EXCL, + * XFS_ILOCK_SHARED, + * XFS_ILOCK_EXCL, + * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, + * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, + * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, + * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL + */ +void +xfs_ilock( + xfs_inode_t *ip, + uint lock_flags) +{ + trace_xfs_ilock(ip, lock_flags, _RET_IP_); + + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_IOLOCK_SHARED) + mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + + if (lock_flags & XFS_ILOCK_EXCL) + mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); + else if (lock_flags & XFS_ILOCK_SHARED) + mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); +} + +/* + * This is just like xfs_ilock(), except that the caller + * is guaranteed not to sleep. It returns 1 if it gets + * the requested locks and 0 otherwise. If the IO lock is + * obtained but the inode lock cannot be, then the IO lock + * is dropped before returning. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks + * to be locked. See the comment for xfs_ilock() for a list + * of valid values.
+ */ +int +xfs_ilock_nowait( + xfs_inode_t *ip, + uint lock_flags) +{ + trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); + + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) { + if (!mrtryupdate(&ip->i_iolock)) + goto out; + } else if (lock_flags & XFS_IOLOCK_SHARED) { + if (!mrtryaccess(&ip->i_iolock)) + goto out; + } + if (lock_flags & XFS_ILOCK_EXCL) { + if (!mrtryupdate(&ip->i_lock)) + goto out_undo_iolock; + } else if (lock_flags & XFS_ILOCK_SHARED) { + if (!mrtryaccess(&ip->i_lock)) + goto out_undo_iolock; + } + return 1; + + out_undo_iolock: + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + out: + return 0; +} + +/* + * xfs_iunlock() is used to drop the inode locks acquired with + * xfs_ilock() and xfs_ilock_nowait(). The caller must pass + * in the flags given to xfs_ilock() or xfs_ilock_nowait() so + * that we know which locks to drop. + * + * ip -- the inode being unlocked + * lock_flags -- this parameter indicates the inode's locks + * to be unlocked. See the comment for xfs_ilock() for a list + * of valid values for this parameter. + * + */ +void +xfs_iunlock( + xfs_inode_t *ip, + uint lock_flags) +{ + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+ */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT(lock_flags != 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + + if (lock_flags & XFS_ILOCK_EXCL) + mrunlock_excl(&ip->i_lock); + else if (lock_flags & XFS_ILOCK_SHARED) + mrunlock_shared(&ip->i_lock); + + trace_xfs_iunlock(ip, lock_flags, _RET_IP_); +} + +/* + * give up write locks. the i/o lock cannot be held nested + * if it is being demoted. + */ +void +xfs_ilock_demote( + xfs_inode_t *ip, + uint lock_flags) +{ + ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); + + if (lock_flags & XFS_ILOCK_EXCL) + mrdemote(&ip->i_lock); + if (lock_flags & XFS_IOLOCK_EXCL) + mrdemote(&ip->i_iolock); + + trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); +} + +#ifdef DEBUG +int +xfs_isilocked( + xfs_inode_t *ip, + uint lock_flags) +{ + if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { + if (!(lock_flags & XFS_ILOCK_SHARED)) + return !!ip->i_lock.mr_writer; + return rwsem_is_locked(&ip->i_lock.mr_lock); + } + + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { + if (!(lock_flags & XFS_IOLOCK_SHARED)) + return !!ip->i_iolock.mr_writer; + return rwsem_is_locked(&ip->i_iolock.mr_lock); + } + + ASSERT(0); + return 0; +} +#endif + +void +__xfs_iflock( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); + + do { + prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_isiflocked(ip)) + io_schedule(); + } while (!xfs_iflock_nowait(ip)); + + finish_wait(wq, &wait.wait); +} + #ifdef DEBUG /* * Make 
sure that the extents in the given memory buffer From 33479e0542df066fb0b47df18780e93bfe6e0dc5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:11 +1100 Subject: [PATCH 13/78] xfs: remove xfs_iget.c The inode cache functions remaining in xfs_iget.c can be moved to xfs_icache.c along with the other inode cache functions. This removes all functionality from xfs_iget.c, so the file can simply be removed. This move results in various functions now only having the scope of a single file (e.g. xfs_inode_free()), so clean up all the definitions and exported prototypes in xfs_icache.[ch] and xfs_inode.h appropriately. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/Makefile | 1 - fs/xfs/xfs_export.c | 1 + fs/xfs/xfs_icache.c | 421 +++++++++++++++++++++++++++++++++++- fs/xfs/xfs_icache.h | 6 +- fs/xfs/xfs_iget.c | 455 --------------------------------------- fs/xfs/xfs_inode.c | 1 + fs/xfs/xfs_inode.h | 10 +- fs/xfs/xfs_itable.c | 1 + fs/xfs/xfs_log_recover.c | 1 + fs/xfs/xfs_qm.c | 1 + fs/xfs/xfs_rtalloc.c | 1 + fs/xfs/xfs_vnodeops.c | 1 + 12 files changed, 430 insertions(+), 470 deletions(-) delete mode 100644 fs/xfs/xfs_iget.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 442f256dbcac..e65357bb3dc6 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -40,7 +40,6 @@ xfs-y += xfs_aops.o \ xfs_fs_subr.o \ xfs_globals.o \ xfs_icache.o \ - xfs_iget.o \ xfs_ioctl.o \ xfs_iomap.o \ xfs_iops.o \ diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 8c6d1d70278c..a83611849cee 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -29,6 +29,7 @@ #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_trace.h" +#include "xfs_icache.h" /* * Note that we only accept fileids which are long enough rather than allow diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index eba216f11d5e..9c8703b5cd72 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c 
@@ -41,6 +41,421 @@ #include #include +STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_inode *ip); + +/* + * Allocate and initialise an xfs_inode. + */ +STATIC struct xfs_inode * +xfs_inode_alloc( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + struct xfs_inode *ip; + + /* + * if this didn't occur in transactions, we could use + * KM_MAYFAIL and return NULL here on ENOMEM. Set the + * code up to do this anyway. + */ + ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); + if (!ip) + return NULL; + if (inode_init_always(mp->m_super, VFS_I(ip))) { + kmem_zone_free(xfs_inode_zone, ip); + return NULL; + } + + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(!xfs_isiflocked(ip)); + ASSERT(ip->i_ino == 0); + + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + + /* initialise the xfs inode */ + ip->i_ino = ino; + ip->i_mount = mp; + memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); + ip->i_afp = NULL; + memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); + ip->i_flags = 0; + ip->i_delayed_blks = 0; + memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); + + return ip; +} + +STATIC void +xfs_inode_free_callback( + struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct xfs_inode *ip = XFS_I(inode); + + kmem_zone_free(xfs_inode_zone, ip); +} + +STATIC void +xfs_inode_free( + struct xfs_inode *ip) +{ + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + xfs_idestroy_fork(ip, XFS_DATA_FORK); + break; + } + + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + + if (ip->i_itemp) { + ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); + xfs_inode_item_destroy(ip); + ip->i_itemp = NULL; + } + + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(!xfs_isiflocked(ip)); + + /* + * Because we use RCU freeing 
we need to ensure the inode always + * appears to be reclaimed with an invalid inode number when in the + * free state. The ip->i_flags_lock provides the barrier against lookup + * races. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +} + +/* + * Check the validity of the inode we just found in the cache + */ +static int +xfs_iget_cache_hit( + struct xfs_perag *pag, + struct xfs_inode *ip, + xfs_ino_t ino, + int flags, + int lock_flags) __releases(RCU) +{ + struct inode *inode = VFS_I(ip); + struct xfs_mount *mp = ip->i_mount; + int error; + + /* + * check for re-use of an inode within an RCU grace period due to the + * radix tree nodes not being updated yet. We monitor for this by + * setting the inode number to zero before freeing the inode structure. + * If the inode has been reallocated and set up, then the inode number + * will not match, so check for that, too. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + + + /* + * If we are racing with another cache hit that is currently + * instantiating this inode or currently recycling it out of + * reclaimable state, wait for the initialisation to complete + * before continuing. + * + * XXX(hch): eventually we should do something equivalent to + * wait_on_inode to wait for these flags to be cleared + * instead of polling for it. + */ + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + + /* + * If lookup is racing with unlink return an error immediately. + */ + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_error; + } + + /* + * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+ * Need to carefully get it back into useable state. + */ + if (ip->i_flags & XFS_IRECLAIMABLE) { + trace_xfs_iget_reclaim(ip); + + /* + * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode + * from stomping over us while we recycle the inode. We can't + * clear the radix tree reclaimable tag yet as it requires + * pag_ici_lock to be held exclusive. + */ + ip->i_flags |= XFS_IRECLAIM; + + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + error = -inode_init_always(mp->m_super, inode); + if (error) { + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. + */ + rcu_read_lock(); + spin_lock(&ip->i_flags_lock); + + ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + ASSERT(ip->i_flags & XFS_IRECLAIMABLE); + trace_xfs_iget_reclaim_fail(ip); + goto out_error; + } + + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + /* + * Clear the per-lifetime state in the inode as we are now + * effectively a new inode and need to return to the initial + * state before reuse occurs. + */ + ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; + ip->i_flags |= XFS_INEW; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); + inode->i_state = I_NEW; + + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + } else { + /* If the VFS inode is being torn down, pause and try again. */ + if (!igrab(inode)) { + trace_xfs_iget_skip(ip); + error = EAGAIN; + goto out_error; + } + + /* We've got a live one. 
*/ + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + trace_xfs_iget_hit(ip); + } + + if (lock_flags != 0) + xfs_ilock(ip, lock_flags); + + xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); + XFS_STATS_INC(xs_ig_found); + + return 0; + +out_error: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + return error; +} + + +static int +xfs_iget_cache_miss( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_trans_t *tp, + xfs_ino_t ino, + struct xfs_inode **ipp, + int flags, + int lock_flags) +{ + struct xfs_inode *ip; + int error; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); + int iflags; + + ip = xfs_inode_alloc(mp, ino); + if (!ip) + return ENOMEM; + + error = xfs_iread(mp, tp, ip, flags); + if (error) + goto out_destroy; + + trace_xfs_iget_miss(ip); + + if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_destroy; + } + + /* + * Preload the radix tree so we can insert safely under the + * write spinlock. Note that we cannot sleep inside the preload + * region. Since we can be called from transaction context, don't + * recurse into the file system. + */ + if (radix_tree_preload(GFP_NOFS)) { + error = EAGAIN; + goto out_destroy; + } + + /* + * Because the inode hasn't been added to the radix-tree yet it can't + * be found by another thread, so we can do the non-sleeping lock here. + */ + if (lock_flags) { + if (!xfs_ilock_nowait(ip, lock_flags)) + BUG(); + } + + /* + * These values must be set before inserting the inode into the radix + * tree as the moment it is inserted a concurrent lookup (allowed by the + * RCU locking mechanism) can find it and that lookup must see that this + * is an inode currently under construction (i.e. that XFS_INEW is set). + * The ip->i_flags_lock that protects the XFS_INEW flag forms the + * memory barrier that ensures this detection works correctly at lookup + * time. 
+ */ + iflags = XFS_INEW; + if (flags & XFS_IGET_DONTCACHE) + iflags |= XFS_IDONTCACHE; + ip->i_udquot = ip->i_gdquot = NULL; + xfs_iflags_set(ip, iflags); + + /* insert the new inode */ + spin_lock(&pag->pag_ici_lock); + error = radix_tree_insert(&pag->pag_ici_root, agino, ip); + if (unlikely(error)) { + WARN_ON(error != -EEXIST); + XFS_STATS_INC(xs_ig_dup); + error = EAGAIN; + goto out_preload_end; + } + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + + *ipp = ip; + return 0; + +out_preload_end: + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + if (lock_flags) + xfs_iunlock(ip, lock_flags); +out_destroy: + __destroy_inode(VFS_I(ip)); + xfs_inode_free(ip); + return error; +} + +/* + * Look up an inode by number in the given file system. + * The inode is looked up in the cache held in each AG. + * If the inode is found in the cache, initialise the vfs inode + * if necessary. + * + * If it is not in core, read it in from the file system's device, + * add it to the cache and initialise the vfs inode. + * + * The inode is locked according to the value of the lock_flags parameter. + * This flag parameter indicates how and if the inode's IO lock and inode lock + * should be taken. + * + * mp -- the mount point structure for the current file system. It points + * to the inode hash table. + * tp -- a pointer to the current transaction if there is one. This is + * simply passed through to the xfs_iread() call. + * ino -- the number of the inode desired. This is the unique identifier + * within the file system for the inode being requested. + * lock_flags -- flags indicating how to lock the inode. See the comment + * for xfs_ilock() for a list of valid values. 
+ */ +int +xfs_iget( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + uint flags, + uint lock_flags, + xfs_inode_t **ipp) +{ + xfs_inode_t *ip; + int error; + xfs_perag_t *pag; + xfs_agino_t agino; + + /* + * xfs_reclaim_inode() uses the ILOCK to ensure an inode + * doesn't get freed while it's being referenced during a + * radix tree traversal here. It assumes this function + * acquires only the ILOCK (and therefore it has no need to + * involve the IOLOCK in this synchronization). + */ + ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); + + /* reject inode numbers outside existing AGs */ + if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) + return EINVAL; + + /* get the perag structure and ensure that it's inode capable */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); + agino = XFS_INO_TO_AGINO(mp, ino); + +again: + error = 0; + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + + if (ip) { + error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); + if (error) + goto out_error_or_again; + } else { + rcu_read_unlock(); + XFS_STATS_INC(xs_ig_missed); + + error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, + flags, lock_flags); + if (error) + goto out_error_or_again; + } + xfs_perag_put(pag); + + *ipp = ip; + + /* + * If we have a real type for an on-disk inode, we can set ops(&unlock) + * now. If it's a new inode being created, xfs_ialloc will handle it. + */ + if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) + xfs_setup_inode(ip); + return 0; + +out_error_or_again: + if (error == EAGAIN) { + delay(1); + goto again; + } + xfs_perag_put(pag); + return error; +} + /* * The inode lookup is done in batches to keep the amount of lock traffic and * radix tree lookups to a minimum.
The batch size is a trade off between @@ -253,7 +668,7 @@ xfs_reclaim_worker( xfs_reclaim_work_queue(mp); } -void +static void __xfs_inode_set_reclaim_tag( struct xfs_perag *pag, struct xfs_inode *ip) @@ -319,7 +734,7 @@ __xfs_inode_clear_reclaim( } } -void +STATIC void __xfs_inode_clear_reclaim_tag( xfs_mount_t *mp, xfs_perag_t *pag, @@ -542,7 +957,7 @@ out: * then a shut down during filesystem unmount reclaim walk leak all the * unreclaimed inodes. */ -int +STATIC int xfs_reclaim_inodes_ag( struct xfs_mount *mp, int flags, diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 0ba9c89c316e..222e22f16b4a 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -24,6 +24,9 @@ struct xfs_perag; #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ +int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, + uint flags, uint lock_flags, xfs_inode_t **ipp); + void xfs_reclaim_worker(struct work_struct *work); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); @@ -31,9 +34,6 @@ int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); -void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); -void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, - struct xfs_inode *ip); int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c deleted file mode 100644 index ea9a5fa49a48..000000000000 --- a/fs/xfs/xfs_iget.c +++ /dev/null @@ -1,455 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_acl.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_quota.h" -#include "xfs_utils.h" -#include "xfs_trans_priv.h" -#include "xfs_inode_item.h" -#include "xfs_bmap.h" -#include "xfs_trace.h" -#include "xfs_icache.h" - - -/* - * Allocate and initialise an xfs_inode. - */ -STATIC struct xfs_inode * -xfs_inode_alloc( - struct xfs_mount *mp, - xfs_ino_t ino) -{ - struct xfs_inode *ip; - - /* - * if this didn't occur in transactions, we could use - * KM_MAYFAIL and return NULL here on ENOMEM. Set the - * code up to do this anyway. 
- */ - ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); - if (!ip) - return NULL; - if (inode_init_always(mp->m_super, VFS_I(ip))) { - kmem_zone_free(xfs_inode_zone, ip); - return NULL; - } - - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(!xfs_isiflocked(ip)); - ASSERT(ip->i_ino == 0); - - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - - /* initialise the xfs inode */ - ip->i_ino = ino; - ip->i_mount = mp; - memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); - ip->i_afp = NULL; - memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); - ip->i_flags = 0; - ip->i_delayed_blks = 0; - memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); - - return ip; -} - -STATIC void -xfs_inode_free_callback( - struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - struct xfs_inode *ip = XFS_I(inode); - - kmem_zone_free(xfs_inode_zone, ip); -} - -void -xfs_inode_free( - struct xfs_inode *ip) -{ - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - xfs_idestroy_fork(ip, XFS_DATA_FORK); - break; - } - - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - - if (ip->i_itemp) { - ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); - xfs_inode_item_destroy(ip); - ip->i_itemp = NULL; - } - - /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(!xfs_isiflocked(ip)); - - /* - * Because we use RCU freeing we need to ensure the inode always - * appears to be reclaimed with an invalid inode number when in the - * free state. The ip->i_flags_lock provides the barrier against lookup - * races. 
- */ - spin_lock(&ip->i_flags_lock); - ip->i_flags = XFS_IRECLAIM; - ip->i_ino = 0; - spin_unlock(&ip->i_flags_lock); - - call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); -} - -/* - * Check the validity of the inode we just found it the cache - */ -static int -xfs_iget_cache_hit( - struct xfs_perag *pag, - struct xfs_inode *ip, - xfs_ino_t ino, - int flags, - int lock_flags) __releases(RCU) -{ - struct inode *inode = VFS_I(ip); - struct xfs_mount *mp = ip->i_mount; - int error; - - /* - * check for re-use of an inode within an RCU grace period due to the - * radix tree nodes not being updated yet. We monitor for this by - * setting the inode number to zero before freeing the inode structure. - * If the inode has been reallocated and set up, then the inode number - * will not match, so check for that, too. - */ - spin_lock(&ip->i_flags_lock); - if (ip->i_ino != ino) { - trace_xfs_iget_skip(ip); - XFS_STATS_INC(xs_ig_frecycle); - error = EAGAIN; - goto out_error; - } - - - /* - * If we are racing with another cache hit that is currently - * instantiating this inode or currently recycling it out of - * reclaimabe state, wait for the initialisation to complete - * before continuing. - * - * XXX(hch): eventually we should do something equivalent to - * wait_on_inode to wait for these flags to be cleared - * instead of polling for it. - */ - if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { - trace_xfs_iget_skip(ip); - XFS_STATS_INC(xs_ig_frecycle); - error = EAGAIN; - goto out_error; - } - - /* - * If lookup is racing with unlink return an error immediately. - */ - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - goto out_error; - } - - /* - * If IRECLAIMABLE is set, we've torn down the VFS inode already. - * Need to carefully get it back into useable state. 
- */ - if (ip->i_flags & XFS_IRECLAIMABLE) { - trace_xfs_iget_reclaim(ip); - - /* - * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode - * from stomping over us while we recycle the inode. We can't - * clear the radix tree reclaimable tag yet as it requires - * pag_ici_lock to be held exclusive. - */ - ip->i_flags |= XFS_IRECLAIM; - - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - - error = -inode_init_always(mp->m_super, inode); - if (error) { - /* - * Re-initializing the inode failed, and we are in deep - * trouble. Try to re-add it to the reclaim list. - */ - rcu_read_lock(); - spin_lock(&ip->i_flags_lock); - - ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); - ASSERT(ip->i_flags & XFS_IRECLAIMABLE); - trace_xfs_iget_reclaim_fail(ip); - goto out_error; - } - - spin_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - - /* - * Clear the per-lifetime state in the inode as we are now - * effectively a new inode and need to return to the initial - * state before reuse occurs. - */ - ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; - ip->i_flags |= XFS_INEW; - __xfs_inode_clear_reclaim_tag(mp, pag, ip); - inode->i_state = I_NEW; - - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); - } else { - /* If the VFS inode is being torn down, pause and try again. */ - if (!igrab(inode)) { - trace_xfs_iget_skip(ip); - error = EAGAIN; - goto out_error; - } - - /* We've got a live one. 
*/ - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - trace_xfs_iget_hit(ip); - } - - if (lock_flags != 0) - xfs_ilock(ip, lock_flags); - - xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); - XFS_STATS_INC(xs_ig_found); - - return 0; - -out_error: - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - return error; -} - - -static int -xfs_iget_cache_miss( - struct xfs_mount *mp, - struct xfs_perag *pag, - xfs_trans_t *tp, - xfs_ino_t ino, - struct xfs_inode **ipp, - int flags, - int lock_flags) -{ - struct xfs_inode *ip; - int error; - xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); - int iflags; - - ip = xfs_inode_alloc(mp, ino); - if (!ip) - return ENOMEM; - - error = xfs_iread(mp, tp, ip, flags); - if (error) - goto out_destroy; - - trace_xfs_iget_miss(ip); - - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - goto out_destroy; - } - - /* - * Preload the radix tree so we can insert safely under the - * write spinlock. Note that we cannot sleep inside the preload - * region. Since we can be called from transaction context, don't - * recurse into the file system. - */ - if (radix_tree_preload(GFP_NOFS)) { - error = EAGAIN; - goto out_destroy; - } - - /* - * Because the inode hasn't been added to the radix-tree yet it can't - * be found by another thread, so we can do the non-sleeping lock here. - */ - if (lock_flags) { - if (!xfs_ilock_nowait(ip, lock_flags)) - BUG(); - } - - /* - * These values must be set before inserting the inode into the radix - * tree as the moment it is inserted a concurrent lookup (allowed by the - * RCU locking mechanism) can find it and that lookup must see that this - * is an inode currently under construction (i.e. that XFS_INEW is set). - * The ip->i_flags_lock that protects the XFS_INEW flag forms the - * memory barrier that ensures this detection works correctly at lookup - * time. 
- */ - iflags = XFS_INEW; - if (flags & XFS_IGET_DONTCACHE) - iflags |= XFS_IDONTCACHE; - ip->i_udquot = ip->i_gdquot = NULL; - xfs_iflags_set(ip, iflags); - - /* insert the new inode */ - spin_lock(&pag->pag_ici_lock); - error = radix_tree_insert(&pag->pag_ici_root, agino, ip); - if (unlikely(error)) { - WARN_ON(error != -EEXIST); - XFS_STATS_INC(xs_ig_dup); - error = EAGAIN; - goto out_preload_end; - } - spin_unlock(&pag->pag_ici_lock); - radix_tree_preload_end(); - - *ipp = ip; - return 0; - -out_preload_end: - spin_unlock(&pag->pag_ici_lock); - radix_tree_preload_end(); - if (lock_flags) - xfs_iunlock(ip, lock_flags); -out_destroy: - __destroy_inode(VFS_I(ip)); - xfs_inode_free(ip); - return error; -} - -/* - * Look up an inode by number in the given file system. - * The inode is looked up in the cache held in each AG. - * If the inode is found in the cache, initialise the vfs inode - * if necessary. - * - * If it is not in core, read it in from the file system's device, - * add it to the cache and initialise the vfs inode. - * - * The inode is locked according to the value of the lock_flags parameter. - * This flag parameter indicates how and if the inode's IO lock and inode lock - * should be taken. - * - * mp -- the mount point structure for the current file system. It points - * to the inode hash table. - * tp -- a pointer to the current transaction if there is one. This is - * simply passed through to the xfs_iread() call. - * ino -- the number of the inode desired. This is the unique identifier - * within the file system for the inode being requested. - * lock_flags -- flags indicating how to lock the inode. See the comment - * for xfs_ilock() for a list of valid values. 
- */ -int -xfs_iget( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - uint flags, - uint lock_flags, - xfs_inode_t **ipp) -{ - xfs_inode_t *ip; - int error; - xfs_perag_t *pag; - xfs_agino_t agino; - - /* - * xfs_reclaim_inode() uses the ILOCK to ensure an inode - * doesn't get freed while it's being referenced during a - * radix tree traversal here. It assumes this function - * aqcuires only the ILOCK (and therefore it has no need to - * involve the IOLOCK in this synchronization). - */ - ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); - - /* reject inode numbers outside existing AGs */ - if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) - return EINVAL; - - /* get the perag structure and ensure that it's inode capable */ - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); - agino = XFS_INO_TO_AGINO(mp, ino); - -again: - error = 0; - rcu_read_lock(); - ip = radix_tree_lookup(&pag->pag_ici_root, agino); - - if (ip) { - error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); - if (error) - goto out_error_or_again; - } else { - rcu_read_unlock(); - XFS_STATS_INC(xs_ig_missed); - - error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, - flags, lock_flags); - if (error) - goto out_error_or_again; - } - xfs_perag_put(pag); - - *ipp = ip; - - /* - * If we have a real type for an on-disk inode, we can set ops(&unlock) - * now. If it's a new inode being created, xfs_ialloc will handle it. 
- */ - if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) - xfs_setup_inode(ip); - return 0; - -out_error_or_again: - if (error == EAGAIN) { - delay(1); - goto again; - } - xfs_perag_put(pag); - return error; -} - diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ba404e4b9f0c..bba8f37525b3 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -45,6 +45,7 @@ #include "xfs_filestream.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_icache.h" kmem_zone_t *xfs_ifork_zone; kmem_zone_t *xfs_inode_zone; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 94b32f906e79..1fc2065e010b 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -496,11 +496,10 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ ((pip)->i_d.di_mode & S_ISGID)) + /* - * xfs_iget.c prototypes. + * xfs_inode.c prototypes. */ -int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, - uint, uint, xfs_inode_t **); void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); @@ -508,11 +507,6 @@ void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); uint xfs_ilock_map_shared(xfs_inode_t *); void xfs_iunlock_map_shared(xfs_inode_t *, uint); -void xfs_inode_free(struct xfs_inode *ip); - -/* - * xfs_inode.c prototypes. 
- */ int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_buf **, xfs_inode_t **); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 01d10a66e302..3998fd2a7949 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -34,6 +34,7 @@ #include "xfs_error.h" #include "xfs_btree.h" #include "xfs_trace.h" +#include "xfs_icache.h" STATIC int xfs_internal_inum( diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 5da3ace352bf..651c98859b04 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -42,6 +42,7 @@ #include "xfs_quota.h" #include "xfs_utils.h" #include "xfs_trace.h" +#include "xfs_icache.h" STATIC int xlog_find_zeroed( diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 2e86fa0cfc0d..48c750b0e830 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -40,6 +40,7 @@ #include "xfs_utils.h" #include "xfs_qm.h" #include "xfs_trace.h" +#include "xfs_icache.h" /* * The global quota manager. There is only one of these for the entire diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index ca28a4ba4b54..a69e0b4750a9 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -38,6 +38,7 @@ #include "xfs_utils.h" #include "xfs_trace.h" #include "xfs_buf.h" +#include "xfs_icache.h" /* diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 14928564f106..2ee1f49da0aa 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -47,6 +47,7 @@ #include "xfs_filestream.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_icache.h" /* * The maximum pathlen is 1024 bytes. 
Since the minimum file system From d35e88faa3b0fc2cea35c3b2dca358b5cd09b45f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 8 Oct 2012 21:56:12 +1100 Subject: [PATCH 14/78] xfs: only update the last_sync_lsn when a transaction completes The log write code stamps each iclog with the current tail LSN in the iclog header so that recovery knows where to find the tail of the log once it has found the head. Normally this is taken from the first item on the AIL - the log item that corresponds to the oldest active item in the log. The problem is that when the AIL is empty, the tail lsn is derived from the l_last_sync_lsn, which is the LSN of the last iclog to be written to the log. In most cases this doesn't happen, because the AIL is rarely empty on an active filesystem. However, when it does, it opens up an interesting case when the transaction being committed to the iclog spans multiple iclogs. That is, the first iclog is stamped with the l_last_sync_lsn, and IO is issued. Then the next iclog is set up, the changes copied into the iclog (takes some time), and then the l_last_sync_lsn is stamped into the header and IO is issued. This is still the same transaction, so the tail lsn of both iclogs must be the same for log recovery to find the entire transaction to be able to replay it. The problem arises in that the iclog buffer IO completion updates the l_last_sync_lsn with its own LSN. Therefore, if the first iclog completes its IO before the second iclog is filled and has the tail lsn stamped in it, it will stamp the LSN of the first iclog into its tail lsn field. If the system fails at this point, log recovery will not see a complete transaction, so the transaction will not be replayed. The fix is simple - the l_last_sync_lsn is updated when a iclog buffer IO completes, and this is incorrect. The l_last_sync_lsn should be updated when a transaction is completed by a iclog buffer IO.
That is, only iclog buffers that have transaction commit callbacks attached to them should update the l_last_sync_lsn. This means that the last_sync_lsn will only move forward when a commit record is written, not in the middle of a large transaction that is rolling through multiple iclog buffers. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index d2d59692739f..46b6986e39b0 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2461,14 +2461,27 @@ xlog_state_do_callback( /* - * update the last_sync_lsn before we drop the + * Completion of a iclog IO does not imply that + * a transaction has completed, as transactions + * can be large enough to span many iclogs. We + * cannot change the tail of the log half way + * through a transaction as this may be the only + * transaction in the log and moving the tail to + * point to the middle of it will prevent + * recovery from finding the start of the + * transaction. Hence we should only update the + * last_sync_lsn if this iclog contains + * transaction completion callbacks on it. + * + * We have to do this before we drop the * icloglock to ensure we are the only one that * can update it. */ ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - atomic64_set(&log->l_last_sync_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)); + if (iclog->ic_callback) + atomic64_set(&log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)); } else ioerrors++; From a00416844b8f4b0106344bdfd90fe45a854b1d05 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 20 Sep 2012 13:16:45 -0500 Subject: [PATCH 15/78] xfs: zero allocation_args on the kernel stack Zero the kernel stack space that makes up the xfs_alloc_arg structures.
Signed-off-by: Mark Tinguely Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 1 + fs/xfs/xfs_bmap.c | 3 +++ fs/xfs/xfs_ialloc.c | 1 + 3 files changed, 5 insertions(+) diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 4f33c32affe3..0287f3b1b503 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1866,6 +1866,7 @@ xfs_alloc_fix_freelist( /* * Initialize the args structure. */ + memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; targs.agbp = agbp; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 848ffa77707b..e1545ec2f7d2 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2437,6 +2437,7 @@ xfs_bmap_btalloc( * Normal allocation, done through xfs_alloc_vextent. */ tryagain = isaligned = 0; + memset(&args, 0, sizeof(args)); args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; @@ -3082,6 +3083,7 @@ xfs_bmap_extents_to_btree( * Convert to a btree with two levels, one record in root. */ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = mp; args.firstblock = *firstblock; @@ -3237,6 +3239,7 @@ xfs_bmap_local_to_extents( xfs_buf_t *bp; /* buffer for extent block */ xfs_bmbt_rec_host_t *ep;/* extent record pointer */ + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; args.firstblock = *firstblock; diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 445bf1aef31c..c5c4ef4f2bdb 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -250,6 +250,7 @@ xfs_ialloc_ag_alloc( /* boundary */ struct xfs_perag *pag; + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = tp->t_mountp; From 2455881c0b52f87be539c4c7deab1afff4d8a560 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 5 Oct 2012 11:06:58 +1000 Subject: [PATCH 16/78] xfs: introduce XFS_BMAPI_STACK_SWITCH Certain allocation paths through xfs_bmapi_write() are in situations where we have limited stack available. 
These are almost always in the buffered IO writeback path when converting delayed allocation extents to real extents. The current stack switch occurs for userdata allocations, which means we also do stack switches for preallocation, direct IO and unwritten extent conversion, even though these call chains have never been implicated in a stack overrun. Hence, let's target just the single stack overrun offender for stack switches. To do that, introduce a XFS_BMAPI_STACK_SWITCH flag that the caller can pass xfs_bmapi_write() to indicate it should switch stacks if it needs to do allocation. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 2 +- fs/xfs/xfs_alloc.h | 1 + fs/xfs/xfs_bmap.c | 4 ++++ fs/xfs/xfs_bmap.h | 5 ++++- fs/xfs/xfs_iomap.c | 4 +++- 5 files changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 0287f3b1b503..43f791bcd8b1 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2447,7 +2447,7 @@ xfs_alloc_vextent( { DECLARE_COMPLETION_ONSTACK(done); - if (!args->userdata) + if (!args->stack_switch) return __xfs_alloc_vextent(args); diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 93be4a667ca1..ef7d4885dc2d 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -123,6 +123,7 @@ typedef struct xfs_alloc_arg { struct completion *done; struct work_struct work; int result; + char stack_switch; } xfs_alloc_arg_t; /* diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index e1545ec2f7d2..91259554df8b 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2441,6 +2441,7 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; + args.stack_switch = ap->stack_switch; /* Trim the allocation back to the maximum an AG can fit.
*/ args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); @@ -4675,6 +4676,9 @@ xfs_bmapi_allocate( return error; } + if (flags & XFS_BMAPI_STACK_SWITCH) + bma->stack_switch = 1; + error = xfs_bmap_alloc(bma); if (error) return error; diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 803b56d7ce16..b68c598034c1 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -77,6 +77,7 @@ typedef struct xfs_bmap_free * from written to unwritten, otherwise convert from unwritten to written. */ #define XFS_BMAPI_CONVERT 0x040 +#define XFS_BMAPI_STACK_SWITCH 0x080 #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -85,7 +86,8 @@ typedef struct xfs_bmap_free { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ - { XFS_BMAPI_CONVERT, "CONVERT" } + { XFS_BMAPI_CONVERT, "CONVERT" }, \ + { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" } static inline int xfs_bmapi_aflag(int w) @@ -133,6 +135,7 @@ typedef struct xfs_bmalloca { char userdata;/* set if is user data */ char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ + char stack_switch; } xfs_bmalloca_t; /* diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f858b903678e..a066cf1766ab 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -575,7 +575,9 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, 0, &first_block, 1, + count_fsb, + XFS_BMAPI_STACK_SWITCH, + &first_block, 1, imap, &nimaps, &free_list); if (error) goto trans_cancel; From e04426b9202bccd4cfcbc70b2fa2aeca1c86d8f5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 5 Oct 2012 11:06:59 +1000 Subject: [PATCH 17/78] xfs: move allocation stack switch up to xfs_bmapi_allocate Switching stacks in xfs_alloc_vextent can cause deadlocks when we run out of worker threads on the allocation workqueue.
This can occur because xfs_bmap_btalloc can make multiple calls to xfs_alloc_vextent() and even if xfs_alloc_vextent() fails it can return with the AGF locked in the current allocation transaction. If we then need to make another allocation, and all the allocation worker contexts are exhausted because they are blocked waiting for the AGF lock, the holder of the AGF cannot get its xfs_alloc_vextent work completed to release the AGF. Hence allocation effectively deadlocks. To avoid this, move the stack switch one layer up to xfs_bmapi_allocate() so that all of the allocation attempts in a single switched stack transaction occur in a single worker context. This avoids the problem of an allocation being blocked waiting for a worker thread whilst holding the AGF. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 42 +------------------------------- fs/xfs/xfs_alloc.h | 4 ---- fs/xfs/xfs_bmap.c | 60 +++++++++++++++++++++++++++++++++++++--------- fs/xfs/xfs_bmap.h | 4 ++++ 4 files changed, 54 insertions(+), 56 deletions(-) diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 43f791bcd8b1..335206a9c698 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2208,7 +2208,7 @@ xfs_alloc_read_agf( * group or loop over the allocation groups to find the result.
*/ int /* error */ -__xfs_alloc_vextent( +xfs_alloc_vextent( xfs_alloc_arg_t *args) /* allocation argument structure */ { xfs_agblock_t agsize; /* allocation group size */ @@ -2418,46 +2418,6 @@ error0: return error; } -static void -xfs_alloc_vextent_worker( - struct work_struct *work) -{ - struct xfs_alloc_arg *args = container_of(work, - struct xfs_alloc_arg, work); - unsigned long pflags; - - /* we are in a transaction context here */ - current_set_flags_nested(&pflags, PF_FSTRANS); - - args->result = __xfs_alloc_vextent(args); - complete(args->done); - - current_restore_flags_nested(&pflags, PF_FSTRANS); -} - -/* - * Data allocation requests often come in with little stack to work on. Push - * them off to a worker thread so there is lots of stack to use. Metadata - * requests, OTOH, are generally from low stack usage paths, so avoid the - * context switch overhead here. - */ -int -xfs_alloc_vextent( - struct xfs_alloc_arg *args) -{ - DECLARE_COMPLETION_ONSTACK(done); - - if (!args->stack_switch) - return __xfs_alloc_vextent(args); - - - args->done = &done; - INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker); - queue_work(xfs_alloc_wq, &args->work); - wait_for_completion(&done); - return args->result; -} - /* * Free an extent. 
* Just break up the extent address and hand off to xfs_free_ag_extent diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index ef7d4885dc2d..feacb061bab7 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -120,10 +120,6 @@ typedef struct xfs_alloc_arg { char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* set if this is user data */ xfs_fsblock_t firstblock; /* io first block allocated */ - struct completion *done; - struct work_struct work; - int result; - char stack_switch; } xfs_alloc_arg_t; /* diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 91259554df8b..83d0cf3df930 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2441,7 +2441,6 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; - args.stack_switch = ap->stack_switch; /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); @@ -4620,12 +4619,11 @@ xfs_bmapi_delay( STATIC int -xfs_bmapi_allocate( - struct xfs_bmalloca *bma, - int flags) +__xfs_bmapi_allocate( + struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; - int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; @@ -4658,25 +4656,25 @@ xfs_bmapi_allocate( * Indicate if this is the first user data in the file, or just any * user data. */ - if (!(flags & XFS_BMAPI_METADATA)) { + if (!(bma->flags & XFS_BMAPI_METADATA)) { bma->userdata = (bma->offset == 0) ? XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; } - bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1; + bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; /* * Only want to do the alignment at the eof if it is userdata and * allocation length is larger than a stripe unit. 
*/ if (mp->m_dalign && bma->length >= mp->m_dalign && - !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { + !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { error = xfs_bmap_isaeof(bma, whichfork); if (error) return error; } - if (flags & XFS_BMAPI_STACK_SWITCH) + if (bma->flags & XFS_BMAPI_STACK_SWITCH) bma->stack_switch = 1; error = xfs_bmap_alloc(bma); @@ -4713,7 +4711,7 @@ xfs_bmapi_allocate( * A wasdelay extent has been initialized, so shouldn't be flagged * as unwritten. */ - if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) && + if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && xfs_sb_version_hasextflgbit(&mp->m_sb)) bma->got.br_state = XFS_EXT_UNWRITTEN; @@ -4741,6 +4739,45 @@ xfs_bmapi_allocate( return 0; } +static void +xfs_bmapi_allocate_worker( + struct work_struct *work) +{ + struct xfs_bmalloca *args = container_of(work, + struct xfs_bmalloca, work); + unsigned long pflags; + + /* we are in a transaction context here */ + current_set_flags_nested(&pflags, PF_FSTRANS); + + args->result = __xfs_bmapi_allocate(args); + complete(args->done); + + current_restore_flags_nested(&pflags, PF_FSTRANS); +} + +/* + * Some allocation requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. Otherwise just + * call directly to avoid the context switch overhead here. 
+ */ +int +xfs_bmapi_allocate( + struct xfs_bmalloca *args) +{ + DECLARE_COMPLETION_ONSTACK(done); + + if (!args->stack_switch) + return __xfs_bmapi_allocate(args); + + + args->done = &done; + INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); + queue_work(xfs_alloc_wq, &args->work); + wait_for_completion(&done); + return args->result; +} + STATIC int xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, @@ -4926,6 +4963,7 @@ xfs_bmapi_write( bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; bma.offset = bno; + bma.flags = flags; /* * There's a 32/64 bit type mismatch between the @@ -4941,7 +4979,7 @@ xfs_bmapi_write( ASSERT(len > 0); ASSERT(bma.length > 0); - error = xfs_bmapi_allocate(&bma, flags); + error = xfs_bmapi_allocate(&bma); if (error) goto error0; if (bma.blkno == NULLFSBLOCK) diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index b68c598034c1..5f469c3516eb 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -136,6 +136,10 @@ typedef struct xfs_bmalloca { char aeof; /* allocated space at eof */ char conv; /* overwriting unwritten extents */ char stack_switch; + int flags; + struct completion *done; + struct work_struct work; + int result; } xfs_bmalloca_t; /* From 1375cb65e87b327a8dd4f920c3e3d837fb40e9c2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 9 Oct 2012 14:50:52 +1100 Subject: [PATCH 18/78] xfs: growfs: don't read garbage for new secondary superblocks When updating new secondary superblocks in a growfs operation, the superblock buffer is read from the newly grown region of the underlying device. This is not guaranteed to be zero, so violates the underlying assumption that the unused parts of superblocks are zero filled. Get a new buffer for these secondary superblocks to ensure that the unused regions are zero filled correctly. 
Signed-off-by: Dave Chinner Reviewed-by: Carlos Maiolino Signed-off-by: Ben Myers --- fs/xfs/xfs_fsops.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index c25b094efbf7..4beaede43277 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -399,9 +399,26 @@ xfs_growfs_data_private( /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + error = 0; + /* + * new secondary superblocks need to be zeroed, not read from + * disk as the contents of the new area we are growing into is + * completely unknown. + */ + if (agno < oagcount) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp); + } else { + bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0); + if (bp) + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + else + error = ENOMEM; + } + if (error) { xfs_warn(mp, "error %d reading secondary superblock for ag %d", @@ -423,7 +440,7 @@ xfs_growfs_data_private( break; /* no point in continuing */ } } - return 0; + return error; error0: xfs_trans_cancel(tp, XFS_TRANS_ABORT); From 531c3bdc8662e1a83f8ec80dc3346b1284877c0a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 25 Oct 2012 17:22:30 +1100 Subject: [PATCH 19/78] xfs: silence uninitialised f.file warning. Uninitialised variable build warning introduced by 2903ff0 ("switch simple cases of fget_light to fdget"), gcc is not smart enough to work out that the variable is not used uninitialised, and the commit removed the initialisation at declaration that the old variable had. 
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 8305f2ac6773..c1df3c623de2 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -70,7 +70,7 @@ xfs_find_handle( int hsize; xfs_handle_t handle; struct inode *inode; - struct fd f; + struct fd f = {0}; struct path path; int error; struct xfs_inode *ip; From cd856db69c88db438215244571957d812bdc6813 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Sat, 20 Oct 2012 11:08:19 -0300 Subject: [PATCH 20/78] xfs: Update inode alloc comments I found some out of date comments while studying the inode allocation code, so I believe it's worth to have these comments updated. It basically rewrites the comment regarding to "call_again" variable, which is not used anymore, but instead, callers of xfs_ialloc() decides if it needs to be called again relying only if ialloc_context is NULL or not. Also did some small changes in another comment that I thought to be pertinent to the current behaviour of these functions and some alignment on both comments. Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 6 +++--- fs/xfs/xfs_inode.c | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index c5c4ef4f2bdb..37753e1c8537 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -877,9 +877,9 @@ error0: * This function is designed to be called twice if it has to do an allocation * to make more free inodes. On the first call, *IO_agbp should be set to NULL. * If an inode is available without having to performn an allocation, an inode - * number is returned. In this case, *IO_agbp would be NULL. If an allocation - * needes to be done, xfs_dialloc would return the current AGI buffer in - * *IO_agbp. 
The caller should then commit the current transaction, allocate a + * number is returned. In this case, *IO_agbp is set to NULL. If an allocation + * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp. + * The caller should then commit the current transaction, allocate a * new transaction, and call xfs_dialloc() again, passing in the previous value * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI * buffer is locked across the two calls, the second call is guaranteed to have diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index bba8f37525b3..95f7a73b05cb 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1104,16 +1104,16 @@ xfs_iread_extents( * set according to the contents of the given cred structure. * * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() - * has a free inode available, call xfs_iget() - * to obtain the in-core version of the allocated inode. Finally, - * fill in the inode and log its initial contents. In this case, - * ialloc_context would be set to NULL and call_again set to false. + * has a free inode available, call xfs_iget() to obtain the in-core + * version of the allocated inode. Finally, fill in the inode and + * log its initial contents. In this case, ialloc_context would be + * set to NULL. * - * If xfs_dialloc() does not have an available inode, - * it will replenish its supply by doing an allocation. Since we can - * only do one allocation within a transaction without deadlocks, we - * must commit the current transaction before returning the inode itself. - * In this case, therefore, we will set call_again to true and return. + * If xfs_dialloc() does not have an available inode, it will replenish + * its supply by doing an allocation. Since we can only do one + * allocation within a transaction without deadlocks, we must commit + * the current transaction before returning the inode itself. + * In this case, therefore, we will set ialloc_context and return. 
* The caller should then commit the current transaction, start a new * transaction, and call xfs_ialloc() again to actually get the inode. * From c99abb8f560798625323c7b21d5636259017825d Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Thu, 18 Oct 2012 12:28:58 -0300 Subject: [PATCH 21/78] xfs: Update mount options documentation Once inode64 is the default allocation mode now, kernel documentation should be updated to match this behaviour. Signed-off-by: Carlos Maiolino Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- Documentation/filesystems/xfs.txt | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt index 3fc0c31a6f5d..17187750270a 100644 --- a/Documentation/filesystems/xfs.txt +++ b/Documentation/filesystems/xfs.txt @@ -72,8 +72,15 @@ When mounting an XFS filesystem, the following options are accepted. Indicates that XFS is allowed to create inodes at any location in the filesystem, including those which will result in inode numbers occupying more than 32 bits of significance. This is - provided for backwards compatibility, but causes problems for - backup applications that cannot handle large inode numbers. + the default allocation option. Applications which do not handle + inode numbers bigger than 32 bits, should use inode32 option. + + inode32 + Indicates that XFS is limited to create inodes at locations which + will not result in inode numbers with more than 32 bits of + significance. This is provided for backwards compatibility, since + 64 bits inode numbers might cause problems for some applications + that cannot handle large inode numbers. 
largeio/nolargeio If "nolargeio" is specified, the optimal I/O reported in From 4c05f9ad4d168098b7ce3ffa7098283f94811ed6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 11:38:41 +1100 Subject: [PATCH 22/78] xfs: invalidate allocbt blocks moved to the free list When we free a block from the alloc btree, we move it to the freelist held in the AGFL and mark it busy in the busy extent tree. This typically happens when we merge btree blocks. Once the transaction is committed and checkpointed, the block can remain on the free list for an indefinite amount of time. Now, this isn't the end of the world at this point - if the free list is shortened, the buffer is invalidated in the transaction that moves it back to free space. If the buffer is allocated as metadata from the free list, then all the modifications get logged, and we have no issues, either. And if it gets allocated as userdata direct from the freelist, it gets invalidated and so will never get written. However, during the time it sits on the free list, pressure on the log can cause the AIL to be pushed and the buffer that covers the block gets pushed for write. IOWs, we end up writing a freed metadata block to disk. Again, this isn't the end of the world because we know from the above we are only writing to free space. The problem, however, is for validation callbacks. If the block was an old btree root block, then the level of the block is going to be higher than the current tree root, and so will fail validation. There may be other inconsistencies in the block as well, and currently we don't care because the block is in free space. Shutting down the filesystem because a freed block doesn't pass write validation, OTOH, is rather unfriendly. So, make sure we always invalidate buffers as they move from the free space trees to the free list so that we guarantee they never get written to disk while on the free list.
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc_btree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index f1647caace8f..f7876c6d6165 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -121,6 +121,8 @@ xfs_allocbt_free_block( xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); + + xfs_trans_binval(cur->bc_tp, bp); return 0; } From b6aff29f3af7437635ec3d66af9115bb17ba561f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 11:38:42 +1100 Subject: [PATCH 23/78] xfs: don't vmap inode cluster buffers during free Inode buffers do not need to be mapped as inodes are read or written directly from/to the pages underlying the buffer. This fixes a regression introduced by commit 611c994 ("xfs: make XBF_MAPPED the default behaviour"). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 95f7a73b05cb..965598eb308c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1760,7 +1760,8 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, 0); + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); if (!bp) return ENOMEM; From 137fff09b7924507871f8e6294dfe57b7a880332 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 14:23:12 +1100 Subject: [PATCH 24/78] xfs: fix buffer shudown reference count mismatch When we shut down the filesystem, we have to unpin and free all the buffers currently active in the CIL. 
To do this we unpin and remove them in one operation as a result of a failed iclogbuf write. For buffers, we do this removal via a simulated IO completion after marking the buffer stale. At the time we do this, we have two references to the buffer - the active LRU reference and the buf log item. The LRU reference is removed by marking the buffer stale, and the active CIL reference is removed by the xfs_buf_iodone() callback that is run by xfs_buf_do_callbacks() during ioend processing (via the bp->b_iodone callback). However, ioend processing requires one more reference - that of the IO that it is completing. We don't have this reference, so we free the buffer prematurely and use it after it is freed. For buffers marked with XBF_ASYNC, this leads to assert failures in xfs_buf_rele() on debug kernels because the b_hold count is zero. Fix this by making sure we take the necessary IO reference before starting IO completion processing on the stale buffer, and set the XBF_ASYNC flag to ensure that IO completion processing removes all the active references from the buffer to ensure it is fully torn down. Cc: Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_buf_item.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a8d0ed911196..becf4a97efc6 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -526,7 +526,25 @@ xfs_buf_item_unpin( } xfs_buf_relse(bp); } else if (freed && remove) { + /* + * There are currently two references to the buffer - the active + * LRU reference and the buf log item. What we are about to do + * here - simulate a failed IO completion - requires 3 + * references. + * + * The LRU reference is removed by the xfs_buf_stale() call.
The + * buf item reference is removed by the xfs_buf_iodone() + * callback that is run by xfs_buf_do_callbacks() during ioend + * processing (via the bp->b_iodone callback), and then finally + * the ioend processing will drop the IO reference if the buffer + * is marked XBF_ASYNC. + * + * Hence we need to take an additional reference here so that IO + * completion processing doesn't free the buffer prematurely. + */ xfs_buf_lock(bp); + xfs_buf_hold(bp); + bp->b_flags |= XBF_ASYNC; xfs_buf_ioerror(bp, EIO); XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); From 009507b052fa391618eccf9e8c9f484407fd9018 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 2 Nov 2012 11:38:44 +1100 Subject: [PATCH 25/78] xfs: fix reading of wrapped log data Commit 4439647 ("xfs: reset buffer pointers before freeing them") in 3.0-rc1 introduced a regression when recovering log buffers that wrapped around the end of log. The second part of the log buffer at the start of the physical log was being read into the header buffer rather than the data buffer, and hence recovery was seeing garbage in the data buffer when it got to the region of the log buffer that was incorrectly read. Cc: # 3.0.x, 3.2.x, 3.4.x 3.6.x Reported-by: Torsten Kaiser Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log_recover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 651c98859b04..3e06333d4bd1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3542,7 +3542,7 @@ xlog_do_recovery_pass( * - order is important. 
*/ error = xlog_bread_offset(log, 0, - bblks - split_bblks, hbp, + bblks - split_bblks, dbp, offset + BBTOB(split_bblks)); if (error) goto bread_err2; From 69a58a43f74eb2cb23d9bce2524dae33c289a40f Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 9 Oct 2012 14:11:45 -0500 Subject: [PATCH 26/78] xfs: report projid32bit feature in geometry call When xfs gained the projid32bit feature, it was never added to the FSGEOMETRY ioctl feature flags, so it's not queryable without this patch. Signed-off-by: Eric Sandeen Reviewed-by: Carlos Maiolino Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_fs.h | 3 ++- fs/xfs/xfs_fsops.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index c13fed8c394a..0948c043443b 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -233,7 +233,8 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ #define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ #define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ -#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ +#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 4beaede43277..7b0a997cf62b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -97,7 +97,9 @@ xfs_fs_geometry( (xfs_sb_version_haslazysbcount(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | (xfs_sb_version_hasattr2(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_ATTR2 : 0); + XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | + (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_PROJID32 : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? 
mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; From 27b52867925e3aaed090063c1c58a7537e6373f3 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:38 -0500 Subject: [PATCH 27/78] xfs: add EOFBLOCKS inode tagging/untagging Add the XFS_ICI_EOFBLOCKS_TAG inode tag to identify inodes with speculatively preallocated blocks beyond EOF. An inode is tagged when speculative preallocation occurs and untagged either via truncate down or when post-EOF blocks are freed via release or reclaim. The tag management is intentionally not aggressive to prefer simplicity over the complexity of handling all the corner cases under which post-EOF blocks could be freed (i.e., forward truncation, fallocate, write error conditions, etc.). This means that a tagged inode may or may not have post-EOF blocks after a period of time. The tag is eventually cleared when the inode is released or reclaimed. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_ag.h | 1 + fs/xfs/xfs_icache.c | 62 +++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_icache.h | 3 +++ fs/xfs/xfs_iomap.c | 8 ++++++ fs/xfs/xfs_iops.c | 4 +++ fs/xfs/xfs_trace.h | 5 ++++ fs/xfs/xfs_vnodeops.c | 2 ++ 7 files changed, 85 insertions(+) diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 44d65c1533c0..22bd4db011c8 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -233,6 +233,7 @@ typedef struct xfs_perag { #define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup in xfs_inode_ag_iterator */ #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ +#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ #define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) #define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9c8703b5cd72..f9afc5ff0482 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1128,3 +1128,65 @@ xfs_reclaim_inodes_count( 
return reclaimable; } +void +xfs_inode_set_eofblocks_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + int tagged; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + trace_xfs_inode_set_eofblocks_tag(ip); + + tagged = radix_tree_tagged(&pag->pag_ici_root, + XFS_ICI_EOFBLOCKS_TAG); + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + if (!tagged) { + /* propagate the eofblocks tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + + trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +void +xfs_inode_clear_eofblocks_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + trace_xfs_inode_clear_eofblocks_tag(ip); + + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) { + /* clear the eofblocks tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 222e22f16b4a..db3613075dc6 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -35,6 +35,9 @@ void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); 
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); +void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); + int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index a066cf1766ab..add06b4e9a63 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -41,6 +41,7 @@ #include "xfs_utils.h" #include "xfs_iomap.h" #include "xfs_trace.h" +#include "xfs_icache.h" #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ @@ -450,6 +451,13 @@ retry: if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) return xfs_alert_fsblock_zero(ip, &imap[0]); + /* + * Tag the inode as speculatively preallocated so we can reclaim this + * space on demand, if necessary. + */ + if (prealloc) + xfs_inode_set_eofblocks_tag(ip); + *ret_imap = imap[0]; return 0; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 4e00cf091d2c..81f5c4953287 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -38,6 +38,7 @@ #include "xfs_vnodeops.h" #include "xfs_inode_item.h" #include "xfs_trace.h" +#include "xfs_icache.h" #include #include @@ -854,6 +855,9 @@ xfs_setattr_size( * and do not wait the usual (long) time for writeout. */ xfs_iflags_set(ip, XFS_ITRUNCATED); + + /* A truncate down always removes post-EOF blocks. 
*/ + xfs_inode_clear_eofblocks_tag(ip); } if (mask & ATTR_CTIME) { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 7d36ccf57f93..6f46e034b766 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -130,6 +130,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); TRACE_EVENT(xfs_attr_list_node_descend, TP_PROTO(struct xfs_attr_list_context *ctx, @@ -585,6 +587,9 @@ DEFINE_INODE_EVENT(xfs_update_time); DEFINE_INODE_EVENT(xfs_dquot_dqalloc); DEFINE_INODE_EVENT(xfs_dquot_dqdetach); +DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); + DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), TP_ARGS(ip, caller_ip), diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 2ee1f49da0aa..e6e1d11dfdf2 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -238,6 +238,8 @@ xfs_free_eofblocks( } else { error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (!error) + xfs_inode_clear_eofblocks_tag(ip); } xfs_iunlock(ip, XFS_ILOCK_EXCL); From a454f7428ffa03c8e1321124d9074101b7290be6 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:39 -0500 Subject: [PATCH 28/78] xfs: support a tag-based inode_ag_iterator Genericize xfs_inode_ag_walk() to support an optional radix tree tag and args argument for the execute function. Create a new wrapper called xfs_inode_ag_iterator_tag() that performs a tag based walk of perag's and inodes. 
Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_icache.c | 56 +++++++++++++++++++++++++++++++++++----- fs/xfs/xfs_icache.h | 9 +++++-- fs/xfs/xfs_qm_syscalls.c | 5 ++-- 3 files changed, 59 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f9afc5ff0482..2a96dc48ebe6 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -516,8 +516,11 @@ xfs_inode_ag_walk( struct xfs_mount *mp, struct xfs_perag *pag, int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags), - int flags) + struct xfs_perag *pag, int flags, + void *args), + int flags, + void *args, + int tag) { uint32_t first_index; int last_error = 0; @@ -536,9 +539,17 @@ restart: int i; rcu_read_lock(); - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + + if (tag == -1) + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)batch, first_index, XFS_LOOKUP_BATCH); + else + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **) batch, first_index, + XFS_LOOKUP_BATCH, tag); + if (!nr_found) { rcu_read_unlock(); break; @@ -579,7 +590,7 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - error = execute(batch[i], pag, flags); + error = execute(batch[i], pag, flags, args); IRELE(batch[i]); if (error == EAGAIN) { skipped++; @@ -608,8 +619,10 @@ int xfs_inode_ag_iterator( struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags), - int flags) + struct xfs_perag *pag, int flags, + void *args), + int flags, + void *args) { struct xfs_perag *pag; int error = 0; @@ -619,7 +632,36 @@ xfs_inode_ag_iterator( ag = 0; while ((pag = xfs_perag_get(mp, ag))) { ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags); + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == EFSCORRUPTED) + break; + } + } + return 
XFS_ERROR(last_error); +} + +int +xfs_inode_ag_iterator_tag( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, + struct xfs_perag *pag, int flags, + void *args), + int flags, + void *args, + int tag) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + + ag = 0; + while ((pag = xfs_perag_get_tag(mp, ag, tag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); xfs_perag_put(pag); if (error) { last_error = error; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index db3613075dc6..54c113478dfc 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -40,7 +40,12 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags); + int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, + int flags, void *args), + int flags, void *args); +int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, + int flags, void *args), + int flags, void *args, int tag); #endif diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 7a9071f8855f..5f53e75409b8 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -846,7 +846,8 @@ STATIC int xfs_dqrele_inode( struct xfs_inode *ip, struct xfs_perag *pag, - int flags) + int flags, + void *args) { /* skip quota inodes */ if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || @@ -882,5 +883,5 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags); + xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL); } From 72b53efa4a6125a4c334871c58268c430605819a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:40 -0500 Subject: [PATCH 29/78] xfs: create helper to check whether to free eofblocks on 
inode This check is used in multiple places to determine whether we should check for (and potentially free) post EOF blocks on an inode. Add a helper to consolidate the check. Note that when we remove an inode from the cache (xfs_inactive()), we are required to trim post-EOF blocks even if the inode is marked preallocated or append-only to maintain correct space accounting. The 'force' parameter to xfs_can_free_eofblocks() specifies whether we should ignore the prealloc/append-only status of the inode. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 37 +++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_inode.h | 1 + fs/xfs/xfs_vnodeops.c | 19 +++++++------------ 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 965598eb308c..7449cb943efd 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3912,3 +3912,40 @@ xfs_iext_irec_update_extoffs( ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; } } + +/* + * Test whether it is appropriate to check an inode for and free post EOF + * blocks. The 'force' parameter determines whether we should also consider + * regular files that are marked preallocated or append-only. + */ +bool +xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) +{ + /* prealloc/delalloc exists only on regular files */ + if (!S_ISREG(ip->i_d.di_mode)) + return false; + + /* + * Zero sized files with no cached pages and delalloc blocks will not + * have speculative prealloc/delalloc blocks to remove. + */ + if (VFS_I(ip)->i_size == 0 && + VN_CACHED(VFS_I(ip)) == 0 && + ip->i_delayed_blks == 0) + return false; + + /* If we haven't read in the extent list, then don't do it now. */ + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) + return false; + + /* + * Do not free real preallocated or append-only files unless the file + * has delalloc blocks and we are forced to remove them. 
+ */ + if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) + if (!force || ip->i_delayed_blks == 0) + return false; + + return true; +} + diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1fc2065e010b..21b4de3df716 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -585,6 +585,7 @@ void xfs_iext_irec_compact(xfs_ifork_t *); void xfs_iext_irec_compact_pages(xfs_ifork_t *); void xfs_iext_irec_compact_full(xfs_ifork_t *); void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); +bool xfs_can_free_eofblocks(struct xfs_inode *, bool); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index e6e1d11dfdf2..c4c153900205 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -436,11 +436,7 @@ xfs_release( if (ip->i_d.di_nlink == 0) return 0; - if ((S_ISREG(ip->i_d.di_mode) && - (VFS_I(ip)->i_size > 0 || - (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS)) && - (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { + if (xfs_can_free_eofblocks(ip, false)) { /* * If we can't get the iolock just skip truncating the blocks @@ -516,13 +512,12 @@ xfs_inactive( goto out; if (ip->i_d.di_nlink != 0) { - if ((S_ISREG(ip->i_d.di_mode) && - (VFS_I(ip)->i_size > 0 || - (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS) && - (!(ip->i_d.di_flags & - (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || - ip->i_delayed_blks != 0))) { + /* + * force is true because we are evicting an inode from the + * cache. Post-eof blocks must be freed, lest we end up with + * broken free space accounting. 
+ */ + if (xfs_can_free_eofblocks(ip, true)) { error = xfs_free_eofblocks(mp, ip, false); if (error) return VN_INACTIVE_CACHE; From 40165e27617e2a98bf8588001d2f2872fae2fee2 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:41 -0500 Subject: [PATCH 30/78] xfs: make xfs_free_eofblocks() non-static, return EAGAIN on trylock failure Turn xfs_free_eofblocks() into a non-static function, return EAGAIN to indicate trylock failure and make sure this error is not propagated in xfs_release(). Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_vnodeops.c | 6 +++--- fs/xfs/xfs_vnodeops.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c4c153900205..c2ddd7a43942 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -151,7 +151,7 @@ xfs_readlink( * when the link count isn't zero and by xfs_dm_punch_hole() when * punching a hole to EOF. 
*/ -STATIC int +int xfs_free_eofblocks( xfs_mount_t *mp, xfs_inode_t *ip, @@ -200,7 +200,7 @@ xfs_free_eofblocks( if (need_iolock) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { xfs_trans_cancel(tp, 0); - return 0; + return EAGAIN; } } @@ -463,7 +463,7 @@ xfs_release( return 0; error = xfs_free_eofblocks(mp, ip, true); - if (error) + if (error && error != EAGAIN) return error; /* delalloc blocks after truncation means it really is dirty */ diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 447e146b2ba6..52fafc416a0c 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -57,5 +57,6 @@ int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); +int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); #endif /* _XFS_VNODEOPS_H */ From 41176a68e3f710630feace536d0277a092e206b5 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:42 -0500 Subject: [PATCH 31/78] xfs: create function to scan and clear EOFBLOCKS inodes xfs_inodes_free_eofblocks() implements scanning functionality for EOFBLOCKS inodes. It uses the AG iterator to walk the tagged inodes and free post-EOF blocks via the xfs_inode_free_eofblocks() execute function. The scan can be invoked in best-effort mode or wait (force) mode. A best-effort scan (default) handles all inodes that do not have a dirty cache and we successfully acquire the io lock via trylock. In wait mode, we continue to cycle through an AG until all inodes are handled. 
Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_icache.c | 43 +++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_icache.h | 1 + fs/xfs/xfs_trace.h | 1 + 3 files changed, 45 insertions(+) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 2a96dc48ebe6..d115cb44b103 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1170,6 +1170,49 @@ xfs_reclaim_inodes_count( return reclaimable; } +STATIC int +xfs_inode_free_eofblocks( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags, + void *args) +{ + int ret; + + if (!xfs_can_free_eofblocks(ip, false)) { + /* inode could be preallocated or append-only */ + trace_xfs_inode_free_eofblocks_invalid(ip); + xfs_inode_clear_eofblocks_tag(ip); + return 0; + } + + /* + * If the mapping is dirty the operation can block and wait for some + * time. Unless we are waiting, skip it. + */ + if (!(flags & SYNC_WAIT) && + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) + return 0; + + ret = xfs_free_eofblocks(ip->i_mount, ip, true); + + /* don't revisit the inode if we're not waiting */ + if (ret == EAGAIN && !(flags & SYNC_WAIT)) + ret = 0; + + return ret; +} + +int +xfs_icache_free_eofblocks( + struct xfs_mount *mp, + int flags) +{ + ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); + return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, + NULL, XFS_ICI_EOFBLOCKS_TAG); +} + void xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 54c113478dfc..cb6b8d0eee61 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -37,6 +37,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); +int xfs_icache_free_eofblocks(struct xfs_mount *, int); int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, diff --git 
a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 6f46e034b766..cb5234632072 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -589,6 +589,7 @@ DEFINE_INODE_EVENT(xfs_dquot_dqdetach); DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), From 8ca149de80478441352a8622ea15fae7de703ced Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 7 Nov 2012 12:21:12 -0500 Subject: [PATCH 32/78] xfs: add XFS_IOC_FREE_EOFBLOCKS ioctl The XFS_IOC_FREE_EOFBLOCKS ioctl allows users to invoke an EOFBLOCKS scan. The xfs_eofblocks structure is defined to support the command parameters (scan mode). Signed-off-by: Brian Foster Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_fs.h | 17 +++++++++++++++++ fs/xfs/xfs_icache.c | 10 +++++++--- fs/xfs/xfs_icache.h | 2 +- fs/xfs/xfs_ioctl.c | 20 ++++++++++++++++++++ 4 files changed, 45 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 0948c043443b..0cfa30813b16 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -339,6 +339,22 @@ typedef struct xfs_error_injection { } xfs_error_injection_t; +/* + * Speculative preallocation trimming. + */ +#define XFS_EOFBLOCKS_VERSION 1 +struct xfs_eofblocks { + __u32 eof_version; + __u32 eof_flags; + __u64 pad[15]; +}; + +/* eof_flags values */ +#define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */ +#define XFS_EOF_FLAGS_VALID \ + (XFS_EOF_FLAGS_SYNC) + + /* * The user-level Handle Request interface structure. 
*/ @@ -457,6 +473,7 @@ typedef struct xfs_handle { /* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ #define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) #define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) +#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_eofblocks) /* * ioctl commands that replace IRIX syssgi()'s diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index d115cb44b103..fbb74c715266 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1206,11 +1206,15 @@ xfs_inode_free_eofblocks( int xfs_icache_free_eofblocks( struct xfs_mount *mp, - int flags) + struct xfs_eofblocks *eofb) { - ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); + int flags = SYNC_TRYLOCK; + + if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) + flags = SYNC_WAIT; + return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, - NULL, XFS_ICI_EOFBLOCKS_TAG); + eofb, XFS_ICI_EOFBLOCKS_TAG); } void diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index cb6b8d0eee61..4934a77024cf 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -37,7 +37,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); -int xfs_icache_free_eofblocks(struct xfs_mount *, int); +int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index c1df3c623de2..5b20ab0b4f9d 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -42,6 +42,7 @@ #include "xfs_inode_item.h" #include "xfs_export.h" #include "xfs_trace.h" +#include "xfs_icache.h" #include #include @@ -1602,6 +1603,25 @@ xfs_file_ioctl( error = xfs_errortag_clearall(mp, 1); return -error; + case XFS_IOC_FREE_EOFBLOCKS: { + struct xfs_eofblocks eofb; + + if (copy_from_user(&eofb, arg, sizeof(eofb))) + return -XFS_ERROR(EFAULT); 
+ + if (eofb.eof_version != XFS_EOFBLOCKS_VERSION) + return -XFS_ERROR(EINVAL); + + if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID) + return -XFS_ERROR(EINVAL); + + if (memchr_inv(eofb.pad, 0, sizeof(eofb.pad))) + return -XFS_ERROR(EINVAL); + + error = xfs_icache_free_eofblocks(mp, &eofb); + return -error; + } + default: return -ENOTTY; } From 3e3f9f5863548e870edfcc72e7617ac8ddcad44a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 7 Nov 2012 12:21:13 -0500 Subject: [PATCH 33/78] xfs: add inode id filtering to eofblocks scan Support inode ID filtering in the eofblocks scan. The caller must set the associated XFS_EOF_FLAGS_*ID bit and ID field. Signed-off-by: Brian Foster Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_fs.h | 14 ++++++++++++-- fs/xfs/xfs_icache.c | 22 ++++++++++++++++++++++ fs/xfs/xfs_ioctl.c | 3 ++- 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 0cfa30813b16..a19f9b205c15 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -346,13 +346,23 @@ typedef struct xfs_error_injection { struct xfs_eofblocks { __u32 eof_version; __u32 eof_flags; - __u64 pad[15]; + uid_t eof_uid; + gid_t eof_gid; + prid_t eof_prid; + __u32 pad32; + __u64 pad64[13]; }; /* eof_flags values */ #define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */ +#define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */ +#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */ +#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */ #define XFS_EOF_FLAGS_VALID \ - (XFS_EOF_FLAGS_SYNC) + (XFS_EOF_FLAGS_SYNC | \ + XFS_EOF_FLAGS_UID | \ + XFS_EOF_FLAGS_GID | \ + XFS_EOF_FLAGS_PRID) /* diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index fbb74c715266..b239da91c43b 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1170,6 +1170,21 @@ xfs_reclaim_inodes_count( return reclaimable; } +STATIC int +xfs_inode_match_id( + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) 
+{ + if (eofb->eof_flags & XFS_EOF_FLAGS_UID) + return ip->i_d.di_uid == eofb->eof_uid; + else if (eofb->eof_flags & XFS_EOF_FLAGS_GID) + return ip->i_d.di_gid == eofb->eof_gid; + else if (eofb->eof_flags & XFS_EOF_FLAGS_PRID) + return xfs_get_projid(ip) == eofb->eof_prid; + + return 0; +} + STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, @@ -1178,6 +1193,7 @@ xfs_inode_free_eofblocks( void *args) { int ret; + struct xfs_eofblocks *eofb = args; if (!xfs_can_free_eofblocks(ip, false)) { /* inode could be preallocated or append-only */ @@ -1194,6 +1210,12 @@ xfs_inode_free_eofblocks( mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) return 0; + if (eofb && + (eofb->eof_flags & (XFS_EOF_FLAGS_UID|XFS_EOF_FLAGS_GID| + XFS_EOF_FLAGS_PRID)) && + !xfs_inode_match_id(ip, eofb)) + return 0; + ret = xfs_free_eofblocks(ip->i_mount, ip, true); /* don't revisit the inode if we're not waiting */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 5b20ab0b4f9d..c1c3ef88a260 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1615,7 +1615,8 @@ xfs_file_ioctl( if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID) return -XFS_ERROR(EINVAL); - if (memchr_inv(eofb.pad, 0, sizeof(eofb.pad))) + if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) || + memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64))) return -XFS_ERROR(EINVAL); error = xfs_icache_free_eofblocks(mp, &eofb); From 1b5560488d1ab7c932f6f99385b41116838c3486 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:45 -0500 Subject: [PATCH 34/78] xfs: support multiple inode id filtering in eofblocks scan Enhance the eofblocks scan code to filter based on multiply specified inode id values. When multiple inode id values are specified, only inodes that match all id values are selected. 
Signed-off-by: Brian Foster Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_icache.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index b239da91c43b..32908909815e 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1175,14 +1175,19 @@ xfs_inode_match_id( struct xfs_inode *ip, struct xfs_eofblocks *eofb) { - if (eofb->eof_flags & XFS_EOF_FLAGS_UID) - return ip->i_d.di_uid == eofb->eof_uid; - else if (eofb->eof_flags & XFS_EOF_FLAGS_GID) - return ip->i_d.di_gid == eofb->eof_gid; - else if (eofb->eof_flags & XFS_EOF_FLAGS_PRID) - return xfs_get_projid(ip) == eofb->eof_prid; + if (eofb->eof_flags & XFS_EOF_FLAGS_UID && + ip->i_d.di_uid != eofb->eof_uid) + return 0; - return 0; + if (eofb->eof_flags & XFS_EOF_FLAGS_GID && + ip->i_d.di_gid != eofb->eof_gid) + return 0; + + if (eofb->eof_flags & XFS_EOF_FLAGS_PRID && + xfs_get_projid(ip) != eofb->eof_prid) + return 0; + + return 1; } STATIC int @@ -1210,10 +1215,7 @@ xfs_inode_free_eofblocks( mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) return 0; - if (eofb && - (eofb->eof_flags & (XFS_EOF_FLAGS_UID|XFS_EOF_FLAGS_GID| - XFS_EOF_FLAGS_PRID)) && - !xfs_inode_match_id(ip, eofb)) + if (eofb && !xfs_inode_match_id(ip, eofb)) return 0; ret = xfs_free_eofblocks(ip->i_mount, ip, true); From 00ca79a04bef1a1b30ef8afd992d905b6d986caf Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 7 Nov 2012 12:21:14 -0500 Subject: [PATCH 35/78] xfs: add minimum file size filtering to eofblocks scan Support minimum file size filtering in the eofblocks scan. The caller must set the XFS_EOF_FLAGS_MINFILESIZE flags bit and minimum file size value in bytes. 
Signed-off-by: Brian Foster Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_fs.h | 7 +++++-- fs/xfs/xfs_icache.c | 11 +++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index a19f9b205c15..6dda3f949b04 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -350,7 +350,8 @@ struct xfs_eofblocks { gid_t eof_gid; prid_t eof_prid; __u32 pad32; - __u64 pad64[13]; + __u64 eof_min_file_size; + __u64 pad64[12]; }; /* eof_flags values */ @@ -358,11 +359,13 @@ struct xfs_eofblocks { #define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */ #define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */ #define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */ +#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */ #define XFS_EOF_FLAGS_VALID \ (XFS_EOF_FLAGS_SYNC | \ XFS_EOF_FLAGS_UID | \ XFS_EOF_FLAGS_GID | \ - XFS_EOF_FLAGS_PRID) + XFS_EOF_FLAGS_PRID | \ + XFS_EOF_FLAGS_MINFILESIZE) /* diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 32908909815e..906e6dcd2c55 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1215,8 +1215,15 @@ xfs_inode_free_eofblocks( mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) return 0; - if (eofb && !xfs_inode_match_id(ip, eofb)) - return 0; + if (eofb) { + if (!xfs_inode_match_id(ip, eofb)) + return 0; + + /* skip the inode if the file size is too small */ + if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && + XFS_ISIZE(ip) < eofb->eof_min_file_size) + return 0; + } ret = xfs_free_eofblocks(ip->i_mount, ip, true); From 579b62faa5fb16ffeeb88cda5e2c4e95730881af Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 Nov 2012 09:50:47 -0500 Subject: [PATCH 36/78] xfs: add background scanning to clear eofblocks inodes Create a new mount workqueue and delayed_work to enable background scanning and freeing of eofblocks inodes. 
The scanner kicks in once speculative preallocation occurs and stops requeueing itself when no eofblocks inodes exist. The scan interval is based on the new 'speculative_prealloc_lifetime' tunable (default to 5m). The background scanner performs unfiltered, best effort scans (which skips inodes under lock contention or with a dirty cache mapping). Signed-off-by: Brian Foster Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_globals.c | 4 +++- fs/xfs/xfs_icache.c | 29 +++++++++++++++++++++++++++++ fs/xfs/xfs_icache.h | 1 + fs/xfs/xfs_linux.h | 1 + fs/xfs/xfs_mount.c | 2 ++ fs/xfs/xfs_mount.h | 3 +++ fs/xfs/xfs_super.c | 9 +++++++++ fs/xfs/xfs_sysctl.c | 9 +++++++++ fs/xfs/xfs_sysctl.h | 1 + 9 files changed, 58 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 76e81cff70b9..5399ef222dd7 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -21,7 +21,8 @@ /* * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, * other XFS code uses these values. Times are measured in centisecs (i.e. - * 100ths of a second). + * 100ths of a second) with the exception of eofb_timer, which is measured in + * seconds. */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ @@ -40,4 +41,5 @@ xfs_param_t xfs_params = { .rotorstep = { 1, 1, 255 }, .inherit_nodfrg = { 0, 1, 1 }, .fstrm_timer = { 1, 30*100, 3600*100}, + .eofb_timer = { 1, 300, 3600*24}, }; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 906e6dcd2c55..96e344e3e927 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -615,6 +615,32 @@ restart: return last_error; } +/* + * Background scanning to trim post-EOF preallocated space. This is queued + * based on the 'background_prealloc_discard_period' tunable (5m by default). 
+ */ +STATIC void +xfs_queue_eofblocks( + struct xfs_mount *mp) +{ + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) + queue_delayed_work(mp->m_eofblocks_workqueue, + &mp->m_eofblocks_work, + msecs_to_jiffies(xfs_eofb_secs * 1000)); + rcu_read_unlock(); +} + +void +xfs_eofblocks_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_eofblocks_work); + xfs_icache_free_eofblocks(mp, NULL); + xfs_queue_eofblocks(mp); +} + int xfs_inode_ag_iterator( struct xfs_mount *mp, @@ -1273,6 +1299,9 @@ xfs_inode_set_eofblocks_tag( XFS_ICI_EOFBLOCKS_TAG); spin_unlock(&ip->i_mount->m_perag_lock); + /* kick off background trimming */ + xfs_queue_eofblocks(ip->i_mount); + trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, -1, _RET_IP_); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 4934a77024cf..e0f138c70a2f 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -38,6 +38,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); +void xfs_eofblocks_worker(struct work_struct *); int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 828662f70d64..0a134ca5211c 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -118,6 +118,7 @@ #define xfs_rotorstep xfs_params.rotorstep.val #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val +#define xfs_eofb_secs xfs_params.eofb_timer.val #define current_cpu() (raw_smp_processor_id()) #define current_pid() (current->pid) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 6f1c997704cd..41ae7e1590f5 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ 
-1428,6 +1428,8 @@ xfs_unmountfs( __uint64_t resblks; int error; + cancel_delayed_work_sync(&mp->m_eofblocks_work); + xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); IRELE(mp->m_rootip); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a631ca3b9065..dc306a09f56f 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -196,6 +196,8 @@ typedef struct xfs_mount { #endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct delayed_work m_eofblocks_work; /* background eof blocks + trimming */ __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ struct shrinker m_inode_shrink; /* inode reclaim shrinker */ @@ -207,6 +209,7 @@ typedef struct xfs_mount { struct workqueue_struct *m_cil_workqueue; struct workqueue_struct *m_reclaim_workqueue; struct workqueue_struct *m_log_workqueue; + struct workqueue_struct *m_eofblocks_workqueue; } xfs_mount_t; /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 3d9ea947e9f8..ab8839b26272 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -874,8 +874,15 @@ xfs_init_mount_workqueues( if (!mp->m_log_workqueue) goto out_destroy_reclaim; + mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", + WQ_NON_REENTRANT, 0, mp->m_fsname); + if (!mp->m_eofblocks_workqueue) + goto out_destroy_log; + return 0; +out_destroy_log: + destroy_workqueue(mp->m_log_workqueue); out_destroy_reclaim: destroy_workqueue(mp->m_reclaim_workqueue); out_destroy_cil: @@ -892,6 +899,7 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + destroy_workqueue(mp->m_eofblocks_workqueue); destroy_workqueue(mp->m_log_workqueue); destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_cil_workqueue); @@ -1393,6 +1401,7 @@ xfs_fs_fill_super( mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + 
INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); mp->m_super = sb; sb->s_fs_info = mp; diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index ee2d2adaa438..2801b5ce6cdb 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -202,6 +202,15 @@ static ctl_table xfs_table[] = { .extra1 = &xfs_params.fstrm_timer.min, .extra2 = &xfs_params.fstrm_timer.max, }, + { + .procname = "speculative_prealloc_lifetime", + .data = &xfs_params.eofb_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.eofb_timer.min, + .extra2 = &xfs_params.eofb_timer.max, + }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS { diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index b9937d450f8e..bd8e157c20ef 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -47,6 +47,7 @@ typedef struct xfs_param { xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ + xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ } xfs_param_t; /* From 07428d7f0ca46087f7f1efa895322bb9dc1ac21d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:09:44 +1100 Subject: [PATCH 37/78] xfs: fix attr tree double split corruption In certain circumstances, a double split of an attribute tree is needed to insert or replace an attribute. In rare situations, this can go wrong, leaving the attribute tree corrupted. In this case, the attr being replaced is the last attr in a leaf node, and the replacement is larger so doesn't fit in the same leaf node. When we have the initial condition of a node format attribute btree with two leaves at index 1 and 2. Call them L1 and L2. The leaf L1 is completely full, there is not a single byte of free space in it. L2 is mostly empty. 
The attribute being replaced - call it X - is the last attribute in L1. The way an attribute replace is executed is that the replacement attribute - call it Y - is first inserted into the tree, but has an INCOMPLETE flag set on it so that list traversals ignore it. Once this transaction is committed, a second transaction is run to atomically mark Y as COMPLETE and X as INCOMPLETE, so that a traversal will now find Y and skip X. Once that transaction is committed, attribute X is then removed. So, the initial condition is:

     +--------+     +--------+
     |   L1   |     |   L2   |
     | fwd: 2 |---->| fwd: 0 |
     | bwd: 0 |<----| bwd: 1 |
     | fsp: 0 |     | fsp: N |
     |--------|     |--------|
     | attr A |     | attr 1 |
     |--------|     |--------|
     | attr B |     | attr 2 |
     |--------|     |--------|
     ..........     ..........
     |--------|     |--------|
     | attr X |     | attr n |
     +--------+     +--------+

So now we go to replace X, and see that L1:fsp = 0 - it is full so we can't insert Y in the same leaf. So we record the location of attribute X so we can track it for later use, then we split L1 into L1 and L3 and rebalance across the two leaves. We end with:

     +--------+     +--------+     +--------+
     |   L1   |     |   L3   |     |   L2   |
     | fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
     | bwd: 0 |<----| bwd: 1 |<----| bwd: 3 |
     | fsp: M |     | fsp: J |     | fsp: N |
     |--------|     |--------|     |--------|
     | attr A |     | attr X |     | attr 1 |
     |--------|     +--------+     |--------|
     | attr B |                    | attr 2 |
     |--------|                    |--------|
     ..........                    ..........
     |--------|                    |--------|
     | attr W |                    | attr n |
     +--------+                    +--------+

And we track that the original attribute is now at L3:0. We then try to insert Y into L1 again, and find that there isn't enough room because the new attribute is larger than the old one. Hence we have to split again to make room for Y.
We end up with this:

     +--------+     +--------+     +--------+     +--------+
     |   L1   |     |   L4   |     |   L3   |     |   L2   |
     | fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
     | bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 |
     | fsp: M |     | fsp: J |     | fsp: J |     | fsp: N |
     |--------|     |--------|     |--------|     |--------|
     | attr A |     | attr Y |     | attr X |     | attr 1 |
     |--------|     + INCOMP +     +--------+     |--------|
     | attr B |     +--------+                    | attr 2 |
     |--------|                                   |--------|
     ..........                                   ..........
     |--------|                                   |--------|
     | attr W |                                   | attr n |
     +--------+                                   +--------+

And now we have the new (incomplete) attribute @ L4:0, and the original attribute at L3:0. At this point, the first transaction is committed, and we move to the flipping of the flags. This is where we are supposed to end up with this:

     +--------+     +--------+     +--------+     +--------+
     |   L1   |     |   L4   |     |   L3   |     |   L2   |
     | fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
     | bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 |
     | fsp: M |     | fsp: J |     | fsp: J |     | fsp: N |
     |--------|     |--------|     |--------|     |--------|
     | attr A |     | attr Y |     | attr X |     | attr 1 |
     |--------|     +--------+     + INCOMP +     |--------|
     | attr B |                    +--------+     | attr 2 |
     |--------|                                   |--------|
     ..........                                   ..........
     |--------|                                   |--------|
     | attr W |                                   | attr n |
     +--------+                                   +--------+

But that doesn't happen properly - the attribute tracking indexes are not pointing to the right locations. What we end up with is both the old attribute to be removed pointing at L4:0 and the new attribute at L4:1. On a debug kernel, this assert fails like so:

XFS: Assertion failed: args->index2 < be16_to_cpu(leaf2->hdr.count), file: fs/xfs/xfs_attr_leaf.c, line: 2725

because the new attribute location does not exist. On a production kernel, this goes unnoticed and the code proceeds ahead merrily and removes L4 because it thinks that is the block that is no longer needed. This leaves the hash index node pointing to entries L1, L4 and L2, but only blocks L1, L3 and L2 to exist.
Further, the leaf level sibling list is L1 <-> L4 <-> L2, but L4 is now free space, and so everything is busted. This corruption is caused by the removal of the old attribute triggering a join - it joins everything correctly but then frees the wrong block. xfs_repair will report something like:

bad sibling back pointer for block 4 in attribute fork for inode 131
problem with attribute contents in inode 131
would clear attr fork
bad nblocks 8 for inode 131, would reset to 3
bad anextents 4 for inode 131, would reset to 0

The problem lies in the assignment of the old/new blocks for tracking purposes when the double leaf split occurs. The first split tries to place the new attribute inside the current leaf (i.e. "inleaf == true") and moves the old attribute (X) to the new block. This sets up the old block/index to L1:X, and newly allocated block to L3:0. It then moves attr X to the new block and tries to insert attr Y at the old index. That fails, so it splits again. With the second split, the rebalance ends up placing the new attr in the second new block - L4:0 - and this is where the code goes wrong. What it does is set both the new and old block index to the second new block. Hence it inserts attr Y at the right place (L4:0) but overwrites the current location of the attr to replace that is held in the new block index (currently L3:0). It overwrites it with L4:1 - the index we later assert fail on. Hopefully this table will show this in a format that is a bit easier to understand:

Split           old attr index          new attr index
                vanilla patched         vanilla patched
before 1st      L1:26   L1:26           N/A     N/A
after 1st       L3:0    L3:0            L1:26   L1:26
after 2nd       L4:0    L3:0            L4:1    L4:0
                ^^^^                    ^^^^
                wrong                   wrong

The fix is surprisingly simple, for all this analysis - just stop the rebalance on the out-of-leaf case from overwriting the new attr index - it's already correct for the double split case.
Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_attr_leaf.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index d330111ca738..70eec1829776 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1291,6 +1291,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, leaf2 = blk2->bp->b_addr; ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + ASSERT(leaf2->hdr.count == 0); args = state->args; trace_xfs_attr_leaf_rebalance(args); @@ -1361,6 +1362,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * I assert that since all callers pass in an empty * second buffer, this code should never execute. */ + ASSERT(0); /* * Figure the total bytes to be added to the destination leaf. @@ -1422,10 +1424,24 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, args->index2 = 0; args->blkno2 = blk2->blkno; } else { + /* + * On a double leaf split, the original attr location + * is already stored in blkno2/index2, so don't + * overwrite it overwise we corrupt the tree. + */ blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); - args->index = args->index2 = blk2->index; - args->blkno = args->blkno2 = blk2->blkno; + args->index = blk2->index; + args->blkno = blk2->blkno; + if (!state->extravalid) { + /* + * set the new attr location to match the old + * one and let the higher level split code + * decide where in the leaf to place it. 
+ */ + args->index2 = blk2->index; + args->blkno2 = blk2->blkno; + } } } else { ASSERT(state->inleaf == 1); From 7bf7f352194252e6f05981d44fb8cb55668606cd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:09:45 +1100 Subject: [PATCH 38/78] xfs: fix broken error handling in xfs_vm_writepage When we shut down the filesystem, it might first be detected in writeback when we are allocating an inode size transaction. This happens after we have moved all the pages into the writeback state and unlocked them. Unfortunately, if we fail to set up the transaction we then abort writeback and try to invalidate the current page. This then triggers a BUG() in block_invalidatepage() because we are trying to invalidate an unlocked page. Fixing this is a bit of a chicken and egg problem - we can't allocate the transaction until we've clustered all the pages into the IO and we know the size of it (i.e. whether the last block of the IO is beyond the current EOF or not). However, we don't want to hold pages locked for long periods of time, especially while we lock other pages to cluster them into the write. To fix this, we need to make a clear delineation in writeback where errors can only be handled by IO completion processing. That is, once we have marked a page for writeback and unlocked it, we have to report errors via IO completion because we've already started the IO. We may not have submitted any IO, but we've changed the page state to indicate that it is under IO so we must now use the IO completion path to report errors. To do this, add an error field to xfs_submit_ioend() to pass it the error that occurred during the building on the ioend chain. When this is non-zero, mark each ioend with the error and call xfs_finish_ioend() directly rather than building bios. This will immediately push the ioends through completion processing with the error that has occurred.
Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 54 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e562dd43f41f..e57e2daa357c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -481,11 +481,17 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) * * The fix is two passes across the ioend list - one to start writeback on the * buffer_heads, and then submit them for I/O on the second pass. + * + * If @fail is non-zero, it means that we have a situation where some part of + * the submission process has failed after we have marked paged for writeback + * and unlocked them. In this situation, we need to fail the ioend chain rather + * than submit it to IO. This typically only happens on a filesystem shutdown. */ STATIC void xfs_submit_ioend( struct writeback_control *wbc, - xfs_ioend_t *ioend) + xfs_ioend_t *ioend, + int fail) { xfs_ioend_t *head = ioend; xfs_ioend_t *next; @@ -506,6 +512,18 @@ xfs_submit_ioend( next = ioend->io_list; bio = NULL; + /* + * If we are failing the IO now, just mark the ioend with an + * error and finish it. This will run IO completion immediately + * as there is only one reference to the ioend at this point in + * time. + */ + if (fail) { + ioend->io_error = -fail; + xfs_finish_ioend(ioend); + continue; + } + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { if (!bio) { @@ -1060,7 +1078,18 @@ xfs_vm_writepage( xfs_start_page_writeback(page, 1, count); - if (ioend && imap_valid) { + /* if there is no IO to be submitted for this page, we are done */ + if (!ioend) + return 0; + + ASSERT(iohead); + + /* + * Any errors from this point onwards need tobe reported through the IO + * completion path as we have marked the initial page as under writeback + * and unlocked it. 
+ */ + if (imap_valid) { xfs_off_t end_index; end_index = imap.br_startoff + imap.br_blockcount; @@ -1079,20 +1108,15 @@ xfs_vm_writepage( wbc, end_index); } - if (iohead) { - /* - * Reserve log space if we might write beyond the on-disk - * inode size. - */ - if (ioend->io_type != XFS_IO_UNWRITTEN && - xfs_ioend_is_append(ioend)) { - err = xfs_setfilesize_trans_alloc(ioend); - if (err) - goto error; - } - xfs_submit_ioend(wbc, iohead); - } + /* + * Reserve log space if we might write beyond the on-disk inode size. + */ + err = 0; + if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) + err = xfs_setfilesize_trans_alloc(ioend); + + xfs_submit_ioend(wbc, iohead, err); return 0; From 37eb17e604ac7398bbb133c82f281475d704fff7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:09:46 +1100 Subject: [PATCH 39/78] xfs: drop buffer io reference when a bad bio is built Error handling in xfs_buf_ioapply_map() does not handle IO reference counts correctly. We increment the b_io_remaining count before building the bio, but then fail to decrement it in the failure case. This leads to the buffer never running IO completion and releasing the reference that the IO holds, so at unmount we can leak the buffer. This leak is captured by this assert failure during unmount: XFS: Assertion failed: atomic_read(&pag->pag_ref) == 0, file: fs/xfs/xfs_mount.c, line: 273 This is not a new bug - the b_io_remaining accounting has had this problem for a long, long time - it's just very hard to get a zero length bio being built by this code... Further, the buffer IO error can be overwritten on a multi-segment buffer by subsequent bio completions for partial sections of the buffer. Hence we should only set the buffer error status if the buffer is not already carrying an error status. This ensures that a partial IO error on a multi-segment buffer will not be lost. This part of the problem is a regression, however. 
cc: Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 933b7930b863..4b0b8dd1b7b0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1197,9 +1197,14 @@ xfs_buf_bio_end_io( { xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; - xfs_buf_ioerror(bp, -error); + /* + * don't overwrite existing errors - otherwise we can lose errors on + * buffers that require multiple bios to complete. + */ + if (!bp->b_error) + xfs_buf_ioerror(bp, -error); - if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) + if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); _xfs_buf_ioend(bp, 1); @@ -1279,6 +1284,11 @@ next_chunk: if (size) goto next_chunk; } else { + /* + * This is guaranteed not to be the last io reference count + * because the caller (xfs_buf_iorequest) holds a count itself. + */ + atomic_dec(&bp->b_io_remaining); xfs_buf_ioerror(bp, EIO); bio_put(bio); } From ee73259b401317117e7f5d4834c270b10b12bc8e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:53:53 +1100 Subject: [PATCH 40/78] xfs: add more attribute tree trace points. Added when debugging recent attribute tree problems to more finely trace code execution through the maze of twisty passages that makes up the attr code. 
Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_attr.c | 18 ++++++++++++++ fs/xfs/xfs_attr_leaf.c | 37 +++++++++++++++++------------ fs/xfs/xfs_da_btree.c | 6 +++++ fs/xfs/xfs_trace.h | 54 +++++++++++++++++++++++++++++++++++++++++- 4 files changed, 99 insertions(+), 16 deletions(-) diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 0ca1f0be62d2..55bbe98e8f82 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -1155,6 +1155,8 @@ xfs_attr_leaf_get(xfs_da_args_t *args) struct xfs_buf *bp; int error; + trace_xfs_attr_leaf_get(args); + args->blkno = 0; error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, XFS_ATTR_FORK); @@ -1185,6 +1187,8 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) int error; struct xfs_buf *bp; + trace_xfs_attr_leaf_list(context); + context->cursor->blkno = 0; error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) @@ -1653,6 +1657,8 @@ xfs_attr_fillstate(xfs_da_state_t *state) xfs_da_state_blk_t *blk; int level; + trace_xfs_attr_fillstate(state->args); + /* * Roll down the "path" in the state structure, storing the on-disk * block number for those buffers in the "path". @@ -1699,6 +1705,8 @@ xfs_attr_refillstate(xfs_da_state_t *state) xfs_da_state_blk_t *blk; int level, error; + trace_xfs_attr_refillstate(state->args); + /* * Roll down the "path" in the state structure, storing the on-disk * block number for those buffers in the "path". 
@@ -1755,6 +1763,8 @@ xfs_attr_node_get(xfs_da_args_t *args) int error, retval; int i; + trace_xfs_attr_node_get(args); + state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; @@ -1804,6 +1814,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) int error, i; struct xfs_buf *bp; + trace_xfs_attr_node_list(context); + cursor = context->cursor; cursor->initted = 1; @@ -1959,6 +1971,8 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) int nmap, error, tmp, valuelen, blkcnt, i; xfs_dablk_t lblkno; + trace_xfs_attr_rmtval_get(args); + ASSERT(!(args->flags & ATTR_KERNOVAL)); mp = args->dp->i_mount; @@ -2014,6 +2028,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) xfs_dablk_t lblkno; int blkcnt, valuelen, nmap, error, tmp, committed; + trace_xfs_attr_rmtval_set(args); + dp = args->dp; mp = dp->i_mount; src = args->value; @@ -2143,6 +2159,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) xfs_dablk_t lblkno; int valuelen, blkcnt, nmap, error, done, committed; + trace_xfs_attr_rmtval_remove(args); + mp = args->dp->i_mount; /* diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 70eec1829776..4bfc732bc9c9 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -57,7 +57,8 @@ STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block, struct xfs_buf **bpp); STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer, xfs_da_args_t *args, int freemap_index); -STATIC void xfs_attr_leaf_compact(xfs_trans_t *tp, struct xfs_buf *leaf_buffer); +STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args, + struct xfs_buf *leaf_buffer); STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); @@ -1071,7 +1072,7 @@ xfs_attr_leaf_add( * Compact the entries to coalesce free space. * This may change the hdr->count via dropping INCOMPLETE entries. 
*/ - xfs_attr_leaf_compact(args->trans, bp); + xfs_attr_leaf_compact(args, bp); /* * After compaction, the block is guaranteed to have only one @@ -1102,6 +1103,8 @@ xfs_attr_leaf_add_work( xfs_mount_t *mp; int tmp, i; + trace_xfs_attr_leaf_add_work(args); + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); hdr = &leaf->hdr; @@ -1214,15 +1217,17 @@ xfs_attr_leaf_add_work( */ STATIC void xfs_attr_leaf_compact( - struct xfs_trans *trans, - struct xfs_buf *bp) + struct xfs_da_args *args, + struct xfs_buf *bp) { - xfs_attr_leafblock_t *leaf_s, *leaf_d; - xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; - xfs_mount_t *mp; - char *tmpbuffer; + xfs_attr_leafblock_t *leaf_s, *leaf_d; + xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; + struct xfs_trans *trans = args->trans; + struct xfs_mount *mp = trans->t_mountp; + char *tmpbuffer; + + trace_xfs_attr_leaf_compact(args); - mp = trans->t_mountp; tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); @@ -1345,9 +1350,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, max = be16_to_cpu(hdr2->firstused) - sizeof(xfs_attr_leaf_hdr_t); max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); - if (space > max) { - xfs_attr_leaf_compact(args->trans, blk2->bp); - } + if (space > max) + xfs_attr_leaf_compact(args, blk2->bp); /* * Move high entries from leaf1 to low end of leaf2. @@ -1378,9 +1382,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, max = be16_to_cpu(hdr1->firstused) - sizeof(xfs_attr_leaf_hdr_t); max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); - if (space > max) { - xfs_attr_leaf_compact(args->trans, blk1->bp); - } + if (space > max) + xfs_attr_leaf_compact(args, blk1->bp); /* * Move low entries from leaf2 to high end of leaf1. 
@@ -1577,6 +1580,8 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) xfs_dablk_t blkno; struct xfs_buf *bp; + trace_xfs_attr_leaf_toosmall(state->args); + /* * Check for the degenerate case of the block being over 50% full. * If so, it's not worth even looking to see if we might be able @@ -1702,6 +1707,8 @@ xfs_attr_leaf_remove( int tablesize, tmp, i; xfs_mount_t *mp; + trace_xfs_attr_leaf_remove(args); + leaf = bp->b_addr; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); hdr = &leaf->hdr; diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 7bfb7dd334fc..c62e7e6ff50e 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -779,6 +779,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) xfs_dablk_t blkno; struct xfs_buf *bp; + trace_xfs_da_node_toosmall(state->args); + /* * Check for the degenerate case of the block being over 50% full. * If so, it's not worth even looking to see if we might be able @@ -900,6 +902,8 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) xfs_dahash_t lasthash=0; int level, count; + trace_xfs_da_fixhashpath(state->args); + level = path->active-1; blk = &path->blk[ level ]; switch (blk->magic) { @@ -1417,6 +1421,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, xfs_dablk_t blkno=0; int level, error; + trace_xfs_da_path_shift(state->args); + /* * Roll up the Btree looking for the first block where our * current index is not at the edge of the block. 
Note that diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index cb5234632072..2e137d4a85ae 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -96,6 +96,8 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); +DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); +DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); DECLARE_EVENT_CLASS(xfs_perag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, @@ -1502,8 +1504,42 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace); DEFINE_DIR2_EVENT(xfs_dir2_node_removename); DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); +DECLARE_EVENT_CLASS(xfs_attr_class, + TP_PROTO(struct xfs_da_args *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __dynamic_array(char, name, args->namelen) + __field(int, namelen) + __field(int, valuelen) + __field(xfs_dahash_t, hashval) + __field(int, op_flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + if (args->namelen) + memcpy(__get_str(name), args->name, args->namelen); + __entry->namelen = args->namelen; + __entry->valuelen = args->valuelen; + __entry->hashval = args->hashval; + __entry->op_flags = args->op_flags; + ), + TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d " + "hashval 0x%x op_flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __entry->namelen ? 
__get_str(name) : NULL, + __entry->namelen, + __entry->valuelen, + __entry->hashval, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) +) + #define DEFINE_ATTR_EVENT(name) \ -DEFINE_EVENT(xfs_da_class, name, \ +DEFINE_EVENT(xfs_attr_class, name, \ TP_PROTO(struct xfs_da_args *args), \ TP_ARGS(args)) DEFINE_ATTR_EVENT(xfs_attr_sf_add); @@ -1517,10 +1553,14 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf); DEFINE_ATTR_EVENT(xfs_attr_leaf_add); DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); +DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work); DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); DEFINE_ATTR_EVENT(xfs_attr_leaf_create); +DEFINE_ATTR_EVENT(xfs_attr_leaf_compact); +DEFINE_ATTR_EVENT(xfs_attr_leaf_get); DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup); DEFINE_ATTR_EVENT(xfs_attr_leaf_replace); +DEFINE_ATTR_EVENT(xfs_attr_leaf_remove); DEFINE_ATTR_EVENT(xfs_attr_leaf_removename); DEFINE_ATTR_EVENT(xfs_attr_leaf_split); DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before); @@ -1532,12 +1572,21 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf); DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node); DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance); DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance); +DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall); DEFINE_ATTR_EVENT(xfs_attr_node_addname); +DEFINE_ATTR_EVENT(xfs_attr_node_get); DEFINE_ATTR_EVENT(xfs_attr_node_lookup); DEFINE_ATTR_EVENT(xfs_attr_node_replace); DEFINE_ATTR_EVENT(xfs_attr_node_removename); +DEFINE_ATTR_EVENT(xfs_attr_fillstate); +DEFINE_ATTR_EVENT(xfs_attr_refillstate); + +DEFINE_ATTR_EVENT(xfs_attr_rmtval_get); +DEFINE_ATTR_EVENT(xfs_attr_rmtval_set); +DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove); + #define DEFINE_DA_EVENT(name) \ DEFINE_EVENT(xfs_da_class, name, \ TP_PROTO(struct xfs_da_args *args), \ @@ -1556,9 +1605,12 @@ DEFINE_DA_EVENT(xfs_da_node_split); DEFINE_DA_EVENT(xfs_da_node_remove); DEFINE_DA_EVENT(xfs_da_node_rebalance); DEFINE_DA_EVENT(xfs_da_node_unbalance); +DEFINE_DA_EVENT(xfs_da_node_toosmall); 
DEFINE_DA_EVENT(xfs_da_swap_lastblock); DEFINE_DA_EVENT(xfs_da_grow_inode); DEFINE_DA_EVENT(xfs_da_shrink_inode); +DEFINE_DA_EVENT(xfs_da_fixhashpath); +DEFINE_DA_EVENT(xfs_da_path_shift); DECLARE_EVENT_CLASS(xfs_dir2_space_class, TP_PROTO(struct xfs_da_args *args, int idx), From b64f3a390d3477517cbff7d613e551705540769b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 13 Nov 2012 16:40:27 -0600 Subject: [PATCH 41/78] xfs: use btree block initialisation functions in growfs Factor xfs_btree_init_block() to be independent of the btree cursor, and use the function to initialise btree blocks in the growfs code. This makes adding support for different format btree blocks simple. Signed-off-by: Dave Chinner Reviewed-by Rich Johnston Signed-off-by: Ben Myers --- fs/xfs/xfs_btree.c | 33 ++++++++++++++++++++++++--------- fs/xfs/xfs_btree.h | 11 +++++++++++ fs/xfs/xfs_fsops.c | 37 +++++++++++++------------------------ 3 files changed, 48 insertions(+), 33 deletions(-) diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index e53e317b1582..121ea99e615a 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -853,18 +853,22 @@ xfs_btree_set_sibling( } } -STATIC void +void xfs_btree_init_block( - struct xfs_btree_cur *cur, - int level, - int numrecs, - struct xfs_btree_block *new) /* new block */ + struct xfs_mount *mp, + struct xfs_buf *bp, + __u32 magic, + __u16 level, + __u16 numrecs, + unsigned int flags) { - new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); + struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp); + + new->bb_magic = cpu_to_be32(magic); new->bb_level = cpu_to_be16(level); new->bb_numrecs = cpu_to_be16(numrecs); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (flags & XFS_BTREE_LONG_PTRS) { new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); } else { @@ -873,6 +877,17 @@ xfs_btree_init_block( } } +STATIC void +xfs_btree_init_block_cur( + struct xfs_btree_cur *cur, + int level, + int numrecs, + 
struct xfs_buf *bp) +{ + xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum], + level, numrecs, cur->bc_flags); +} + /* * Return true if ptr is the last record in the btree and * we need to track updates to this record. The decision @@ -2183,7 +2198,7 @@ xfs_btree_split( goto error0; /* Fill in the btree header for the new right block. */ - xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right); + xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp); /* * Split the entries between the old and the new block evenly. @@ -2492,7 +2507,7 @@ xfs_btree_new_root( nptr = 2; } /* Fill in the new block's btree header and log it. */ - xfs_btree_init_block(cur, cur->bc_nlevels, 2, new); + xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp); xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && !xfs_btree_ptr_is_null(cur, &rptr)); diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 5b240de104c0..c9cf2d00e236 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -378,6 +378,17 @@ xfs_btree_reada_bufs( xfs_agblock_t agbno, /* allocation group block number */ xfs_extlen_t count); /* count of filesystem blocks */ +/* + * Initialise a new btree block header + */ +void +xfs_btree_init_block( + struct xfs_mount *mp, + struct xfs_buf *bp, + __u32 magic, + __u16 level, + __u16 numrecs, + unsigned int flags); /* * Common btree core entry points. 
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 7b0a997cf62b..a5034af35db7 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -125,7 +125,6 @@ xfs_growfs_data_private( xfs_extlen_t agsize; xfs_extlen_t tmpsize; xfs_alloc_rec_t *arec; - struct xfs_btree_block *block; xfs_buf_t *bp; int bucket; int dpct; @@ -263,17 +262,14 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } - block = XFS_BUF_TO_BLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); - block->bb_level = 0; - block->bb_numrecs = cpu_to_be16(1); - block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_ALLOC_REC_ADDR(mp, block, 1); + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0); + + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -289,18 +285,15 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } - block = XFS_BUF_TO_BLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); - block->bb_level = 0; - block->bb_numrecs = cpu_to_be16(1); - block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_ALLOC_REC_ADDR(mp, block, 1); + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0); + + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); nfree += be32_to_cpu(arec->ar_blockcount); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -316,13 +309,9 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } - block = 
XFS_BUF_TO_BLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); - block->bb_level = 0; - block->bb_numrecs = 0; - block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0); + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) From fd23683c3b1ab905cba61ea2981c156f4bf52845 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:53:59 +1100 Subject: [PATCH 42/78] xfs: growfs: use uncached buffers for new headers When writing the new AG headers to disk, we can't attach write verifiers because they have a dependency on the struct xfs_perag being attached to the buffer to be fully initialised and growfs can't fully initialise them until later in the process. The simplest way to avoid this problem is to use uncached buffers for writing the new headers. These buffers don't have the xfs_perag attached to them, so it's simple to detect in the write verifier and be able to skip the checks that need the xfs_perag. This enables us to attach the appropriate buffer ops to the buffer and hence calculate CRCs on the way to disk. It also means that the buffer is torn down immediately, and so the first access to the AG headers will re-read the header from disk and perform full verification of the buffer. This way we also can catch corruptions due to problems that went undetected in growfs. 
Signed-off-by: Dave Chinner Reviewed-by Rich Johnston Signed-off-by: Ben Myers --- fs/xfs/xfs_fsops.c | 63 ++++++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a5034af35db7..2196830bf5c0 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -114,6 +114,26 @@ xfs_fs_geometry( return 0; } +static struct xfs_buf * +xfs_growfs_get_hdr_buf( + struct xfs_mount *mp, + xfs_daddr_t blkno, + size_t numblks, + int flags) +{ + struct xfs_buf *bp; + + bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); + if (!bp) + return NULL; + + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + bp->b_bn = blkno; + bp->b_maps[0].bm_bn = blkno; + + return bp; +} + static int xfs_growfs_data_private( xfs_mount_t *mp, /* mount point for filesystem */ @@ -189,15 +209,15 @@ xfs_growfs_data_private( /* * AG freelist header block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { error = ENOMEM; goto error0; } + agf = XFS_BUF_TO_AGF(bp); - memset(agf, 0, mp->m_sb.sb_sectsize); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); agf->agf_seqno = cpu_to_be32(agno); @@ -226,15 +246,15 @@ xfs_growfs_data_private( /* * AG inode header block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { error = ENOMEM; goto error0; } + agi = XFS_BUF_TO_AGI(bp); - memset(agi, 0, mp->m_sb.sb_sectsize); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); agi->agi_seqno = cpu_to_be32(agno); @@ -255,16 +275,16 @@ xfs_growfs_data_private( /* * BNO btree root block 
*/ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0); + if (!bp) { error = ENOMEM; goto error0; } - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0); + xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( @@ -278,16 +298,15 @@ xfs_growfs_data_private( /* * CNT btree root block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; } - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0); + xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( @@ -302,14 +321,14 @@ xfs_growfs_data_private( /* * INO btree root block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; } - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0); error = xfs_bwrite(bp); From de497688daaabbab425a8a969528272ec1d962a6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:00 +1100 Subject: [PATCH 43/78] xfs: make growfs initialise the AGFL header For verification purposes, AGFLs need to be initialised 
to a known set of values. For upcoming CRC changes, they are also headers that need to be initialised. Currently, growfs does neither for the AGFLs - it ignores them completely. Add initialisation of the AGFL to be full of invalid block numbers (NULLAGBLOCK) to put the infrastructure in place needed for CRC support. Includes a comment clarification from Jeff Liu. Signed-off-by: Dave Chinner Reviewed-by Rich Johnston Signed-off-by: Ben Myers --- fs/xfs/xfs_fsops.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 2196830bf5c0..bd9cb7f0b073 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -140,6 +140,7 @@ xfs_growfs_data_private( xfs_growfs_data_t *in) /* growfs data input struct */ { xfs_agf_t *agf; + struct xfs_agfl *agfl; xfs_agi_t *agi; xfs_agnumber_t agno; xfs_extlen_t agsize; @@ -207,7 +208,7 @@ xfs_growfs_data_private( nfree = 0; for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { /* - * AG freelist header block + * AG freespace header block */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), @@ -243,6 +244,26 @@ xfs_growfs_data_private( if (error) goto error0; + /* + * AG freelist header block + */ + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0); + if (!bp) { + error = ENOMEM; + goto error0; + } + + agfl = XFS_BUF_TO_AGFL(bp); + for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) + agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + /* * AG inode header block */ From f5b8911b67eb4f15d95d5e5324d376d4a49d56e8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:42:47 +1100 Subject: [PATCH 44/78] xfs: remove xfs_tosspages It's a buggy, unnecessary wrapper that is duplicating truncate_pagecache_range(). 
When replacing the call in xfs_change_file_space(), also ensure that the length being allocated/freed is always positive before making any changes. These checks are done in the lower extent manipulation functions, too, but we need to do them before any page cache operations. Reported-by: Andrew Dahl Signed-off-by: Dave Chinner Reviewed-By: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/xfs_dfrag.c | 3 +-- fs/xfs/xfs_fs_subr.c | 12 ------------ fs/xfs/xfs_vnodeops.c | 30 +++++++++++++++++++++++++----- fs/xfs/xfs_vnodeops.h | 2 -- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index b9b8646e62db..b2c63a28afa7 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -315,8 +315,7 @@ xfs_swap_extents( * are safe. We don't really care if non-io related * fields change. */ - - xfs_tosspages(ip, 0, -1, FI_REMAPF); + truncate_pagecache_range(VFS_I(ip), 0, -1); tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); if ((error = xfs_trans_reserve(tp, 0, diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c index 652b875a9d4c..d49de3d70456 100644 --- a/fs/xfs/xfs_fs_subr.c +++ b/fs/xfs/xfs_fs_subr.c @@ -25,18 +25,6 @@ * note: all filemap functions return negative error codes. These * need to be inverted before returning to the xfs core functions. 
*/ -void -xfs_tosspages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last, - int fiopt) -{ - /* can't toss partial tail pages, so mask them out */ - last &= ~(PAGE_SIZE - 1); - truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1); -} - int xfs_flushinval_pages( xfs_inode_t *ip, diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c2ddd7a43942..de3702a57e55 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2118,7 +2118,7 @@ xfs_change_file_space( xfs_fsize_t fsize; int setprealloc; xfs_off_t startoffset; - xfs_off_t llen; + xfs_off_t end; xfs_trans_t *tp; struct iattr iattr; int prealloc_type; @@ -2139,12 +2139,30 @@ xfs_change_file_space( return XFS_ERROR(EINVAL); } - llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len; + /* + * length of <= 0 for resv/unresv/zero is invalid. length for + * alloc/free is ignored completely and we have no idea what userspace + * might have set it to, so set it to zero to allow range + * checks to pass. + */ + switch (cmd) { + case XFS_IOC_ZERO_RANGE: + case XFS_IOC_RESVSP: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP: + case XFS_IOC_UNRESVSP64: + if (bf->l_len <= 0) + return XFS_ERROR(EINVAL); + break; + default: + bf->l_len = 0; + break; + } if (bf->l_start < 0 || bf->l_start > mp->m_super->s_maxbytes || - bf->l_start + llen < 0 || - bf->l_start + llen > mp->m_super->s_maxbytes) + bf->l_start + bf->l_len < 0 || + bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) return XFS_ERROR(EINVAL); bf->l_whence = 0; @@ -2169,7 +2187,9 @@ xfs_change_file_space( switch (cmd) { case XFS_IOC_ZERO_RANGE: prealloc_type |= XFS_BMAPI_CONVERT; - xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0); + end = round_down(startoffset + bf->l_len, PAGE_SIZE) - 1; + if (startoffset > end) + truncate_pagecache_range(VFS_I(ip), startoffset, end); /* FALLTHRU */ case XFS_IOC_RESVSP: case XFS_IOC_RESVSP64: diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 52fafc416a0c..d48141d6bc3b 100644 
--- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -48,8 +48,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); -void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, - xfs_off_t last, int fiopt); int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last, int fiopt); int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, From d6638ae244f6323fcdf85e72eb4a5af6f6212893 Mon Sep 17 00:00:00 2001 From: Andrew Dahl Date: Wed, 14 Nov 2012 12:52:26 -0600 Subject: [PATCH 45/78] xfs: reverse the check on XFS_IOC_ZERO_RANGE Reversing the check on XFS_IOC_ZERO_RANGE. Range should be zeroed if the start is less than or equal to the end. Signed-off-by: Andrew Dahl Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_vnodeops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index de3702a57e55..46a7a5de5d6d 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2188,7 +2188,7 @@ xfs_change_file_space( case XFS_IOC_ZERO_RANGE: prealloc_type |= XFS_BMAPI_CONVERT; end = round_down(startoffset + bf->l_len, PAGE_SIZE) - 1; - if (startoffset > end) + if (startoffset <= end) truncate_pagecache_range(VFS_I(ip), startoffset, end); /* FALLTHRU */ case XFS_IOC_RESVSP: From 95eacf0f71b7682a05b8242c49c68e8e4bb673e3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:53:55 +1100 Subject: [PATCH 46/78] xfs: remove xfs_wait_on_pages() It's just a simple wrapper around a VFS function that is only called by another function in xfs_fs_subr.c. Remove it and call the VFS function directly. 
Signed-off-by: Dave Chinner Reviewed-by: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/xfs_fs_subr.c | 18 ++---------------- fs/xfs/xfs_vnodeops.h | 1 - 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c index d49de3d70456..33658234dfc5 100644 --- a/fs/xfs/xfs_fs_subr.c +++ b/fs/xfs/xfs_fs_subr.c @@ -62,23 +62,9 @@ xfs_flush_pages( last == -1 ? LLONG_MAX : last); if (flags & XBF_ASYNC) return ret; - ret2 = xfs_wait_on_pages(ip, first, last); + ret2 = -filemap_fdatawait_range(mapping, first, + last == -1 ? XFS_ISIZE(ip) - 1 : last); if (!ret) ret = ret2; return ret; } - -int -xfs_wait_on_pages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last) -{ - struct address_space *mapping = VFS_I(ip)->i_mapping; - - if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { - return -filemap_fdatawait_range(mapping, first, - last == -1 ? XFS_ISIZE(ip) - 1 : last); - } - return 0; -} diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index d48141d6bc3b..c8ad48b61a25 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -52,7 +52,6 @@ int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last, int fiopt); int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last, uint64_t flags, int fiopt); -int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); From 4bc1ea6b8ddd4f2bd78944fbe5a1042ac14b1f5f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:53:56 +1100 Subject: [PATCH 47/78] xfs: remove xfs_flush_pages It is a complex wrapper around VFS functions, but there are VFS functions that provide exactly the same functionality. Call the VFS functions directly and remove the unnecessary indirection and complexity. 
We don't need to care about clearing the XFS_ITRUNCATED flag, as that is done during .writepages. Hence is cleared by the VFS writeback path if there is anything to write back during the flush. Signed-off-by: Dave Chinner Reviewed-by: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 2 +- fs/xfs/xfs_bmap.c | 2 +- fs/xfs/xfs_fs_subr.c | 24 ------------------------ fs/xfs/xfs_iops.c | 4 ++-- fs/xfs/xfs_vnodeops.c | 7 +++++-- fs/xfs/xfs_vnodeops.h | 2 -- 6 files changed, 9 insertions(+), 32 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e57e2daa357c..71361da1f77c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1641,7 +1641,7 @@ xfs_vm_bmap( trace_xfs_vm_bmap(XFS_I(inode)); xfs_ilock(ip, XFS_IOLOCK_SHARED); - xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); + filemap_write_and_wait(mapping); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 83d0cf3df930..a60f3d1f151c 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5599,7 +5599,7 @@ xfs_getbmap( xfs_ilock(ip, XFS_IOLOCK_SHARED); if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { - error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); + error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); if (error) goto out_unlock_iolock; } diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c index 33658234dfc5..b5380893728e 100644 --- a/fs/xfs/xfs_fs_subr.c +++ b/fs/xfs/xfs_fs_subr.c @@ -44,27 +44,3 @@ xfs_flushinval_pages( truncate_inode_pages_range(mapping, first, last); return -ret; } - -int -xfs_flush_pages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last, - uint64_t flags, - int fiopt) -{ - struct address_space *mapping = VFS_I(ip)->i_mapping; - int ret = 0; - int ret2; - - xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = -filemap_fdatawrite_range(mapping, first, - last == -1 ? 
LLONG_MAX : last); - if (flags & XBF_ASYNC) - return ret; - ret2 = -filemap_fdatawait_range(mapping, first, - last == -1 ? XFS_ISIZE(ip) - 1 : last); - if (!ret) - ret = ret2; - return ret; -} diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 81f5c4953287..d82efaa2ac73 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -780,8 +780,8 @@ xfs_setattr_size( * care about here. */ if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { - error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, - FI_NONE); + error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ip->i_d.di_size, newsize); if (error) goto out_unlock; } diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 46a7a5de5d6d..c00326afa7bf 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -428,8 +428,11 @@ xfs_release( truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); if (truncated) { xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); - if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) - xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); + if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) { + error = -filemap_flush(VFS_I(ip)->i_mapping); + if (error) + return error; + } } } diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index c8ad48b61a25..73cb3cb15f75 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -50,8 +50,6 @@ int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last, int fiopt); -int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, - xfs_off_t last, uint64_t flags, int fiopt); int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); From fb59581404ab7ec5075299065c22cb211a9262a9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:53:57 +1100 Subject: [PATCH 48/78] xfs: remove xfs_flushinval_pages 
It's just a simple wrapper around VFS functionality, and is actually buggy in that it doesn't remove mappings before invalidating the page cache. Remove it and replace it with the correct VFS functionality. Signed-off-by: Dave Chinner Reviewed-by: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/Makefile | 1 - fs/xfs/xfs_dfrag.c | 10 ++++------ fs/xfs/xfs_file.c | 23 +++++++++++----------- fs/xfs/xfs_fs_subr.c | 46 ------------------------------------------- fs/xfs/xfs_vnodeops.c | 11 +++++------ fs/xfs/xfs_vnodeops.h | 2 -- 6 files changed, 21 insertions(+), 72 deletions(-) delete mode 100644 fs/xfs/xfs_fs_subr.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index e65357bb3dc6..d02201df855b 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -37,7 +37,6 @@ xfs-y += xfs_aops.o \ xfs_file.o \ xfs_filestream.o \ xfs_fsops.o \ - xfs_fs_subr.o \ xfs_globals.o \ xfs_icache.o \ xfs_ioctl.o \ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index b2c63a28afa7..d0e9c74d3d96 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -246,12 +246,10 @@ xfs_swap_extents( goto out_unlock; } - if (VN_CACHED(VFS_I(tip)) != 0) { - error = xfs_flushinval_pages(tip, 0, -1, - FI_REMAPF_LOCKED); - if (error) - goto out_unlock; - } + error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (error) + goto out_unlock; + truncate_pagecache_range(VFS_I(ip), 0, -1); /* Verify O_DIRECT for ftmp */ if (VN_CACHED(VFS_I(tip)) != 0) { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index daf4066c24b2..c42f99e71f14 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -255,15 +255,14 @@ xfs_file_aio_read( xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp; - if ((iocb->ki_pos & target->bt_smask) || - (size & target->bt_smask)) { - if (iocb->ki_pos == i_size_read(inode)) + if ((pos & target->bt_smask) || (size & target->bt_smask)) { + if (pos == i_size_read(inode)) return 0; return -XFS_ERROR(EINVAL); } } - n = mp->m_super->s_maxbytes - iocb->ki_pos; + n = mp->m_super->s_maxbytes - pos; if (n <= 0 || size == 0) return 0; @@ -289,20 +288,21 @@ xfs_file_aio_read( xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { - ret = -xfs_flushinval_pages(ip, - (iocb->ki_pos & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); + ret = -filemap_write_and_wait_range( + VFS_I(ip)->i_mapping, + pos, -1); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } + truncate_pagecache_range(VFS_I(ip), pos, -1); } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } - trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); + trace_xfs_file_read(ip, size, pos, ioflags); - ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos); + ret = generic_file_aio_read(iocb, iovp, nr_segs, pos); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -670,10 +670,11 @@ xfs_file_dio_aio_write( goto out; if (mapping->nrpages) { - ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, - FI_REMAPF_LOCKED); + ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + pos, -1); if (ret) goto out; + truncate_pagecache_range(VFS_I(ip), pos, -1); } /* diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c deleted file mode 100644 index b5380893728e..000000000000 --- a/fs/xfs/xfs_fs_subr.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_vnodeops.h" -#include "xfs_bmap_btree.h" -#include "xfs_inode.h" -#include "xfs_trace.h" - -/* - * note: all filemap functions return negative error codes. These - * need to be inverted before returning to the xfs core functions. - */ -int -xfs_flushinval_pages( - xfs_inode_t *ip, - xfs_off_t first, - xfs_off_t last, - int fiopt) -{ - struct address_space *mapping = VFS_I(ip)->i_mapping; - int ret = 0; - - trace_xfs_pagecache_inval(ip, first, last); - - xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = filemap_write_and_wait_range(mapping, first, - last == -1 ? LLONG_MAX : last); - if (!ret) - truncate_inode_pages_range(mapping, first, last); - return -ret; -} diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c00326afa7bf..81c61fd17890 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1958,12 +1958,11 @@ xfs_free_file_space( rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); ioffset = offset & ~(rounding - 1); - - if (VN_CACHED(VFS_I(ip)) != 0) { - error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); - if (error) - goto out_unlock_iolock; - } + error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ioffset, -1); + if (error) + goto out_unlock_iolock; + truncate_pagecache_range(VFS_I(ip), ioffset, -1); /* * Need to zero the stuff we're not freeing, on disk. 
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 73cb3cb15f75..91a03fa3814f 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -48,8 +48,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); -int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, - xfs_off_t last, int fiopt); int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); From c3f8fc73ac97b76a12692088ef9cace9af8422c0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:01 +1100 Subject: [PATCH 49/78] xfs: make buffer read verification an IO completion function Add a verifier function callback capability to the buffer read interfaces. This will be used by the callers to supply a function that verifies the contents of the buffer when it is read from disk. This patch does not provide callback functions, but simply modifies the interfaces to allow them to be called. The reason for adding this to the read interfaces is that it is very difficult to tell from the outside if a buffer was just read from disk or whether we just pulled it out of cache. Supplying a callback allows the buffer cache to use its internal knowledge of the buffer to execute it only when the buffer is read from disk. It is intended that the verifier functions will mark the buffer with an EFSCORRUPTED error when verification fails. This allows the reading context to distinguish a verification error from an IO error, and potentially take further actions on the buffer (e.g. attempt repair) based on the error reported.
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 4 ++-- fs/xfs/xfs_attr.c | 2 +- fs/xfs/xfs_btree.c | 21 ++++++++++++--------- fs/xfs/xfs_buf.c | 13 +++++++++---- fs/xfs/xfs_buf.h | 20 ++++++++++++-------- fs/xfs/xfs_da_btree.c | 4 ++-- fs/xfs/xfs_dir2_leaf.c | 2 +- fs/xfs/xfs_dquot.c | 4 ++-- fs/xfs/xfs_fsops.c | 4 ++-- fs/xfs/xfs_ialloc.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_log.c | 3 +-- fs/xfs/xfs_log_recover.c | 8 +++++--- fs/xfs/xfs_mount.c | 6 +++--- fs/xfs/xfs_qm.c | 5 +++-- fs/xfs/xfs_rtalloc.c | 6 +++--- fs/xfs/xfs_trans.h | 19 ++++++++----------- fs/xfs/xfs_trans_buf.c | 9 ++++++--- fs/xfs/xfs_vnodeops.c | 2 +- 19 files changed, 75 insertions(+), 61 deletions(-) diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 335206a9c698..21c3db08fd01 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -447,7 +447,7 @@ xfs_alloc_read_agfl( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp); + XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -2110,7 +2110,7 @@ xfs_read_agf( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp); + XFS_FSS_TO_BB(mp, 1), flags, bpp, NULL); if (error) return error; if (!*bpp) diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 55bbe98e8f82..474c57a43cce 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -1994,7 +1994,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - dblkno, blkcnt, 0, &bp); + dblkno, blkcnt, 0, &bp, NULL); if (error) return(error); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 121ea99e615a..7e791160092d 100644 --- a/fs/xfs/xfs_btree.c +++ 
b/fs/xfs/xfs_btree.c @@ -266,9 +266,12 @@ xfs_btree_dup_cursor( for (i = 0; i < new->bc_nlevels; i++) { new->bc_ptrs[i] = cur->bc_ptrs[i]; new->bc_ra[i] = cur->bc_ra[i]; - if ((bp = cur->bc_bufs[i])) { - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) { + bp = cur->bc_bufs[i]; + if (bp) { + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_BUF_ADDR(bp), mp->m_bsize, + 0, &bp, NULL); + if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; return error; @@ -624,10 +627,10 @@ xfs_btree_read_bufl( ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp))) { + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, lock, &bp, NULL); + if (error) return error; - } ASSERT(!xfs_buf_geterror(bp)); if (bp) xfs_buf_set_ref(bp, refval); @@ -650,7 +653,7 @@ xfs_btree_reada_bufl( ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL); } /* @@ -670,7 +673,7 @@ xfs_btree_reada_bufs( ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL); } STATIC int @@ -1013,7 +1016,7 @@ xfs_btree_read_buf_block( d = xfs_btree_ptr_to_daddr(cur, ptr); error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, - mp->m_bsize, flags, bpp); + mp->m_bsize, flags, bpp, NULL); if (error) return error; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4b0b8dd1b7b0..0298dd684798 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -654,7 +654,8 @@ xfs_buf_read_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + xfs_buf_iodone_t verify) { 
struct xfs_buf *bp; @@ -666,6 +667,7 @@ xfs_buf_read_map( if (!XFS_BUF_ISDONE(bp)) { XFS_STATS_INC(xb_get_read); + bp->b_iodone = verify; _xfs_buf_read(bp, flags); } else if (flags & XBF_ASYNC) { /* @@ -691,13 +693,14 @@ void xfs_buf_readahead_map( struct xfs_buftarg *target, struct xfs_buf_map *map, - int nmaps) + int nmaps, + xfs_buf_iodone_t verify) { if (bdi_read_congested(target->bt_bdi)) return; xfs_buf_read_map(target, map, nmaps, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, verify); } /* @@ -709,7 +712,8 @@ xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, - int flags) + int flags, + xfs_buf_iodone_t verify) { xfs_buf_t *bp; int error; @@ -723,6 +727,7 @@ xfs_buf_read_uncached( bp->b_bn = daddr; bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; + bp->b_iodone = verify; xfsbdstrat(target->bt_mount, bp); error = xfs_buf_iowait(bp); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 7c0b6a0a1557..677b1dc822f4 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -100,6 +100,7 @@ typedef struct xfs_buftarg { struct xfs_buf; typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); + #define XB_PAGES 2 struct xfs_buf_map { @@ -159,7 +160,6 @@ typedef struct xfs_buf { #endif } xfs_buf_t; - /* Finding and Reading Buffers */ struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, @@ -196,9 +196,10 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, xfs_buf_flags_t flags); struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags); + xfs_buf_flags_t flags, xfs_buf_iodone_t verify); void xfs_buf_readahead_map(struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps); + struct xfs_buf_map *map, int nmaps, + xfs_buf_iodone_t verify); static inline struct xfs_buf * xfs_buf_get( @@ -216,20 +217,22 @@ xfs_buf_read( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, - 
xfs_buf_flags_t flags) + xfs_buf_flags_t flags, + xfs_buf_iodone_t verify) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_read_map(target, &map, 1, flags); + return xfs_buf_read_map(target, &map, 1, flags, verify); } static inline void xfs_buf_readahead( struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks) + size_t numblks, + xfs_buf_iodone_t verify) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_readahead_map(target, &map, 1); + return xfs_buf_readahead_map(target, &map, 1, verify); } struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); @@ -239,7 +242,8 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags); struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, - xfs_daddr_t daddr, size_t numblks, int flags); + xfs_daddr_t daddr, size_t numblks, int flags, + xfs_buf_iodone_t verify); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index c62e7e6ff50e..4af8bad7068c 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2161,7 +2161,7 @@ xfs_da_read_buf( error = xfs_trans_read_buf_map(dp->i_mount, trans, dp->i_mount->m_ddev_targp, - mapp, nmap, 0, &bp); + mapp, nmap, 0, &bp, NULL); if (error) goto out_free; @@ -2237,7 +2237,7 @@ xfs_da_reada_buf( } mappedbno = mapp[0].bm_bn; - xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap); + xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, NULL); out_free: if (mapp != &map) diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 0b296253bd01..bac86984e403 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -926,7 +926,7 @@ xfs_dir2_leaf_readbuf( XFS_FSB_TO_DADDR(mp, map[mip->ra_index].br_startblock + mip->ra_offset), - (int)BTOBB(mp->m_dirblksize)); + (int)BTOBB(mp->m_dirblksize), NULL); 
mip->ra_current = i; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index bf27fcca4843..e95f800333d4 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -439,7 +439,7 @@ xfs_qm_dqtobp( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, - 0, &bp); + 0, &bp, NULL); if (error || !bp) return XFS_ERROR(error); } @@ -920,7 +920,7 @@ xfs_qm_dqflush( * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp); + mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index bd9cb7f0b073..5440768ec41c 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -168,7 +168,7 @@ xfs_growfs_data_private( dpct = pct - mp->m_sb.sb_imax_pct; bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; xfs_buf_relse(bp); @@ -439,7 +439,7 @@ xfs_growfs_data_private( if (agno < oagcount) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp); + XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); } else { bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 37753e1c8537..12e3dead439d 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1490,7 +1490,7 @@ xfs_read_agi( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, bpp); + XFS_FSS_TO_BB(mp, 1), 0, bpp, NULL); if (error) return error; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7449cb943efd..8d6963010489 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -408,7 +408,7 @@ xfs_imap_to_bp( buf_flags |= XBF_UNMAPPED; error = 
xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - (int)imap->im_len, buf_flags, &bp); + (int)imap->im_len, buf_flags, &bp, NULL); if (error) { if (error != EAGAIN) { xfs_warn(mp, diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 46b6986e39b0..1d6d2ee08495 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1129,8 +1129,7 @@ xlog_iodone(xfs_buf_t *bp) * with it being freed after writing the unmount record to the * log. */ - -} /* xlog_iodone */ +} /* * Return size of each in-core log record buffer. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 3e06333d4bd1..eb1e29ff0c7c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2144,7 +2144,7 @@ xlog_recover_buffer_pass2( buf_flags |= XBF_UNMAPPED; bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags); + buf_flags, NULL); if (!bp) return XFS_ERROR(ENOMEM); error = bp->b_error; @@ -2237,7 +2237,8 @@ xlog_recover_inode_pass2( } trace_xfs_log_recover_inode_recover(log, in_f); - bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0); + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, + NULL); if (!bp) { error = ENOMEM; goto error; @@ -2548,7 +2549,8 @@ xlog_recover_dquot_pass2( ASSERT(dq_f->qlf_len == 1); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, + NULL); if (error) return error; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 41ae7e1590f5..d5402b0eb6a3 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -652,7 +652,7 @@ xfs_readsb(xfs_mount_t *mp, int flags) reread: bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0); + BTOBB(sector_size), 0, NULL); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); @@ -1002,7 +1002,7 @@ xfs_check_sizes(xfs_mount_t *mp) } bp = xfs_buf_read_uncached(mp->m_ddev_targp, d - 
XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "last sector read failed"); return EIO; @@ -1017,7 +1017,7 @@ xfs_check_sizes(xfs_mount_t *mp) } bp = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "log device read failed"); return EIO; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 48c750b0e830..688f608b3668 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -892,7 +892,7 @@ xfs_qm_dqiter_bufs( while (blkcnt--) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), - mp->m_quotainfo->qi_dqchunklen, 0, &bp); + mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); if (error) break; @@ -979,7 +979,8 @@ xfs_qm_dqiterate( while (rablkcnt--) { xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, rablkno), - mp->m_quotainfo->qi_dqchunklen); + mp->m_quotainfo->qi_dqchunklen, + NULL); rablkno++; } } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index a69e0b4750a9..b271ed939d7b 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -870,7 +870,7 @@ xfs_rtbuf_get( ASSERT(map.br_startblock != NULLFSBLOCK); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), - mp->m_bsize, 0, &bp); + mp->m_bsize, 0, &bp, NULL); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -1873,7 +1873,7 @@ xfs_growfs_rt( */ bp = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, nrblocks - 1), - XFS_FSB_TO_BB(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; xfs_buf_relse(bp); @@ -2220,7 +2220,7 @@ xfs_rtmount_init( } bp = xfs_buf_read_uncached(mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) { xfs_warn(mp, "realtime device size check failed"); return EIO; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index db056544cbb5..f02d40296506 100644 
--- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -464,10 +464,7 @@ xfs_trans_get_buf( int numblks, uint flags) { - struct xfs_buf_map map = { - .bm_bn = blkno, - .bm_len = numblks, - }; + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_trans_get_buf_map(tp, target, &map, 1, flags); } @@ -476,7 +473,8 @@ int xfs_trans_read_buf_map(struct xfs_mount *mp, struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - struct xfs_buf **bpp); + struct xfs_buf **bpp, + xfs_buf_iodone_t verify); static inline int xfs_trans_read_buf( @@ -486,13 +484,12 @@ xfs_trans_read_buf( xfs_daddr_t blkno, int numblks, xfs_buf_flags_t flags, - struct xfs_buf **bpp) + struct xfs_buf **bpp, + xfs_buf_iodone_t verify) { - struct xfs_buf_map map = { - .bm_bn = blkno, - .bm_len = numblks, - }; - return xfs_trans_read_buf_map(mp, tp, target, &map, 1, flags, bpp); + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_trans_read_buf_map(mp, tp, target, &map, 1, + flags, bpp, verify); } struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 6311b99c267f..977628207b45 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -257,7 +257,8 @@ xfs_trans_read_buf_map( struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - struct xfs_buf **bpp) + struct xfs_buf **bpp, + xfs_buf_iodone_t verify) { xfs_buf_t *bp; xfs_buf_log_item_t *bip; @@ -265,7 +266,7 @@ xfs_trans_read_buf_map( *bpp = NULL; if (!tp) { - bp = xfs_buf_read_map(target, map, nmaps, flags); + bp = xfs_buf_read_map(target, map, nmaps, flags, verify); if (!bp) return (flags & XBF_TRYLOCK) ? 
EAGAIN : XFS_ERROR(ENOMEM); @@ -312,7 +313,9 @@ xfs_trans_read_buf_map( if (!(XFS_BUF_ISDONE(bp))) { trace_xfs_trans_read_buf_io(bp, _RET_IP_); ASSERT(!XFS_BUF_ISASYNC(bp)); + ASSERT(bp->b_iodone == NULL); XFS_BUF_READ(bp); + bp->b_iodone = verify; xfsbdstrat(tp->t_mountp, bp); error = xfs_buf_iowait(bp); if (error) { @@ -349,7 +352,7 @@ xfs_trans_read_buf_map( return 0; } - bp = xfs_buf_read_map(target, map, nmaps, flags); + bp = xfs_buf_read_map(target, map, nmaps, flags, verify); if (bp == NULL) { *bpp = NULL; return (flags & XBF_TRYLOCK) ? diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 81c61fd17890..26880793feca 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -80,7 +80,7 @@ xfs_readlink_bmap( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL); if (!bp) return XFS_ERROR(ENOMEM); error = bp->b_error; From eab4e63368b4cfa597dbdac66d1a7a836a693b7d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:02 +1100 Subject: [PATCH 50/78] xfs: uncached buffer reads need to return an error With verification being done as an IO completion callback, different errors can be returned from a read. Uncached reads only return a buffer or NULL on failure, which means the verification error cannot be returned to the caller. Split the error handling for these reads into two - a failure to get a buffer will still return NULL, but a read error will return a referenced buffer with b_error set rather than NULL. The caller is responsible for checking the error state of the buffer returned. 
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 9 ++------- fs/xfs/xfs_fsops.c | 5 +++++ fs/xfs/xfs_mount.c | 6 ++++++ fs/xfs/xfs_rtalloc.c | 9 ++++++++- 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 0298dd684798..fbc965fc075a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -715,8 +715,7 @@ xfs_buf_read_uncached( int flags, xfs_buf_iodone_t verify) { - xfs_buf_t *bp; - int error; + struct xfs_buf *bp; bp = xfs_buf_get_uncached(target, numblks, flags); if (!bp) @@ -730,11 +729,7 @@ xfs_buf_read_uncached( bp->b_iodone = verify; xfsbdstrat(target->bt_mount, bp); - error = xfs_buf_iowait(bp); - if (error) { - xfs_buf_relse(bp); - return NULL; - } + xfs_buf_iowait(bp); return bp; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 5440768ec41c..f35f8d7731f0 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -171,6 +171,11 @@ xfs_growfs_data_private( XFS_FSS_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; + if (bp->b_error) { + int error = bp->b_error; + xfs_buf_relse(bp); + return error; + } xfs_buf_relse(bp); new = nb; /* use new as a temporary here */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d5402b0eb6a3..df6d0b2aade1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -658,6 +658,12 @@ reread: xfs_warn(mp, "SB buffer read failed"); return EIO; } + if (bp->b_error) { + error = bp->b_error; + if (loud) + xfs_warn(mp, "SB validate failed"); + goto release_buf; + } /* * Initialize the mount structure from the superblock. 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index b271ed939d7b..98dc670d3ee0 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1876,6 +1876,11 @@ xfs_growfs_rt( XFS_FSB_TO_BB(mp, 1), 0, NULL); if (!bp) return EIO; + if (bp->b_error) { + error = bp->b_error; + xfs_buf_relse(bp); + return error; + } xfs_buf_relse(bp); /* @@ -2221,8 +2226,10 @@ xfs_rtmount_init( bp = xfs_buf_read_uncached(mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), XFS_FSB_TO_BB(mp, 1), 0, NULL); - if (!bp) { + if (!bp || bp->b_error) { xfs_warn(mp, "realtime device size check failed"); + if (bp) + xfs_buf_relse(bp); return EIO; } xfs_buf_relse(bp); From 98021821a502db347bd9c7671beeee6e8ce07ea6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:03 +1100 Subject: [PATCH 51/78] xfs: verify superblocks as they are read from disk Add a superblock verify callback function and pass it into the buffer read functions. Remove the now redundant verification code that is currently in use. Adding verification shows that secondary superblocks never have their "sb_inprogress" flag cleared by mkfs.xfs, so when validating the secondary superblocks during a grow operation we have to avoid checking this field. Even if we fix mkfs, we will still have to ignore this field for verification purposes unless a version of mkfs that does not have this bug was used. 
Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_fsops.c | 4 +- fs/xfs/xfs_log_recover.c | 5 +- fs/xfs/xfs_mount.c | 98 +++++++++++++++++++++++++--------------- fs/xfs/xfs_mount.h | 3 +- 4 files changed, 69 insertions(+), 41 deletions(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index f35f8d7731f0..cb65b067ed31 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -444,7 +444,8 @@ xfs_growfs_data_private( if (agno < oagcount) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSS_TO_BB(mp, 1), 0, &bp, + xfs_sb_read_verify); } else { bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), @@ -462,6 +463,7 @@ xfs_growfs_data_private( break; } xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); + /* * If we get an error writing out the alternate superblocks, * just issue a warning and continue. The real work is diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index eb1e29ff0c7c..924a4bc3d49a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3692,13 +3692,14 @@ xlog_do_recover( /* * Now that we've finished replaying all buffer and inode - * updates, re-read in the superblock. + * updates, re-read in the superblock and reverify it. 
*/ bp = xfs_getsb(log->l_mp, 0); XFS_BUF_UNDONE(bp); ASSERT(!(XFS_BUF_ISWRITE(bp))); XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); + bp->b_iodone = xfs_sb_read_verify; xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) { @@ -3710,7 +3711,7 @@ xlog_do_recover( /* Convert superblock from on-disk format */ sbp = &log->l_mp->m_sb; - xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); ASSERT(xfs_sb_good_version(sbp)); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index df6d0b2aade1..bff18d73c610 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -304,9 +304,8 @@ STATIC int xfs_mount_validate_sb( xfs_mount_t *mp, xfs_sb_t *sbp, - int flags) + bool check_inprogress) { - int loud = !(flags & XFS_MFSI_QUIET); /* * If the log device and data device have the @@ -316,21 +315,18 @@ xfs_mount_validate_sb( * a volume filesystem in a non-volume manner. */ if (sbp->sb_magicnum != XFS_SB_MAGIC) { - if (loud) - xfs_warn(mp, "bad magic number"); + xfs_warn(mp, "bad magic number"); return XFS_ERROR(EWRONGFS); } if (!xfs_sb_good_version(sbp)) { - if (loud) - xfs_warn(mp, "bad version"); + xfs_warn(mp, "bad version"); return XFS_ERROR(EWRONGFS); } if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { - if (loud) - xfs_warn(mp, + xfs_warn(mp, "filesystem is marked as having an external log; " "specify logdev on the mount command line."); return XFS_ERROR(EINVAL); @@ -338,8 +334,7 @@ xfs_mount_validate_sb( if (unlikely( sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { - if (loud) - xfs_warn(mp, + xfs_warn(mp, "filesystem is marked as having an internal log; " "do not specify logdev on the mount command line."); return XFS_ERROR(EINVAL); @@ -373,8 +368,7 @@ xfs_mount_validate_sb( sbp->sb_dblocks == 0 || sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { - if (loud) - 
XFS_CORRUPTION_ERROR("SB sanity check failed", + XFS_CORRUPTION_ERROR("SB sanity check failed", XFS_ERRLEVEL_LOW, mp, sbp); return XFS_ERROR(EFSCORRUPTED); } @@ -383,12 +377,10 @@ xfs_mount_validate_sb( * Until this is fixed only page-sized or smaller data blocks work. */ if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { - if (loud) { - xfs_warn(mp, + xfs_warn(mp, "File system with blocksize %d bytes. " "Only pagesize (%ld) or less will currently work.", sbp->sb_blocksize, PAGE_SIZE); - } return XFS_ERROR(ENOSYS); } @@ -402,23 +394,20 @@ xfs_mount_validate_sb( case 2048: break; default: - if (loud) - xfs_warn(mp, "inode size of %d bytes not supported", + xfs_warn(mp, "inode size of %d bytes not supported", sbp->sb_inodesize); return XFS_ERROR(ENOSYS); } if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { - if (loud) - xfs_warn(mp, + xfs_warn(mp, "file system too large to be mounted on this system."); return XFS_ERROR(EFBIG); } - if (unlikely(sbp->sb_inprogress)) { - if (loud) - xfs_warn(mp, "file system busy"); + if (check_inprogress && sbp->sb_inprogress) { + xfs_warn(mp, "Offline file system operation in progress!"); return XFS_ERROR(EFSCORRUPTED); } @@ -426,9 +415,7 @@ xfs_mount_validate_sb( * Version 1 directory format has never worked on Linux. 
*/ if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { - if (loud) - xfs_warn(mp, - "file system using version 1 directory format"); + xfs_warn(mp, "file system using version 1 directory format"); return XFS_ERROR(ENOSYS); } @@ -521,11 +508,9 @@ out_unwind: void xfs_sb_from_disk( - struct xfs_mount *mp, + struct xfs_sb *to, xfs_dsb_t *from) { - struct xfs_sb *to = &mp->m_sb; - to->sb_magicnum = be32_to_cpu(from->sb_magicnum); to->sb_blocksize = be32_to_cpu(from->sb_blocksize); to->sb_dblocks = be64_to_cpu(from->sb_dblocks); @@ -627,6 +612,50 @@ xfs_sb_to_disk( } } +void +xfs_sb_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_sb sb; + int error; + + xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); + + /* + * Only check the in progress field for the primary superblock as + * mkfs.xfs doesn't clear it from secondary superblocks. + */ + error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); + if (error) + xfs_buf_ioerror(bp, error); + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +/* + * We may be probed for a filesystem match, so we may not want to emit + * messages when the superblock buffer is not actually an XFS superblock. + * If we find an XFS superblock, then run a normal, noisy mount because we are + * really going to mount it and want to know about errors. + */ +void +xfs_sb_quiet_read_verify( + struct xfs_buf *bp) +{ + struct xfs_sb sb; + + xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); + + if (sb.sb_magicnum == XFS_SB_MAGIC) { + /* XFS filesystem, verify noisily! */ + xfs_sb_read_verify(bp); + return; + } + /* quietly fail */ + xfs_buf_ioerror(bp, EFSCORRUPTED); +} + /* * xfs_readsb * @@ -652,7 +681,9 @@ xfs_readsb(xfs_mount_t *mp, int flags) reread: bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0, NULL); + BTOBB(sector_size), 0, + loud ?
xfs_sb_read_verify + : xfs_sb_quiet_read_verify); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); @@ -667,15 +698,8 @@ reread: /* * Initialize the mount structure from the superblock. - * But first do some basic consistency checking. */ - xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp)); - error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); - if (error) { - if (loud) - xfs_warn(mp, "SB validate failed"); - goto release_buf; - } + xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); /* * We must be able to do sector-sized and sector-aligned IO. diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index dc306a09f56f..de9089acc610 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -385,10 +385,11 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); #endif /* __KERNEL__ */ +extern void xfs_sb_read_verify(struct xfs_buf *); extern void xfs_mod_sb(struct xfs_trans *, __int64_t); extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, xfs_agnumber_t *); -extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *); +extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); #endif /* __XFS_MOUNT_H__ */ From 5d5f527d13369d0047d52b7ac4ddee4f8c0ad173 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:44:56 +1100 Subject: [PATCH 52/78] xfs: verify AGF blocks as they are read from disk Add an AGF block verify callback function and pass it into the buffer read functions. This replaces the existing verification that is done after the read completes. 
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 68 ++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 26 deletions(-) diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 21c3db08fd01..c9eb955a49c7 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2091,6 +2091,47 @@ xfs_alloc_put_freelist( return 0; } +static void +xfs_agf_read_verify( + struct xfs_buf *bp) + { + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agf *agf; + int agf_ok; + + agf = XFS_BUF_TO_AGF(bp); + + agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && + XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && + be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp); + + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag) + agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) == + bp->b_pag->pag_agno; + + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= + be32_to_cpu(agf->agf_length); + + if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + /* * Read in the allocation group header (free/alloc section). 
*/ @@ -2102,44 +2143,19 @@ xfs_read_agf( int flags, /* XFS_BUF_ */ struct xfs_buf **bpp) /* buffer for the ag freelist header */ { - struct xfs_agf *agf; /* ag freelist header */ - int agf_ok; /* set if agf is consistent */ int error; ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp, NULL); + XFS_FSS_TO_BB(mp, 1), flags, bpp, xfs_agf_read_verify); if (error) return error; if (!*bpp) return 0; ASSERT(!(*bpp)->b_error); - agf = XFS_BUF_TO_AGF(*bpp); - - /* - * Validate the magic number of the agf block. - */ - agf_ok = - agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && - XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && - be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && - be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_seqno) == agno; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) - agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= - be32_to_cpu(agf->agf_length); - if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, - XFS_RANDOM_ALLOC_READ_AGF))) { - XFS_CORRUPTION_ERROR("xfs_alloc_read_agf", - XFS_ERRLEVEL_LOW, mp, agf); - xfs_trans_brelse(tp, *bpp); - return XFS_ERROR(EFSCORRUPTED); - } xfs_buf_set_ref(*bpp, XFS_AGF_REF); return 0; } From 3702ce6ed71cd60451ab278088863456dcb0dd99 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:05 +1100 Subject: [PATCH 53/78] xfs: verify AGI blocks as they are read from disk Add an AGI block verify callback function and pass it into the buffer read functions. Remove the now redundant verification code that is currently in use. 
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 56 ++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 12e3dead439d..5bd255e5f7b8 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1472,6 +1472,40 @@ xfs_check_agi_unlinked( #define xfs_check_agi_unlinked(agi) #endif +static void +xfs_agi_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + int agi_ok; + + /* + * Validate the magic number of the agi block. + */ + agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) && + XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); + + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. 
+ */ + if (bp->b_pag) + agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) == + bp->b_pag->pag_agno; + + if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, + XFS_RANDOM_IALLOC_READ_AGI))) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + xfs_check_agi_unlinked(agi); + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + /* * Read in the allocation group header (inode allocation section) */ @@ -1482,38 +1516,18 @@ xfs_read_agi( xfs_agnumber_t agno, /* allocation group number */ struct xfs_buf **bpp) /* allocation group hdr buf */ { - struct xfs_agi *agi; /* allocation group header */ - int agi_ok; /* agi is consistent */ int error; ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, bpp, NULL); + XFS_FSS_TO_BB(mp, 1), 0, bpp, xfs_agi_read_verify); if (error) return error; ASSERT(!xfs_buf_geterror(*bpp)); - agi = XFS_BUF_TO_AGI(*bpp); - - /* - * Validate the magic number of the agi block. - */ - agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) && - XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) && - be32_to_cpu(agi->agi_seqno) == agno; - if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, - XFS_RANDOM_IALLOC_READ_AGI))) { - XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW, - mp, agi); - xfs_trans_brelse(tp, *bpp); - return XFS_ERROR(EFSCORRUPTED); - } - xfs_buf_set_ref(*bpp, XFS_AGI_REF); - - xfs_check_agi_unlinked(agi); return 0; } From bb80c6d79a3b0f9b6c3236a4bec021c72615bfd1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:06 +1100 Subject: [PATCH 54/78] xfs: verify AGFL blocks as they are read from disk Add an AGFL block verify callback function and pass it into the buffer read functions. 
While this commit adds verification code to the AGFL, it cannot be used reliably until the CRC format change comes along as mkfs does not initialise the full AGFL. Hence it can be full of garbage at the first mount and will fail verification right now. CRC enabled filesystems won't have this problem, so leave the code that has already been written ifdef'd out until the proper time. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index c9eb955a49c7..38b4ab8957ff 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -430,6 +430,43 @@ xfs_alloc_fixup_trees( return 0; } +void +xfs_agfl_read_verify( + struct xfs_buf *bp) +{ +#ifdef WHEN_CRCS_COME_ALONG + /* + * we cannot actually do any verification of the AGFL because mkfs does + * not initialise the AGFL to zero or NULL. Hence the only valid part of + * the AGFL is what the AGF says is active. We can't get to the AGF, so + * we can't verify just those entries are valid. + * + * This problem goes away when the CRC format change comes along as that + * requires the AGFL to be initialised by mkfs. At that point, we can + * verify the blocks in the agfl -active or not- lie within the bounds + * of the AG. Until then, just leave this check ifdef'd out. + */ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); + int agfl_ok = 1; + + int i; + + for (i = 0; i < XFS_AGFL_SIZE(mp); i++) { + if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK || + be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) + agfl_ok = 0; + } + + if (!agfl_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +#endif + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + /* * Read in the allocation group free block array. 
*/ @@ -447,7 +484,7 @@ xfs_alloc_read_agfl( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSS_TO_BB(mp, 1), 0, &bp, xfs_agfl_read_verify); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); From af133e8606d32c2aed43870491ebbdc56feec8a8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:07 +1100 Subject: [PATCH 55/78] xfs: verify inode buffers as they are read from disk Add an inode buffer verify callback function and pass it into the buffer read functions. Inodes are special in that the verbose checks will be done when reading the inode, but we still need to sanity check the buffer when that is first read. Always verify the magic numbers in all inodes in the buffer, rather than just on debug kernels. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_inode.c | 102 +++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 50 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8d6963010489..514eac913f1c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -382,6 +382,46 @@ xfs_inobp_check( } #endif +static void +xfs_inode_buf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int i; + int ni; + + /* + * Validate the magic number and version of every inode in the buffer + */ + ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; + for (i = 0; i < ni; i++) { + int di_ok; + xfs_dinode_t *dip; + + dip = (struct xfs_dinode *)xfs_buf_offset(bp, + (i << mp->m_sb.sb_inodelog)); + di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + XFS_DINODE_GOOD_VERSION(dip->di_version); + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, + XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, + mp, dip); +#ifdef DEBUG + xfs_emerg(mp, + "bad inode
magic/vsn daddr %lld #%d (magic=%x)", + (unsigned long long)bp->b_bn, i, + be16_to_cpu(dip->di_magic)); + ASSERT(0); +#endif + } + } + xfs_inobp_check(mp, bp); + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + /* * This routine is called to map an inode to the buffer containing the on-disk * version of the inode. It returns a pointer to the buffer containing the @@ -396,71 +436,33 @@ xfs_imap_to_bp( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_imap *imap, - struct xfs_dinode **dipp, + struct xfs_dinode **dipp, struct xfs_buf **bpp, uint buf_flags, uint iget_flags) { struct xfs_buf *bp; int error; - int i; - int ni; buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - (int)imap->im_len, buf_flags, &bp, NULL); + (int)imap->im_len, buf_flags, &bp, + xfs_inode_buf_verify); if (error) { - if (error != EAGAIN) { - xfs_warn(mp, - "%s: xfs_trans_read_buf() returned error %d.", - __func__, error); - } else { + if (error == EAGAIN) { ASSERT(buf_flags & XBF_TRYLOCK); + return error; } + + if (error == EFSCORRUPTED && + (iget_flags & XFS_IGET_UNTRUSTED)) + return XFS_ERROR(EINVAL); + + xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", + __func__, error); return error; } - /* - * Validate the magic number and version of every inode in the buffer - * (if DEBUG kernel) or the first inode in the buffer, otherwise. 
- */ -#ifdef DEBUG - ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog; -#else /* usual case */ - ni = 1; -#endif - - for (i = 0; i < ni; i++) { - int di_ok; - xfs_dinode_t *dip; - - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - (i << mp->m_sb.sb_inodelog)); - di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && - XFS_DINODE_GOOD_VERSION(dip->di_version); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { - if (iget_flags & XFS_IGET_UNTRUSTED) { - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EINVAL); - } - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, - mp, dip); -#ifdef DEBUG - xfs_emerg(mp, - "bad inode magic/vsn daddr %lld #%d (magic=%x)", - (unsigned long long)imap->im_blkno, i, - be16_to_cpu(dip->di_magic)); - ASSERT(0); -#endif - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); - } - } - - xfs_inobp_check(mp, bp); - *bpp = bp; *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); return 0; From 3d3e6f64e22c94115d47de670611bcd3ecda3796 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:08 +1100 Subject: [PATCH 56/78] xfs: verify btree blocks as they are read from disk Add a btree block verify callback function and pass it into the buffer read functions. Because each different btree block type requires different verification, add a function to the ops structure that is called from the generic code. Also, propagate the verification callback functions through the readahead functions, and into the external bmap and bulkstat inode readahead code that uses the generic btree buffer read functions.
Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc_btree.c | 61 +++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_bmap.c | 60 +++++++++++++++++++++--------------- fs/xfs/xfs_bmap_btree.c | 47 ++++++++++++++++++++++++++++ fs/xfs/xfs_bmap_btree.h | 1 + fs/xfs/xfs_btree.c | 64 ++++++++++++++++++++------------------- fs/xfs/xfs_btree.h | 10 ++++-- fs/xfs/xfs_ialloc_btree.c | 40 ++++++++++++++++++++++++ fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_inode.h | 1 + fs/xfs/xfs_itable.c | 3 +- 10 files changed, 229 insertions(+), 60 deletions(-) diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index f7876c6d6165..46961e52e9b8 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -272,6 +272,66 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } +void +xfs_allocbt_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + int sblock_ok; /* block passes checks */ + + /* + * magic number and level verification + * + * During growfs operations, we can't verify the exact level as the + * perag is not fully initialised and hence not attached to the buffer. + * In this case, check against the maximum tree depth. 
+ */ + level = be16_to_cpu(block->bb_level); + switch (block->bb_magic) { + case cpu_to_be32(XFS_ABTB_MAGIC): + if (pag) + sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi]; + else + sblock_ok = level < mp->m_ag_maxlevels; + break; + case cpu_to_be32(XFS_ABTC_MAGIC): + if (pag) + sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi]; + else + sblock_ok = level < mp->m_ag_maxlevels; + break; + default: + sblock_ok = 0; + break; + } + + /* numrecs verification */ + sblock_ok = sblock_ok && + be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0]; + + /* sibling pointer verification */ + sblock_ok = sblock_ok && + (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_leftsib && + (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_rightsib; + + if (!sblock_ok) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR("xfs_allocbt_read_verify", + XFS_ERRLEVEL_LOW, mp, block); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + #ifdef DEBUG STATIC int xfs_allocbt_keys_inorder( @@ -327,6 +387,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, + .read_verify = xfs_allocbt_read_verify, #ifdef DEBUG .keys_inorder = xfs_allocbt_keys_inorder, .recs_inorder = xfs_allocbt_recs_inorder, diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index a60f3d1f151c..9ae7aba52e0f 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2662,8 +2662,9 @@ xfs_bmap_btree_to_extents( if ((error = xfs_btree_check_lptr(cur, cbno, 1))) return error; #endif - if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, + 
xfs_bmbt_read_verify); + if (error) return error; cblock = XFS_BUF_TO_BLOCK(cbp); if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) @@ -4078,8 +4079,9 @@ xfs_bmap_read_extents( * pointer (leftmost) at each level. */ while (level-- > 0) { - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, xfs_bmbt_read_verify); + if (error) return error; block = XFS_BUF_TO_BLOCK(bp); XFS_WANT_CORRUPTED_GOTO( @@ -4124,7 +4126,8 @@ xfs_bmap_read_extents( */ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); if (nextbno != NULLFSBLOCK) - xfs_btree_reada_bufl(mp, nextbno, 1); + xfs_btree_reada_bufl(mp, nextbno, 1, + xfs_bmbt_read_verify); /* * Copy records into the extent records. */ @@ -4156,8 +4159,9 @@ xfs_bmap_read_extents( */ if (bno == NULLFSBLOCK) break; - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, xfs_bmbt_read_verify); + if (error) return error; block = XFS_BUF_TO_BLOCK(bp); } @@ -5868,15 +5872,16 @@ xfs_bmap_check_leaf_extents( */ while (level-- > 0) { /* See if buf is in cur first */ + bp_release = 0; bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (bp) { - bp_release = 0; - } else { + if (!bp) { bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + xfs_bmbt_read_verify); + if (error) + goto error_norelse; } - if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) - goto error_norelse; block = XFS_BUF_TO_BLOCK(bp); XFS_WANT_CORRUPTED_GOTO( xfs_bmap_sanity_check(mp, bp, level), @@ -5953,15 +5958,16 @@ xfs_bmap_check_leaf_extents( if (bno == NULLFSBLOCK) break; + bp_release = 0; bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (bp) { - bp_release = 0; - } else { + if (!bp) { bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + 
xfs_bmbt_read_verify); + if (error) + goto error_norelse; } - if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) - goto error_norelse; block = XFS_BUF_TO_BLOCK(bp); } if (bp_release) { @@ -6052,7 +6058,9 @@ xfs_bmap_count_tree( struct xfs_btree_block *block, *nextblock; int numrecs; - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, + xfs_bmbt_read_verify); + if (error) return error; *count += 1; block = XFS_BUF_TO_BLOCK(bp); @@ -6061,8 +6069,10 @@ xfs_bmap_count_tree( /* Not at node above leaves, count this level of nodes */ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); while (nextbno != NULLFSBLOCK) { - if ((error = xfs_btree_read_bufl(mp, tp, nextbno, - 0, &nbp, XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, + XFS_BMAP_BTREE_REF, + xfs_bmbt_read_verify); + if (error) return error; *count += 1; nextblock = XFS_BUF_TO_BLOCK(nbp); @@ -6091,8 +6101,10 @@ xfs_bmap_count_tree( if (nextbno == NULLFSBLOCK) break; bno = nextbno; - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + xfs_bmbt_read_verify); + if (error) return error; *count += 1; block = XFS_BUF_TO_BLOCK(bp); diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 862084a47a7e..bddca9b92869 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -36,6 +36,7 @@ #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_quota.h" +#include "xfs_trace.h" /* * Determine the extent state. @@ -707,6 +708,51 @@ xfs_bmbt_key_diff( cur->bc_rec.b.br_startoff; } +void +xfs_bmbt_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + unsigned int level; + int lblock_ok; /* block passes checks */ + + /* magic number and level verification. 
+ * + * We don't know what fork we belong to, so just verify that the level + * is less than the maximum of the two. Later checks will be more + * precise. + */ + level = be16_to_cpu(block->bb_level); + lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) && + level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]); + + /* numrecs verification */ + lblock_ok = lblock_ok && + be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0]; + + /* sibling pointer verification */ + lblock_ok = lblock_ok && + block->bb_u.l.bb_leftsib && + (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_leftsib))) && + block->bb_u.l.bb_rightsib && + (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_rightsib))); + + if (!lblock_ok) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR("xfs_bmbt_read_verify", + XFS_ERRLEVEL_LOW, mp, block); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + #ifdef DEBUG STATIC int xfs_bmbt_keys_inorder( @@ -746,6 +792,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, + .read_verify = xfs_bmbt_read_verify, #ifdef DEBUG .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 0e66c4ea0f85..1d00fbe9dd79 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -232,6 +232,7 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); +extern void xfs_bmbt_read_verify(struct xfs_buf *bp); extern struct
xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 7e791160092d..ef1066078c33 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -270,7 +270,8 @@ xfs_btree_dup_cursor( if (bp) { error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_BUF_ADDR(bp), mp->m_bsize, - 0, &bp, NULL); + 0, &bp, + cur->bc_ops->read_verify); if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; @@ -612,23 +613,24 @@ xfs_btree_offsets( * Get a buffer for the block, return it read in. * Long-form addressing. */ -int /* error */ +int xfs_btree_read_bufl( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_fsblock_t fsbno, /* file system block number */ - uint lock, /* lock flags for read_buf */ - xfs_buf_t **bpp, /* buffer for fsbno */ - int refval) /* ref count value for buffer */ + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock, /* lock flags for read_buf */ + struct xfs_buf **bpp, /* buffer for fsbno */ + int refval, /* ref count value for buffer */ + xfs_buf_iodone_t verify) { - xfs_buf_t *bp; /* return value */ + struct xfs_buf *bp; /* return value */ xfs_daddr_t d; /* real disk block address */ - int error; + int error; ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp, NULL); + mp->m_bsize, lock, &bp, verify); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -645,15 +647,16 @@ xfs_btree_read_bufl( /* ARGSUSED */ void xfs_btree_reada_bufl( - xfs_mount_t *mp, /* file system mount point */ - xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count) /* count of filesystem blocks */ + struct xfs_mount *mp, /* file system mount point */ + xfs_fsblock_t fsbno, /* file system block 
number */ + xfs_extlen_t count, /* count of filesystem blocks */ + xfs_buf_iodone_t verify) { xfs_daddr_t d; ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, verify); } /* @@ -663,17 +666,18 @@ xfs_btree_reada_bufl( /* ARGSUSED */ void xfs_btree_reada_bufs( - xfs_mount_t *mp, /* file system mount point */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count) /* count of filesystem blocks */ + struct xfs_mount *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + xfs_buf_iodone_t verify) { xfs_daddr_t d; ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, verify); } STATIC int @@ -687,12 +691,14 @@ xfs_btree_readahead_lblock( xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { - xfs_btree_reada_bufl(cur->bc_mp, left, 1); + xfs_btree_reada_bufl(cur->bc_mp, left, 1, + cur->bc_ops->read_verify); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { - xfs_btree_reada_bufl(cur->bc_mp, right, 1); + xfs_btree_reada_bufl(cur->bc_mp, right, 1, + cur->bc_ops->read_verify); rval++; } @@ -712,13 +718,13 @@ xfs_btree_readahead_sblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - left, 1); + left, 1, cur->bc_ops->read_verify); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - right, 1); + right, 1, cur->bc_ops->read_verify); 
rval++; } @@ -1016,19 +1022,15 @@ xfs_btree_read_buf_block( d = xfs_btree_ptr_to_daddr(cur, ptr); error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, - mp->m_bsize, flags, bpp, NULL); + mp->m_bsize, flags, bpp, + cur->bc_ops->read_verify); if (error) return error; ASSERT(!xfs_buf_geterror(*bpp)); - xfs_btree_set_refs(cur, *bpp); *block = XFS_BUF_TO_BLOCK(*bpp); - - error = xfs_btree_check_block(cur, *block, level, *bpp); - if (error) - xfs_trans_brelse(cur->bc_tp, *bpp); - return error; + return 0; } /* diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index c9cf2d00e236..3a4c314047a0 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -188,6 +188,7 @@ struct xfs_btree_ops { __int64_t (*key_diff)(struct xfs_btree_cur *cur, union xfs_btree_key *key); + void (*read_verify)(struct xfs_buf *bp); #ifdef DEBUG /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, @@ -355,7 +356,8 @@ xfs_btree_read_bufl( xfs_fsblock_t fsbno, /* file system block number */ uint lock, /* lock flags for read_buf */ struct xfs_buf **bpp, /* buffer for fsbno */ - int refval);/* ref count value for buffer */ + int refval, /* ref count value for buffer */ + xfs_buf_iodone_t verify); /* * Read-ahead the block, don't wait for it, don't return a buffer. @@ -365,7 +367,8 @@ void /* error */ xfs_btree_reada_bufl( struct xfs_mount *mp, /* file system mount point */ xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count); /* count of filesystem blocks */ + xfs_extlen_t count, /* count of filesystem blocks */ + xfs_buf_iodone_t verify); /* * Read-ahead the block, don't wait for it, don't return a buffer. 
@@ -376,7 +379,8 @@ xfs_btree_reada_bufs( struct xfs_mount *mp, /* file system mount point */ xfs_agnumber_t agno, /* allocation group number */ xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count); /* count of filesystem blocks */ + xfs_extlen_t count, /* count of filesystem blocks */ + xfs_buf_iodone_t verify); /* * Initialise a new btree block header diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 2b8b7a37aa18..11306c6d61c7 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -33,6 +33,7 @@ #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_trace.h" STATIC int @@ -181,6 +182,44 @@ xfs_inobt_key_diff( cur->bc_rec.i.ir_startino; } +void +xfs_inobt_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + unsigned int level; + int sblock_ok; /* block passes checks */ + + /* magic number and level verification */ + level = be16_to_cpu(block->bb_level); + sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) && + level < mp->m_in_maxlevels; + + /* numrecs verification */ + sblock_ok = sblock_ok && + be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0]; + + /* sibling pointer verification */ + sblock_ok = sblock_ok && + (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_leftsib && + (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && + block->bb_u.s.bb_rightsib; + + if (!sblock_ok) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_CORRUPTION_ERROR("xfs_inobt_read_verify", + XFS_ERRLEVEL_LOW, mp, block); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + #ifdef DEBUG STATIC int xfs_inobt_keys_inorder( @@ -218,6 +257,7 @@ static const struct xfs_btree_ops 
xfs_inobt_ops = { .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, + .read_verify = xfs_inobt_read_verify, #ifdef DEBUG .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 514eac913f1c..3a243d076950 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -382,7 +382,7 @@ xfs_inobp_check( } #endif -static void +void xfs_inode_buf_verify( struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 21b4de3df716..1a892114792f 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -554,6 +554,7 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, struct xfs_buf **, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); +void xfs_inode_buf_verify(struct xfs_buf *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 3998fd2a7949..0f18d412e3e8 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -396,7 +396,8 @@ xfs_bulkstat( if (xfs_inobt_maskn(chunkidx, nicluster) & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, - agbno, nbcluster); + agbno, nbcluster, + xfs_inode_buf_verify); } irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; From c6319198702350a2215a8c0cacd6cc4283728a1b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:50:13 +1100 Subject: [PATCH 57/78] xfs: verify dquot blocks as they are read from disk Add a dquot buffer verify callback function and pass it into the buffer read functions. This checks all the dquots in a buffer, but cannot completely verify the dquot ids are correct. Also, errors cannot be repaired, so an additional function is added to repair bad dquots in the buffer if such an error is detected in a context where repair is allowed. 
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_dquot.c | 119 ++++++++++++++++++++++++++++++++++++--------- fs/xfs/xfs_dquot.h | 1 + fs/xfs/xfs_qm.c | 3 +- 3 files changed, 99 insertions(+), 24 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index e95f800333d4..0ba0f0992d6e 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -360,6 +360,89 @@ xfs_qm_dqalloc( return (error); } +void +xfs_dquot_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + struct xfs_disk_dquot *ddq; + xfs_dqid_t id = 0; + int i; + + /* + * On the first read of the buffer, verify that each dquot is valid. + * We don't know what the id of the dquot is supposed to be, just that + * they should be increasing monotonically within the buffer. If the + * first id is corrupt, then it will fail on the second dquot in the + * buffer so corruptions could point to the wrong dquot in this case. + */ + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { + int error; + + ddq = &d[i].dd_diskdq; + + if (i == 0) + id = be32_to_cpu(ddq->d_id); + + error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, + "xfs_dquot_read_verify"); + if (error) { + XFS_CORRUPTION_ERROR("xfs_dquot_read_verify", + XFS_ERRLEVEL_LOW, mp, d); + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } + } + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +STATIC int +xfs_qm_dqrepair( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_dquot *dqp, + xfs_dqid_t firstid, + struct xfs_buf **bpp) +{ + int error; + struct xfs_disk_dquot *ddq; + struct xfs_dqblk *d; + int i; + + /* + * Read the buffer without verification so we get the corrupted + * buffer returned to us. 
+ */ + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0, bpp, NULL); + + if (error) { + ASSERT(*bpp == NULL); + return XFS_ERROR(error); + } + + ASSERT(xfs_buf_islocked(*bpp)); + d = (struct xfs_dqblk *)(*bpp)->b_addr; + + /* Do the actual repair of dquots in this buffer */ + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { + ddq = &d[i].dd_diskdq; + error = xfs_qm_dqcheck(mp, ddq, firstid + i, + dqp->dq_flags & XFS_DQ_ALLTYPES, + XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair"); + if (error) { + /* repair failed, we're screwed */ + xfs_trans_brelse(tp, *bpp); + return XFS_ERROR(EIO); + } + } + + return 0; +} + /* * Maps a dquot to the buffer containing its on-disk version. * This returns a ptr to the buffer containing the on-disk dquot @@ -378,7 +461,6 @@ xfs_qm_dqtobp( xfs_buf_t *bp; xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); xfs_mount_t *mp = dqp->q_mount; - xfs_disk_dquot_t *ddq; xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); xfs_trans_t *tp = (tpp ? *tpp : NULL); @@ -439,33 +521,24 @@ xfs_qm_dqtobp( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, - 0, &bp, NULL); - if (error || !bp) + 0, &bp, xfs_dquot_read_verify); + + if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { + xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * + mp->m_quotainfo->qi_dqperchunk; + ASSERT(bp == NULL); + error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp); + } + + if (error) { + ASSERT(bp == NULL); return XFS_ERROR(error); - } - - ASSERT(xfs_buf_islocked(bp)); - - /* - * calculate the location of the dquot inside the buffer. - */ - ddq = bp->b_addr + dqp->q_bufoffset; - - /* - * A simple sanity check in case we got a corrupted dquot... 
- */ - error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, - flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), - "dqtobp"); - if (error) { - if (!(flags & XFS_QMOPT_DQREPAIR)) { - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EIO); } } + ASSERT(xfs_buf_islocked(bp)); *O_bpp = bp; - *O_ddpp = ddq; + *O_ddpp = bp->b_addr + dqp->q_bufoffset; return (0); } diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 7d20af27346d..a08ba92d7da0 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -140,6 +140,7 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, uint, struct xfs_dquot **); +extern void xfs_dquot_read_verify(struct xfs_buf *bp); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 688f608b3668..a6dfb97490cc 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -892,7 +892,8 @@ xfs_qm_dqiter_bufs( while (blkcnt--) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), - mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + xfs_dquot_read_verify); if (error) break; From 4bb20a83a2a5ac4dcb62780c9950e47939956126 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:10 +1100 Subject: [PATCH 58/78] xfs: add verifier callback to directory read code Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_attr.c | 23 ++++++++++----------- fs/xfs/xfs_attr_leaf.c | 18 ++++++++--------- fs/xfs/xfs_da_btree.c | 44 ++++++++++++++++++++++++++--------------- fs/xfs/xfs_da_btree.h | 7 ++++--- fs/xfs/xfs_dir2_block.c | 23 ++++++++++----------- fs/xfs/xfs_dir2_leaf.c | 33 +++++++++++++++---------------- fs/xfs/xfs_dir2_node.c | 43 
+++++++++++++++++++--------------------- fs/xfs/xfs_file.c | 2 +- 8 files changed, 102 insertions(+), 91 deletions(-) diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 474c57a43cce..cd5a9cd0ded0 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -904,7 +904,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) dp = args->dp; args->blkno = 0; error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1032,7 +1032,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * remove the "old" attr from that block (neat, huh!) */ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, - &bp, XFS_ATTR_FORK); + &bp, XFS_ATTR_FORK, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1101,7 +1101,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) dp = args->dp; args->blkno = 0; error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) { return(error); } @@ -1159,7 +1159,7 @@ xfs_attr_leaf_get(xfs_da_args_t *args) args->blkno = 0; error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1190,7 +1190,8 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) trace_xfs_attr_leaf_list(context); context->cursor->blkno = 0; - error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); + error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK, + NULL); if (error) return XFS_ERROR(error); ASSERT(bp != NULL); @@ -1605,7 +1606,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) state->path.blk[0].bp = NULL; error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) goto out; ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) == @@ -1718,7 +1719,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) error = 
xfs_da_read_buf(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); + &blk->bp, XFS_ATTR_FORK, NULL); if (error) return(error); } else { @@ -1737,7 +1738,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) error = xfs_da_read_buf(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); + &blk->bp, XFS_ATTR_FORK, NULL); if (error) return(error); } else { @@ -1827,7 +1828,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) bp = NULL; if (cursor->blkno > 0) { error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK); + &bp, XFS_ATTR_FORK, NULL); if ((error != 0) && (error != EFSCORRUPTED)) return(error); if (bp) { @@ -1870,7 +1871,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) for (;;) { error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) return(error); if (unlikely(bp == NULL)) { @@ -1937,7 +1938,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); xfs_trans_brelse(NULL, bp); error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK); + &bp, XFS_ATTR_FORK, NULL); if (error) return(error); if (unlikely((bp == NULL))) { diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 4bfc732bc9c9..ba2b9a2cd236 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -871,7 +871,7 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) if (error) goto out; error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) goto out; ASSERT(bp1 != NULL); @@ -1642,7 +1642,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) if (blkno == 0) continue; error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, XFS_ATTR_FORK); + blkno, -1, &bp, XFS_ATTR_FORK, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -2519,7 +2519,7 @@ 
xfs_attr_leaf_clearflag(xfs_da_args_t *args) * Set up the operation. */ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) { return(error); } @@ -2584,7 +2584,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) * Set up the operation. */ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) { return(error); } @@ -2641,7 +2641,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) * Read the block containing the "old" attr */ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) { return(error); } @@ -2652,7 +2652,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) */ if (args->blkno2 != args->blkno) { error = xfs_da_read_buf(args->trans, args->dp, args->blkno2, - -1, &bp2, XFS_ATTR_FORK); + -1, &bp2, XFS_ATTR_FORK, NULL); if (error) { return(error); } @@ -2753,7 +2753,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * the extents in reverse order the extent containing * block 0 must still be there. */ - error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK, NULL); if (error) return(error); blkno = XFS_BUF_ADDR(bp); @@ -2839,7 +2839,7 @@ xfs_attr_node_inactive( * before we come back to this one. 
*/ error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp, - XFS_ATTR_FORK); + XFS_ATTR_FORK, NULL); if (error) return(error); if (child_bp) { @@ -2880,7 +2880,7 @@ xfs_attr_node_inactive( */ if ((i+1) < count) { error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, - &bp, XFS_ATTR_FORK); + &bp, XFS_ATTR_FORK, NULL); if (error) return(error); child_fsb = be32_to_cpu(node->btree[i+1].before); diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 4af8bad7068c..f9e9149de009 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -747,7 +747,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) child = be32_to_cpu(oldroot->btree[0].before); ASSERT(child != 0); error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, - args->whichfork); + args->whichfork, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -838,7 +838,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) if (blkno == 0) continue; error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, state->args->whichfork); + blkno, -1, &bp, state->args->whichfork, + NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1084,7 +1085,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) */ blk->blkno = blkno; error = xfs_da_read_buf(args->trans, args->dp, blkno, - -1, &blk->bp, args->whichfork); + -1, &blk->bp, args->whichfork, NULL); if (error) { blk->blkno = 0; state->path.active--; @@ -1247,7 +1248,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, if (old_info->back) { error = xfs_da_read_buf(args->trans, args->dp, be32_to_cpu(old_info->back), - -1, &bp, args->whichfork); + -1, &bp, args->whichfork, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1268,7 +1269,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, if (old_info->forw) { error = xfs_da_read_buf(args->trans, args->dp, be32_to_cpu(old_info->forw), - -1, &bp, args->whichfork); + -1, &bp, args->whichfork, 
NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1368,7 +1369,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, if (drop_info->back) { error = xfs_da_read_buf(args->trans, args->dp, be32_to_cpu(drop_info->back), - -1, &bp, args->whichfork); + -1, &bp, args->whichfork, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1385,7 +1386,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, if (drop_info->forw) { error = xfs_da_read_buf(args->trans, args->dp, be32_to_cpu(drop_info->forw), - -1, &bp, args->whichfork); + -1, &bp, args->whichfork, NULL); if (error) return(error); ASSERT(bp != NULL); @@ -1470,7 +1471,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, */ blk->blkno = blkno; error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, - &blk->bp, args->whichfork); + &blk->bp, args->whichfork, NULL); if (error) return(error); ASSERT(blk->bp != NULL); @@ -1733,7 +1734,8 @@ xfs_da_swap_lastblock( * Read the last block in the btree space. */ last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; - if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w))) + error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w, NULL); + if (error) return error; /* * Copy the last block into the dead buffer and log it. @@ -1759,7 +1761,9 @@ xfs_da_swap_lastblock( * If the moved block has a left sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->back))) { - if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w, + NULL); + if (error) goto done; sib_info = sib_buf->b_addr; if (unlikely( @@ -1780,7 +1784,9 @@ xfs_da_swap_lastblock( * If the moved block has a right sibling, fix up the pointers. 
*/ if ((sib_blkno = be32_to_cpu(dead_info->forw))) { - if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w, + NULL); + if (error) goto done; sib_info = sib_buf->b_addr; if (unlikely( @@ -1803,7 +1809,9 @@ xfs_da_swap_lastblock( * Walk down the tree looking for the parent of the moved block. */ for (;;) { - if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w, + NULL); + if (error) goto done; par_node = par_buf->b_addr; if (unlikely(par_node->hdr.info.magic != @@ -1853,7 +1861,9 @@ xfs_da_swap_lastblock( error = XFS_ERROR(EFSCORRUPTED); goto done; } - if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w, + NULL); + if (error) goto done; par_node = par_buf->b_addr; if (unlikely( @@ -2139,7 +2149,8 @@ xfs_da_read_buf( xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, - int whichfork) + int whichfork, + xfs_buf_iodone_t verifier) { struct xfs_buf *bp; struct xfs_buf_map map; @@ -2161,7 +2172,7 @@ xfs_da_read_buf( error = xfs_trans_read_buf_map(dp->i_mount, trans, dp->i_mount->m_ddev_targp, - mapp, nmap, 0, &bp, NULL); + mapp, nmap, 0, &bp, verifier); if (error) goto out_free; @@ -2217,7 +2228,8 @@ xfs_da_reada_buf( struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, - int whichfork) + int whichfork, + xfs_buf_iodone_t verifier) { xfs_daddr_t mappedbno = -1; struct xfs_buf_map map; diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 132adafb041e..bf8bfaa0d356 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -18,7 +18,6 @@ #ifndef __XFS_DA_BTREE_H__ #define __XFS_DA_BTREE_H__ -struct xfs_buf; struct xfs_bmap_free; struct xfs_inode; struct xfs_mount; @@ -226,9 +225,11 @@ int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, struct xfs_buf **bp, int whichfork); int 
xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, - struct xfs_buf **bpp, int whichfork); + struct xfs_buf **bpp, int whichfork, + xfs_buf_iodone_t verifier); xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, - xfs_dablk_t bno, int whichfork); + xfs_dablk_t bno, int whichfork, + xfs_buf_iodone_t verifier); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index e93ca8f054f4..53666ca6c953 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -97,10 +97,10 @@ xfs_dir2_block_addname( /* * Read the (one and only) directory block into dabuf bp. */ - if ((error = - xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, + XFS_DATA_FORK, NULL); + if (error) return error; - } ASSERT(bp != NULL); hdr = bp->b_addr; /* @@ -457,7 +457,7 @@ xfs_dir2_block_getdents( * Can't read the block, give up, else get dabuf in bp. */ error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1, - &bp, XFS_DATA_FORK); + &bp, XFS_DATA_FORK, NULL); if (error) return error; @@ -640,10 +640,10 @@ xfs_dir2_block_lookup_int( /* * Read the buffer, return error if we can't get it. */ - if ((error = - xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, + XFS_DATA_FORK, NULL); + if (error) return error; - } ASSERT(bp != NULL); hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); @@ -917,10 +917,11 @@ xfs_dir2_leaf_to_block( /* * Read the data block if we don't already have it, give up if it fails. 
*/ - if (dbp == NULL && - (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, - XFS_DATA_FORK))) { - return error; + if (!dbp) { + error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, + XFS_DATA_FORK, NULL); + if (error) + return error; } hdr = dbp->b_addr; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index bac86984e403..86e3dc1de0e7 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -315,10 +315,9 @@ xfs_dir2_leaf_addname( * Read the leaf block. */ error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK); - if (error) { + XFS_DATA_FORK, NULL); + if (error) return error; - } ASSERT(lbp != NULL); /* * Look up the entry by hash value and name. @@ -500,9 +499,9 @@ xfs_dir2_leaf_addname( * Just read that one in. */ else { - if ((error = - xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), - -1, &dbp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), + -1, &dbp, XFS_DATA_FORK, NULL); + if (error) { xfs_trans_brelse(tp, lbp); return error; } @@ -895,7 +894,7 @@ xfs_dir2_leaf_readbuf( error = xfs_da_read_buf(NULL, dp, map->br_startoff, map->br_blockcount >= mp->m_dirblkfsbs ? XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, - &bp, XFS_DATA_FORK); + &bp, XFS_DATA_FORK, NULL); /* * Should just skip over the data block instead of giving up. @@ -938,7 +937,7 @@ xfs_dir2_leaf_readbuf( xfs_da_reada_buf(NULL, dp, map[mip->ra_index].br_startoff + mip->ra_offset, - XFS_DATA_FORK); + XFS_DATA_FORK, NULL); mip->ra_current = i; } @@ -1376,7 +1375,7 @@ xfs_dir2_leaf_lookup_int( * Read the leaf block into the buffer. 
*/ error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK); + XFS_DATA_FORK, NULL); if (error) return error; *lbpp = lbp; @@ -1411,7 +1410,7 @@ xfs_dir2_leaf_lookup_int( xfs_trans_brelse(tp, dbp); error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, newdb), - -1, &dbp, XFS_DATA_FORK); + -1, &dbp, XFS_DATA_FORK, NULL); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1453,7 +1452,7 @@ xfs_dir2_leaf_lookup_int( xfs_trans_brelse(tp, dbp); error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, cidb), - -1, &dbp, XFS_DATA_FORK); + -1, &dbp, XFS_DATA_FORK, NULL); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1738,10 +1737,10 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, - XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, + XFS_DATA_FORK, NULL); + if (error) return error; - } leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); @@ -1864,10 +1863,10 @@ xfs_dir2_node_to_leaf( /* * Read the freespace block. 
*/ - if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, - XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, + XFS_DATA_FORK, NULL); + if (error) return error; - } free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); ASSERT(!free->hdr.firstdb); diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 6c7052406605..290c2b1016ab 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -399,7 +399,7 @@ xfs_dir2_leafn_lookup_for_addname( */ error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, newfdb), - -1, &curbp, XFS_DATA_FORK); + -1, &curbp, XFS_DATA_FORK, NULL); if (error) return error; free = curbp->b_addr; @@ -536,7 +536,7 @@ xfs_dir2_leafn_lookup_for_entry( } else { error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, newdb), - -1, &curbp, XFS_DATA_FORK); + -1, &curbp, XFS_DATA_FORK, NULL); if (error) return error; } @@ -915,10 +915,10 @@ xfs_dir2_leafn_remove( * read in the free block. */ fdb = xfs_dir2_db_to_fdb(mp, db); - if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), - -1, &fbp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), + -1, &fbp, XFS_DATA_FORK, NULL); + if (error) return error; - } free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); ASSERT(be32_to_cpu(free->hdr.firstdb) == @@ -1169,11 +1169,10 @@ xfs_dir2_leafn_toosmall( /* * Read the sibling leaf block. */ - if ((error = - xfs_da_read_buf(state->args->trans, state->args->dp, blkno, - -1, &bp, XFS_DATA_FORK))) { + error = xfs_da_read_buf(state->args->trans, state->args->dp, + blkno, -1, &bp, XFS_DATA_FORK, NULL); + if (error) return error; - } ASSERT(bp != NULL); /* * Count bytes in the two blocks combined. @@ -1454,14 +1453,13 @@ xfs_dir2_node_addname_int( * This should be really rare, so there's no reason * to avoid it. 
*/ - if ((error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), -2, &fbp, - XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, + xfs_dir2_db_to_da(mp, fbno), -2, + &fbp, XFS_DATA_FORK, NULL); + if (error) return error; - } - if (unlikely(fbp == NULL)) { + if (!fbp) continue; - } free = fbp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); findex = 0; @@ -1520,9 +1518,9 @@ xfs_dir2_node_addname_int( * that was just allocated. */ fbno = xfs_dir2_db_to_fdb(mp, dbno); - if (unlikely(error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), -2, &fbp, - XFS_DATA_FORK))) + error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno), -2, + &fbp, XFS_DATA_FORK, NULL); + if (error) return error; /* @@ -1631,7 +1629,7 @@ xfs_dir2_node_addname_int( * Read the data block in. */ error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), - -1, &dbp, XFS_DATA_FORK); + -1, &dbp, XFS_DATA_FORK, NULL); if (error) return error; hdr = dbp->b_addr; @@ -1917,11 +1915,10 @@ xfs_dir2_node_trim_free( /* * Read the freespace block. */ - if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, - XFS_DATA_FORK))) { + error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, + XFS_DATA_FORK, NULL); + if (error) return error; - } - /* * There can be holes in freespace. If fo is a hole, there's * nothing to do. 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c42f99e71f14..f6dab7da7bcc 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -891,7 +891,7 @@ xfs_dir_open( */ mode = xfs_ilock_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK); + xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK, NULL); xfs_iunlock(ip, mode); return 0; } From 20f7e9f3726a27cccade65c28265eef8ca50eecb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:11 +1100 Subject: [PATCH 59/78] xfs: factor dir2 block read operations In preparation for verifying dir2 block format buffers, factor the read operations out of the block operations (lookup, addname, getdents) and some of the additional logic to make it easier to understand and modify the code. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_block.c | 386 ++++++++++++++++++++++------------------ 1 file changed, 209 insertions(+), 177 deletions(-) diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 53666ca6c953..25ce409487be 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -56,6 +56,178 @@ xfs_dir_startup(void) xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } +static int +xfs_dir2_block_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dp->i_mount; + + return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, + XFS_DATA_FORK, NULL); +} + +static void +xfs_dir2_block_need_space( + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_block_tail *btp, + struct xfs_dir2_leaf_entry *blp, + __be16 **tagpp, + struct xfs_dir2_data_unused **dupp, + struct xfs_dir2_data_unused **enddupp, + int *compact, + int len) +{ + struct xfs_dir2_data_free *bf; + __be16 *tagp = NULL; + struct xfs_dir2_data_unused *dup = NULL; + struct xfs_dir2_data_unused *enddup = NULL; + + *compact = 0; + bf = hdr->bestfree; + + /* + * If there are stale entries
we'll use one for the leaf. + */ + if (btp->stale) { + if (be16_to_cpu(bf[0].length) >= len) { + /* + * The biggest entry enough to avoid compaction. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + goto out; + } + + /* + * Will need to compact to make this work. + * Tag just before the first leaf entry. + */ + *compact = 1; + tagp = (__be16 *)blp - 1; + + /* Data object just before the first leaf entry. */ + dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); + + /* + * If it's not free then the data will go where the + * leaf data starts now, if it works at all. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * + (uint)sizeof(*blp) < len) + dup = NULL; + } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) + dup = NULL; + else + dup = (xfs_dir2_data_unused_t *)blp; + goto out; + } + + /* + * no stale entries, so just use free space. + * Tag just before the first leaf entry. + */ + tagp = (__be16 *)blp - 1; + + /* Data object just before the first leaf entry. */ + enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); + + /* + * If it's not free then can't do this add without cleaning up: + * the space before the first leaf entry needs to be free so it + * can be expanded to hold the pointer to the new entry. + */ + if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + /* + * Check out the biggest freespace and see if it's the same one. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + if (dup != enddup) { + /* + * Not the same free entry, just check its length. + */ + if (be16_to_cpu(dup->length) < len) + dup = NULL; + goto out; + } + + /* + * It is the biggest freespace, can it hold the leaf too? + */ + if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { + /* + * Yes, use the second-largest entry instead if it works. 
+ */ + if (be16_to_cpu(bf[1].length) >= len) + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[1].offset)); + else + dup = NULL; + } + } +out: + *tagpp = tagp; + *dupp = dup; + *enddupp = enddup; +} + +/* + * compact the leaf entries. + * Leave the highest-numbered stale entry stale. + * XXX should be the one closest to mid but mid is not yet computed. + */ +static void +xfs_dir2_block_compact( + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_block_tail *btp, + struct xfs_dir2_leaf_entry *blp, + int *needlog, + int *lfloghigh, + int *lfloglow) +{ + int fromidx; /* source leaf index */ + int toidx; /* target leaf index */ + int needscan = 0; + int highstale; /* high stale index */ + + fromidx = toidx = be32_to_cpu(btp->count) - 1; + highstale = *lfloghigh = -1; + for (; fromidx >= 0; fromidx--) { + if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { + if (highstale == -1) + highstale = toidx; + else { + if (*lfloghigh == -1) + *lfloghigh = toidx; + continue; + } + } + if (fromidx < toidx) + blp[toidx] = blp[fromidx]; + toidx--; + } + *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); + *lfloghigh -= be32_to_cpu(btp->stale) - 1; + be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); + xfs_dir2_data_make_free(tp, bp, + (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), + (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), + needlog, &needscan); + blp += be32_to_cpu(btp->stale) - 1; + btp->stale = cpu_to_be32(1); + /* + * If we now need to rebuild the bestfree map, do so. + * This needs to happen before the next call to use_free. + */ + if (needscan) + xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog); +} + /* * Add an entry to a block directory. 
*/ @@ -63,7 +235,6 @@ int /* error */ xfs_dir2_block_addname( xfs_da_args_t *args) /* directory op arguments */ { - xfs_dir2_data_free_t *bf; /* bestfree table in block */ xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ struct xfs_buf *bp; /* buffer for block */ @@ -94,134 +265,44 @@ xfs_dir2_block_addname( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the (one and only) directory block into dabuf bp. - */ - error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, - XFS_DATA_FORK, NULL); + + /* Read the (one and only) directory block into bp. */ + error = xfs_dir2_block_read(tp, dp, &bp); if (error) return error; - ASSERT(bp != NULL); - hdr = bp->b_addr; - /* - * Check the magic number, corrupted if wrong. - */ - if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) { - XFS_CORRUPTION_ERROR("xfs_dir2_block_addname", - XFS_ERRLEVEL_LOW, mp, hdr); - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); - } + len = xfs_dir2_data_entsize(args->namelen); + /* * Set up pointers to parts of the block. */ - bf = hdr->bestfree; + hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(mp, hdr); blp = xfs_dir2_block_leaf_p(btp); + /* - * No stale entries? Need space for entry and new leaf. + * Find out if we can reuse stale entries or whether we need extra + * space for entry and new leaf. */ - if (!btp->stale) { - /* - * Tag just before the first leaf entry. - */ - tagp = (__be16 *)blp - 1; - /* - * Data object just before the first leaf entry. - */ - enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); - /* - * If it's not free then can't do this add without cleaning up: - * the space before the first leaf entry needs to be free so it - * can be expanded to hold the pointer to the new entry. - */ - if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG) - dup = enddup = NULL; - /* - * Check out the biggest freespace and see if it's the same one. 
- */ - else { - dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(bf[0].offset)); - if (dup == enddup) { - /* - * It is the biggest freespace, is it too small - * to hold the new leaf too? - */ - if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { - /* - * Yes, we use the second-largest - * entry instead if it works. - */ - if (be16_to_cpu(bf[1].length) >= len) - dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + - be16_to_cpu(bf[1].offset)); - else - dup = NULL; - } - } else { - /* - * Not the same free entry, - * just check its length. - */ - if (be16_to_cpu(dup->length) < len) { - dup = NULL; - } - } - } - compact = 0; - } + xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup, + &enddup, &compact, len); + /* - * If there are stale entries we'll use one for the leaf. - * Is the biggest entry enough to avoid compaction? + * Done everything we need for a space check now. */ - else if (be16_to_cpu(bf[0].length) >= len) { - dup = (xfs_dir2_data_unused_t *) - ((char *)hdr + be16_to_cpu(bf[0].offset)); - compact = 0; - } - /* - * Will need to compact to make this work. - */ - else { - /* - * Tag just before the first leaf entry. - */ - tagp = (__be16 *)blp - 1; - /* - * Data object just before the first leaf entry. - */ - dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); - /* - * If it's not free then the data will go where the - * leaf data starts now, if it works at all. - */ - if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * - (uint)sizeof(*blp) < len) - dup = NULL; - } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) - dup = NULL; - else - dup = (xfs_dir2_data_unused_t *)blp; - compact = 1; - } - /* - * If this isn't a real add, we're done with the buffer. 
- */ - if (args->op_flags & XFS_DA_OP_JUSTCHECK) + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { xfs_trans_brelse(tp, bp); + if (!dup) + return XFS_ERROR(ENOSPC); + return 0; + } + /* * If we don't have space for the new entry & leaf ... */ if (!dup) { - /* - * Not trying to actually do anything, or don't have - * a space reservation: return no-space. - */ - if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) + /* Don't have a space reservation: return no-space. */ + if (args->total == 0) return XFS_ERROR(ENOSPC); /* * Convert to the next larger format. @@ -232,65 +313,24 @@ xfs_dir2_block_addname( return error; return xfs_dir2_leaf_addname(args); } - /* - * Just checking, and it would work, so say so. - */ - if (args->op_flags & XFS_DA_OP_JUSTCHECK) - return 0; + needlog = needscan = 0; + /* * If need to compact the leaf entries, do it now. - * Leave the highest-numbered stale entry stale. - * XXX should be the one closest to mid but mid is not yet computed. - */ - if (compact) { - int fromidx; /* source leaf index */ - int toidx; /* target leaf index */ - - for (fromidx = toidx = be32_to_cpu(btp->count) - 1, - highstale = lfloghigh = -1; - fromidx >= 0; - fromidx--) { - if (blp[fromidx].address == - cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { - if (highstale == -1) - highstale = toidx; - else { - if (lfloghigh == -1) - lfloghigh = toidx; - continue; - } - } - if (fromidx < toidx) - blp[toidx] = blp[fromidx]; - toidx--; - } - lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); - lfloghigh -= be32_to_cpu(btp->stale) - 1; - be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); - xfs_dir2_data_make_free(tp, bp, - (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), - (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), - &needlog, &needscan); - blp += be32_to_cpu(btp->stale) - 1; - btp->stale = cpu_to_be32(1); - /* - * If we now need to rebuild the bestfree map, do so. - * This needs to happen before the next call to use_free. 
- */ - if (needscan) { - xfs_dir2_data_freescan(mp, hdr, &needlog); - needscan = 0; - } - } - /* - * Set leaf logging boundaries to impossible state. - * For the no-stale case they're set explicitly. */ + if (compact) + xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog, + &lfloghigh, &lfloglow); else if (btp->stale) { + /* + * Set leaf logging boundaries to impossible state. + * For the no-stale case they're set explicitly. + */ lfloglow = be32_to_cpu(btp->count); lfloghigh = -1; } + /* * Find the slot that's first lower than our hash value, -1 if none. */ @@ -450,18 +490,13 @@ xfs_dir2_block_getdents( /* * If the block number in the offset is out of range, we're done. */ - if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) { + if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) return 0; - } - /* - * Can't read the block, give up, else get dabuf in bp. - */ - error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1, - &bp, XFS_DATA_FORK, NULL); + + error = xfs_dir2_block_read(NULL, dp, &bp); if (error) return error; - ASSERT(bp != NULL); /* * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. @@ -637,14 +672,11 @@ xfs_dir2_block_lookup_int( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the buffer, return error if we can't get it. - */ - error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, - XFS_DATA_FORK, NULL); + + error = xfs_dir2_block_read(tp, dp, &bp); if (error) return error; - ASSERT(bp != NULL); + hdr = bp->b_addr; xfs_dir2_data_check(dp, bp); btp = xfs_dir2_block_tail_p(mp, hdr); From 82025d7f79148fe66a1594a0ebe4ab38152cf9e6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:12 +1100 Subject: [PATCH 60/78] xfs: verify dir2 block format buffers Add a dir2 block format read verifier. To fully verify every block when read, call xfs_dir2_data_check() on them. 
Change xfs_dir2_data_check() to do runtime checking, convert ASSERT() checks to XFS_WANT_CORRUPTED_RETURN(), which will trigger an ASSERT failure on debug kernels, but on production kernels will dump an error to dmesg and return EFSCORRUPTED to the caller. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_block.c | 22 ++++++++++++- fs/xfs/xfs_dir2_data.c | 73 +++++++++++++++++++++++++---------------- fs/xfs/xfs_dir2_priv.h | 4 ++- 3 files changed, 68 insertions(+), 31 deletions(-) diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 25ce409487be..57351b868861 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -56,6 +56,26 @@ xfs_dir_startup(void) xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); } +static void +xfs_dir2_block_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; + + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + static int xfs_dir2_block_read( struct xfs_trans *tp, @@ -65,7 +85,7 @@ xfs_dir2_block_read( struct xfs_mount *mp = dp->i_mount; return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, - XFS_DATA_FORK, NULL); + XFS_DATA_FORK, xfs_dir2_block_verify); } static void diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 44ffd4d6bc91..cb117234e32e 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -34,14 +34,13 @@ STATIC xfs_dir2_data_free_t * xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); -#ifdef DEBUG /* * Check the consistency of the data block. * The input can also be a block-format directory. - * Pop an assert if we find anything bad. 
+ * Return 0 is the buffer is good, otherwise an error. */ -void -xfs_dir2_data_check( +int +__xfs_dir2_data_check( struct xfs_inode *dp, /* incore inode pointer */ struct xfs_buf *bp) /* data block's buffer */ { @@ -64,18 +63,23 @@ xfs_dir2_data_check( int stale; /* count of stale leaves */ struct xfs_name name; - mp = dp->i_mount; + mp = bp->b_target->bt_mount; hdr = bp->b_addr; bf = hdr->bestfree; p = (char *)(hdr + 1); - if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): btp = xfs_dir2_block_tail_p(mp, hdr); lep = xfs_dir2_block_leaf_p(btp); endp = (char *)lep; - } else { - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); + break; + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): endp = (char *)hdr + mp->m_dirblksize; + break; + default: + XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp); + return EFSCORRUPTED; } count = lastfree = freeseen = 0; @@ -83,19 +87,22 @@ xfs_dir2_data_check( * Account for zero bestfree entries. */ if (!bf[0].length) { - ASSERT(!bf[0].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[0].offset); freeseen |= 1 << 0; } if (!bf[1].length) { - ASSERT(!bf[1].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[1].offset); freeseen |= 1 << 1; } if (!bf[2].length) { - ASSERT(!bf[2].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[2].offset); freeseen |= 1 << 2; } - ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); - ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); + + XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >= + be16_to_cpu(bf[1].length)); + XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >= + be16_to_cpu(bf[2].length)); /* * Loop over the data/unused entries. */ @@ -107,17 +114,20 @@ xfs_dir2_data_check( * doesn't need to be there. 
*/ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - ASSERT(lastfree == 0); - ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == - (char *)dup - (char *)hdr); + XFS_WANT_CORRUPTED_RETURN(lastfree == 0); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == + (char *)dup - (char *)hdr); dfp = xfs_dir2_data_freefind(hdr, dup); if (dfp) { i = (int)(dfp - bf); - ASSERT((freeseen & (1 << i)) == 0); + XFS_WANT_CORRUPTED_RETURN( + (freeseen & (1 << i)) == 0); freeseen |= 1 << i; } else { - ASSERT(be16_to_cpu(dup->length) <= - be16_to_cpu(bf[2].length)); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(dup->length) <= + be16_to_cpu(bf[2].length)); } p += be16_to_cpu(dup->length); lastfree = 1; @@ -130,10 +140,12 @@ xfs_dir2_data_check( * The linear search is crude but this is DEBUG code. */ dep = (xfs_dir2_data_entry_t *)p; - ASSERT(dep->namelen != 0); - ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); - ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == - (char *)dep - (char *)hdr); + XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0); + XFS_WANT_CORRUPTED_RETURN( + !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == + (char *)dep - (char *)hdr); count++; lastfree = 0; if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { @@ -148,27 +160,30 @@ xfs_dir2_data_check( be32_to_cpu(lep[i].hashval) == hash) break; } - ASSERT(i < be32_to_cpu(btp->count)); + XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); } p += xfs_dir2_data_entsize(dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. 
*/ - ASSERT(freeseen == 7); + XFS_WANT_CORRUPTED_RETURN(freeseen == 7); if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { if (lep[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; if (i > 0) - ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); + XFS_WANT_CORRUPTED_RETURN( + be32_to_cpu(lep[i].hashval) >= + be32_to_cpu(lep[i - 1].hashval)); } - ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); - ASSERT(stale == be32_to_cpu(btp->stale)); + XFS_WANT_CORRUPTED_RETURN(count == + be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); + XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale)); } + return 0; } -#endif /* * Given a data block and an unused entry from that block, diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 3523d3e15aa8..93b8f66ae788 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -41,10 +41,12 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, /* xfs_dir2_data.c */ #ifdef DEBUG -extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +#define xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp); #else #define xfs_dir2_data_check(dp,bp) #endif +extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); + extern struct xfs_dir2_data_free * xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_unused *dup, int *loghead); From 2025207ca6738a1217126ef14af9d104433f9824 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:13 +1100 Subject: [PATCH 61/78] xfs: factor dir2 free block reading Also factor out the updating of the free block when removing entries from leaf blocks, and add a verifier callback for reads. 
Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_leaf.c | 3 +- fs/xfs/xfs_dir2_node.c | 218 ++++++++++++++++++++++++++--------------- fs/xfs/xfs_dir2_priv.h | 2 + 3 files changed, 143 insertions(+), 80 deletions(-) diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 86e3dc1de0e7..6c1359dc9898 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -1863,8 +1863,7 @@ xfs_dir2_node_to_leaf( /* * Read the freespace block. */ - error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, - XFS_DATA_FORK, NULL); + error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp); if (error) return error; free = fbp->b_addr; diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 290c2b1016ab..d7f899dfbff5 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -55,6 +55,57 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, static int xfs_dir2_node_addname_int(xfs_da_args_t *args, xfs_da_state_blk_t *fblk); +static void +xfs_dir2_free_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_free_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC); + if (!block_ok) { + XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic", + XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +static int +__xfs_dir2_free_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, xfs_dir2_free_verify); +} + +int +xfs_dir2_free_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf **bpp) +{ + return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp); +} + +static int +xfs_dir2_free_try_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + 
xfs_dablk_t fbno, + struct xfs_buf **bpp) +{ + return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp); +} + /* * Log entries from a freespace block. */ @@ -394,12 +445,10 @@ xfs_dir2_leafn_lookup_for_addname( */ if (curbp) xfs_trans_brelse(tp, curbp); - /* - * Read the free block. - */ - error = xfs_da_read_buf(tp, dp, + + error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, newfdb), - -1, &curbp, XFS_DATA_FORK, NULL); + &curbp); if (error) return error; free = curbp->b_addr; @@ -825,6 +874,77 @@ xfs_dir2_leafn_rebalance( } } +static int +xfs_dir2_data_block_free( + xfs_da_args_t *args, + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_free *free, + xfs_dir2_db_t fdb, + int findex, + struct xfs_buf *fbp, + int longest) +{ + struct xfs_trans *tp = args->trans; + int logfree = 0; + + if (!hdr) { + /* One less used entry in the free table. */ + be32_add_cpu(&free->hdr.nused, -1); + xfs_dir2_free_log_header(tp, fbp); + + /* + * If this was the last entry in the table, we can trim the + * table size back. There might be other entries at the end + * referring to non-existent data blocks, get those too. + */ + if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { + int i; /* free entry index */ + + for (i = findex - 1; i >= 0; i--) { + if (free->bests[i] != cpu_to_be16(NULLDATAOFF)) + break; + } + free->hdr.nvalid = cpu_to_be32(i + 1); + logfree = 0; + } else { + /* Not the last entry, just punch it out. */ + free->bests[findex] = cpu_to_be16(NULLDATAOFF); + logfree = 1; + } + /* + * If there are no useful entries left in the block, + * get rid of the block if we can. + */ + if (!free->hdr.nused) { + int error; + + error = xfs_dir2_shrink_inode(args, fdb, fbp); + if (error == 0) { + fbp = NULL; + logfree = 0; + } else if (error != ENOSPC || args->total != 0) + return error; + /* + * It's possible to get ENOSPC if there is no + * space reservation. In this case some one + * else will eventually get rid of this block. 
+ */ + } + } else { + /* + * Data block is not empty, just set the free entry to the new + * value. + */ + free->bests[findex] = cpu_to_be16(longest); + logfree = 1; + } + + /* Log the free entry that changed, unless we got rid of it. */ + if (logfree) + xfs_dir2_free_log_bests(tp, fbp, findex, findex); + return 0; +} + /* * Remove an entry from a node directory. * This removes the leaf entry and the data entry, @@ -908,15 +1028,14 @@ xfs_dir2_leafn_remove( xfs_dir2_db_t fdb; /* freeblock block number */ int findex; /* index in freeblock entries */ xfs_dir2_free_t *free; /* freeblock structure */ - int logfree; /* need to log free entry */ /* * Convert the data block number to a free block, * read in the free block. */ fdb = xfs_dir2_db_to_fdb(mp, db); - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), - -1, &fbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb), + &fbp); if (error) return error; free = fbp->b_addr; @@ -954,68 +1073,12 @@ xfs_dir2_leafn_remove( * If we got rid of the data block, we can eliminate that entry * in the free block. */ - if (hdr == NULL) { - /* - * One less used entry in the free table. - */ - be32_add_cpu(&free->hdr.nused, -1); - xfs_dir2_free_log_header(tp, fbp); - /* - * If this was the last entry in the table, we can - * trim the table size back. There might be other - * entries at the end referring to non-existent - * data blocks, get those too. - */ - if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { - int i; /* free entry index */ - - for (i = findex - 1; - i >= 0 && - free->bests[i] == cpu_to_be16(NULLDATAOFF); - i--) - continue; - free->hdr.nvalid = cpu_to_be32(i + 1); - logfree = 0; - } - /* - * Not the last entry, just punch it out. - */ - else { - free->bests[findex] = cpu_to_be16(NULLDATAOFF); - logfree = 1; - } - /* - * If there are no useful entries left in the block, - * get rid of the block if we can. 
- */ - if (!free->hdr.nused) { - error = xfs_dir2_shrink_inode(args, fdb, fbp); - if (error == 0) { - fbp = NULL; - logfree = 0; - } else if (error != ENOSPC || args->total != 0) - return error; - /* - * It's possible to get ENOSPC if there is no - * space reservation. In this case some one - * else will eventually get rid of this block. - */ - } - } - /* - * Data block is not empty, just set the free entry to - * the new value. - */ - else { - free->bests[findex] = cpu_to_be16(longest); - logfree = 1; - } - /* - * Log the free entry that changed, unless we got rid of it. - */ - if (logfree) - xfs_dir2_free_log_bests(tp, fbp, findex, findex); + error = xfs_dir2_data_block_free(args, hdr, free, + fdb, findex, fbp, longest); + if (error) + return error; } + xfs_dir2_leafn_check(dp, bp); /* * Return indication of whether this leaf block is empty enough @@ -1453,9 +1516,9 @@ xfs_dir2_node_addname_int( * This should be really rare, so there's no reason * to avoid it. */ - error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), -2, - &fbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(mp, fbno), + &fbp); if (error) return error; if (!fbp) @@ -1518,8 +1581,9 @@ xfs_dir2_node_addname_int( * that was just allocated. */ fbno = xfs_dir2_db_to_fdb(mp, dbno); - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno), -2, - &fbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(mp, fbno), + &fbp); if (error) return error; @@ -1915,17 +1979,15 @@ xfs_dir2_node_trim_free( /* * Read the freespace block. */ - error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, - XFS_DATA_FORK, NULL); + error = xfs_dir2_free_try_read(tp, dp, fo, &bp); if (error) return error; /* * There can be holes in freespace. If fo is a hole, there's * nothing to do. 
*/ - if (bp == NULL) { + if (!bp) return 0; - } free = bp->b_addr; ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); /* diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 93b8f66ae788..263a63287910 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -117,6 +117,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args); extern int xfs_dir2_node_replace(struct xfs_da_args *args); extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, int *rvalp); +extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, struct xfs_buf **bpp); /* xfs_dir2_sf.c */ extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp); From e4813572640e27d3a5cce3f06751a9f54f77aaa5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:14 +1100 Subject: [PATCH 62/78] xfs: factor out dir2 data block reading And add a verifier callback function while there. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_block.c | 3 +-- fs/xfs/xfs_dir2_data.c | 32 ++++++++++++++++++++++++++++++++ fs/xfs/xfs_dir2_leaf.c | 38 +++++++++++++++++--------------------- fs/xfs/xfs_dir2_node.c | 8 ++++---- fs/xfs/xfs_dir2_priv.h | 2 ++ 5 files changed, 56 insertions(+), 27 deletions(-) diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 57351b868861..ca03b109772d 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -970,8 +970,7 @@ xfs_dir2_leaf_to_block( * Read the data block if we don't already have it, give up if it fails. 
*/ if (!dbp) { - error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, - XFS_DATA_FORK, NULL); + error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp); if (error) return error; } diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index cb117234e32e..0ef04f1bf511 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -185,6 +185,38 @@ __xfs_dir2_data_check( return 0; } +static void +xfs_dir2_data_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC); + block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; + + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +int +xfs_dir2_data_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mapped_bno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, + XFS_DATA_FORK, xfs_dir2_data_verify); +} + /* * Given a data block and an unused entry from that block, * return the bestfree entry if any that corresponds to it. diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 6c1359dc9898..0fdf765c917f 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -493,14 +493,14 @@ xfs_dir2_leaf_addname( hdr = dbp->b_addr; bestsp[use_block] = hdr->bestfree[0].length; grown = 1; - } - /* - * Already had space in some data block. - * Just read that one in. - */ - else { - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), - -1, &dbp, XFS_DATA_FORK, NULL); + } else { + /* + * Already had space in some data block. + * Just read that one in. 
+ */ + error = xfs_dir2_data_read(tp, dp, + xfs_dir2_db_to_da(mp, use_block), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -508,7 +508,6 @@ xfs_dir2_leaf_addname( hdr = dbp->b_addr; grown = 0; } - xfs_dir2_data_check(dp, dbp); /* * Point to the biggest freespace in our data block. */ @@ -891,10 +890,9 @@ xfs_dir2_leaf_readbuf( * Read the directory block starting at the first mapping. */ mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); - error = xfs_da_read_buf(NULL, dp, map->br_startoff, + error = xfs_dir2_data_read(NULL, dp, map->br_startoff, map->br_blockcount >= mp->m_dirblkfsbs ? - XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, - &bp, XFS_DATA_FORK, NULL); + XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp); /* * Should just skip over the data block instead of giving up. @@ -1408,14 +1406,13 @@ xfs_dir2_leaf_lookup_int( if (newdb != curdb) { if (dbp) xfs_trans_brelse(tp, dbp); - error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, newdb), - -1, &dbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_data_read(tp, dp, + xfs_dir2_db_to_da(mp, newdb), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; } - xfs_dir2_data_check(dp, dbp); curdb = newdb; } /* @@ -1450,9 +1447,9 @@ xfs_dir2_leaf_lookup_int( ASSERT(cidb != -1); if (cidb != curdb) { xfs_trans_brelse(tp, dbp); - error = xfs_da_read_buf(tp, dp, - xfs_dir2_db_to_da(mp, cidb), - -1, &dbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_data_read(tp, dp, + xfs_dir2_db_to_da(mp, cidb), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1737,8 +1734,7 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. 
*/ - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, - XFS_DATA_FORK, NULL); + error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp); if (error) return error; diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index d7f899dfbff5..67b811c17eaa 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -583,9 +583,9 @@ xfs_dir2_leafn_lookup_for_entry( ASSERT(state->extravalid); curbp = state->extrablk.bp; } else { - error = xfs_da_read_buf(tp, dp, + error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, newdb), - -1, &curbp, XFS_DATA_FORK, NULL); + -1, &curbp); if (error) return error; } @@ -1692,8 +1692,8 @@ xfs_dir2_node_addname_int( /* * Read the data block in. */ - error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), - -1, &dbp, XFS_DATA_FORK, NULL); + error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno), + -1, &dbp); if (error) return error; hdr = dbp->b_addr; diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 263a63287910..71ec82839107 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -46,6 +46,8 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, #define xfs_dir2_data_check(dp,bp) #endif extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); extern struct xfs_dir2_data_free * xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, From e6f7667c4eef42b6f5bc6cdeb31d0bab62fe5f79 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:15 +1100 Subject: [PATCH 63/78] xfs: factor dir2 leaf read Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_dir2_leaf.c | 73 +++++++++++++++++++++++++++++++++++------- fs/xfs/xfs_dir2_node.c | 6 ++-- fs/xfs/xfs_dir2_priv.h | 2 ++ 3 files changed, 67 insertions(+), 14 deletions(-) diff --git 
a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 0fdf765c917f..97408e3287ed 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -48,6 +48,62 @@ static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp, int first, int last); static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); +static void +xfs_dir2_leaf_verify( + struct xfs_buf *bp, + __be16 magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_leaf_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->info.magic == magic; + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +static void +xfs_dir2_leaf1_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); +} + +static void +xfs_dir2_leafn_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); +} + +static int +xfs_dir2_leaf_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, xfs_dir2_leaf1_verify); +} + +int +xfs_dir2_leafn_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, xfs_dir2_leafn_verify); +} /* * Convert a block form directory to a leaf form directory. @@ -311,14 +367,11 @@ xfs_dir2_leaf_addname( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the leaf block. - */ - error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK, NULL); + + error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); if (error) return error; - ASSERT(lbp != NULL); + /* * Look up the entry by hash value and name. 
* We know it's not there, our caller has already done a lookup. @@ -1369,13 +1422,11 @@ xfs_dir2_leaf_lookup_int( dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the leaf block into the buffer. - */ - error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK, NULL); + + error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); if (error) return error; + *lbpp = lbp; leaf = lbp->b_addr; xfs_dir2_leaf_check(dp, lbp); diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 67b811c17eaa..7c6f95697e28 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -1232,11 +1232,11 @@ xfs_dir2_leafn_toosmall( /* * Read the sibling leaf block. */ - error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, XFS_DATA_FORK, NULL); + error = xfs_dir2_leafn_read(state->args->trans, state->args->dp, + blkno, -1, &bp); if (error) return error; - ASSERT(bp != NULL); + /* * Count bytes in the two blocks combined. */ diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 71ec82839107..4560825d099c 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -70,6 +70,8 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ +extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_buf *dbp); extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); From ad14c33ac862601c4c22755ed3b59f1906b134e5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:16 +1100 Subject: [PATCH 64/78] xfs: factor and verify attr leaf reads Some reads are not converted yet because it isn't obvious ahead of time what the format of the block is going to be. 
Need to determine how to tell if the first block in the tree is a node or leaf format block. That will be done in later patches. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_attr.c | 70 ++++++++++--------------------------- fs/xfs/xfs_attr_leaf.c | 78 ++++++++++++++++++++++++------------------ fs/xfs/xfs_attr_leaf.h | 3 ++ 3 files changed, 66 insertions(+), 85 deletions(-) diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index cd5a9cd0ded0..d644915367e3 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -903,11 +903,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); + return error; /* * Look up the given attribute in the leaf block. Figure out if @@ -1031,12 +1029,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Read in the block containing the "old" attr, then * remove the "old" attr from that block (neat, huh!) */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, - &bp, XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, + -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); - (void)xfs_attr_leaf_remove(bp, args); + return error; + + xfs_attr_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. 
@@ -1100,20 +1098,17 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) */ dp = args->dp; args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); - if (error) { - return(error); - } + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) + return error; - ASSERT(bp != NULL); error = xfs_attr_leaf_lookup_int(bp, args); if (error == ENOATTR) { xfs_trans_brelse(args->trans, bp); return(error); } - (void)xfs_attr_leaf_remove(bp, args); + xfs_attr_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. @@ -1158,11 +1153,9 @@ xfs_attr_leaf_get(xfs_da_args_t *args) trace_xfs_attr_leaf_get(args); args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); + return error; error = xfs_attr_leaf_lookup_int(bp, args); if (error != EEXIST) { @@ -1183,25 +1176,15 @@ xfs_attr_leaf_get(xfs_da_args_t *args) STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context) { - xfs_attr_leafblock_t *leaf; int error; struct xfs_buf *bp; trace_xfs_attr_leaf_list(context); context->cursor->blkno = 0; - error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK, - NULL); + error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp); if (error) return XFS_ERROR(error); - ASSERT(bp != NULL); - leaf = bp->b_addr; - if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { - XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW, - context->dp->i_mount, leaf); - xfs_trans_brelse(NULL, bp); - return XFS_ERROR(EFSCORRUPTED); - } error = xfs_attr_leaf_list_int(bp, context); xfs_trans_brelse(NULL, bp); @@ -1605,12 +1588,9 @@ xfs_attr_node_removename(xfs_da_args_t *args) ASSERT(state->path.blk[0].bp); state->path.blk[0].bp = NULL; - error = xfs_da_read_buf(args->trans, args->dp, 
0, -1, &bp, - XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp); if (error) goto out; - ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) == - cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { xfs_bmap_init(args->flist, args->firstblock); @@ -1920,14 +1900,6 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ for (;;) { leaf = bp->b_addr; - if (unlikely(leaf->hdr.info.magic != - cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { - XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount, leaf); - xfs_trans_brelse(NULL, bp); - return(XFS_ERROR(EFSCORRUPTED)); - } error = xfs_attr_leaf_list_int(bp, context); if (error) { xfs_trans_brelse(NULL, bp); @@ -1937,16 +1909,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) break; cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); xfs_trans_brelse(NULL, bp); - error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1, + &bp); if (error) - return(error); - if (unlikely((bp == NULL))) { - XFS_ERROR_REPORT("xfs_attr_node_list(5)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount); - return(XFS_ERROR(EFSCORRUPTED)); - } + return error; } xfs_trans_brelse(NULL, bp); return(0); diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index ba2b9a2cd236..357971536d50 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -88,6 +88,36 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, xfs_mount_t *mp); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); +static void +xfs_attr_leaf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_attr_leaf_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC); + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, 
hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +int +xfs_attr_leaf_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + XFS_ATTR_FORK, xfs_attr_leaf_verify); +} + /*======================================================================== * Namespace helper routines *========================================================================*/ @@ -870,11 +900,10 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) error = xfs_da_grow_inode(args, &blkno); if (error) goto out; - error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, - XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1); if (error) goto out; - ASSERT(bp1 != NULL); + bp2 = NULL; error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, XFS_ATTR_FORK); @@ -1641,18 +1670,16 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) blkno = be32_to_cpu(info->back); if (blkno == 0) continue; - error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, XFS_ATTR_FORK, NULL); + error = xfs_attr_leaf_read(state->args->trans, state->args->dp, + blkno, -1, &bp); if (error) return(error); - ASSERT(bp != NULL); leaf = (xfs_attr_leafblock_t *)info; count = be16_to_cpu(leaf->hdr.count); bytes = state->blocksize - (state->blocksize>>2); bytes -= be16_to_cpu(leaf->hdr.usedbytes); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); count += be16_to_cpu(leaf->hdr.count); bytes -= be16_to_cpu(leaf->hdr.usedbytes); bytes -= count * sizeof(xfs_attr_leaf_entry_t); @@ -2518,15 +2545,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) /* * Set up the operation. 
*/ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); - if (error) { + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) return(error); - } - ASSERT(bp != NULL); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); ASSERT(args->index >= 0); entry = &leaf->entries[ args->index ]; @@ -2583,15 +2606,11 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) /* * Set up the operation. */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); - if (error) { + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) return(error); - } - ASSERT(bp != NULL); leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); ASSERT(args->index >= 0); entry = &leaf->entries[ args->index ]; @@ -2640,35 +2659,28 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) /* * Read the block containing the "old" attr */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1, - XFS_ATTR_FORK, NULL); - if (error) { - return(error); - } - ASSERT(bp1 != NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); + if (error) + return error; /* * Read the block containing the "new" attr, if it is different */ if (args->blkno2 != args->blkno) { - error = xfs_da_read_buf(args->trans, args->dp, args->blkno2, - -1, &bp2, XFS_ATTR_FORK, NULL); - if (error) { - return(error); - } - ASSERT(bp2 != NULL); + error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2, + -1, &bp2); + if (error) + return error; } else { bp2 = bp1; } leaf1 = bp1->b_addr; - ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); ASSERT(args->index >= 0); entry1 = &leaf1->entries[ args->index ]; leaf2 = bp2->b_addr; - 
ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); ASSERT(args->index2 >= 0); entry2 = &leaf2->entries[ args->index2 ]; diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index dea17722945e..8f7ab986f45d 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -227,6 +227,9 @@ int xfs_attr_leaf_to_shortform(struct xfs_buf *bp, int xfs_attr_leaf_clearflag(struct xfs_da_args *args); int xfs_attr_leaf_setflag(struct xfs_da_args *args); int xfs_attr_leaf_flipflags(xfs_da_args_t *args); +int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp); /* * Routines used for growing the Btree. From d9392a4bb75503fc2adbb5237c3df940c6467eb2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:17 +1100 Subject: [PATCH 65/78] xfs: add xfs_da_node verification Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_attr.c | 22 +++------ fs/xfs/xfs_attr_leaf.c | 12 ++--- fs/xfs/xfs_attr_leaf.h | 8 +-- fs/xfs/xfs_da_btree.c | 109 +++++++++++++++++++++++++++++++---------- fs/xfs/xfs_da_btree.h | 3 ++ fs/xfs/xfs_dir2_leaf.c | 2 +- fs/xfs/xfs_dir2_priv.h | 1 + 7 files changed, 107 insertions(+), 50 deletions(-) diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index d644915367e3..aaf472532b3c 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -1696,10 +1696,10 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_read_buf(state->args->trans, + error = xfs_da_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK, NULL); + &blk->bp, XFS_ATTR_FORK); if (error) return(error); } else { @@ -1715,10 +1715,10 @@ xfs_attr_refillstate(xfs_da_state_t 
*state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_read_buf(state->args->trans, + error = xfs_da_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK, NULL); + &blk->bp, XFS_ATTR_FORK); if (error) return(error); } else { @@ -1807,8 +1807,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ bp = NULL; if (cursor->blkno > 0) { - error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK, NULL); + error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1, + &bp, XFS_ATTR_FORK); if ((error != 0) && (error != EFSCORRUPTED)) return(error); if (bp) { @@ -1849,17 +1849,11 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) if (bp == NULL) { cursor->blkno = 0; for (;;) { - error = xfs_da_read_buf(NULL, context->dp, + error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1, &bp, - XFS_ATTR_FORK, NULL); + XFS_ATTR_FORK); if (error) return(error); - if (unlikely(bp == NULL)) { - XFS_ERROR_REPORT("xfs_attr_node_list(2)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount); - return(XFS_ERROR(EFSCORRUPTED)); - } node = bp->b_addr; if (node->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 357971536d50..efe170da2881 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -88,7 +88,7 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, xfs_mount_t *mp); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); -static void +void xfs_attr_leaf_verify( struct xfs_buf *bp) { @@ -2765,7 +2765,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * the extents in reverse order the extent containing * block 0 must still be there. 
*/ - error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK, NULL); + error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) return(error); blkno = XFS_BUF_ADDR(bp); @@ -2850,8 +2850,8 @@ xfs_attr_node_inactive( * traversal of the tree so we may deal with many blocks * before we come back to this one. */ - error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp, - XFS_ATTR_FORK, NULL); + error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp, + XFS_ATTR_FORK); if (error) return(error); if (child_bp) { @@ -2891,8 +2891,8 @@ xfs_attr_node_inactive( * child block number. */ if ((i+1) < count) { - error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, - &bp, XFS_ATTR_FORK, NULL); + error = xfs_da_node_read(*trans, dp, 0, parent_blkno, + &bp, XFS_ATTR_FORK); if (error) return(error); child_fsb = be32_to_cpu(node->btree[i+1].before); diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 8f7ab986f45d..098e9a58ad9f 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -227,9 +227,6 @@ int xfs_attr_leaf_to_shortform(struct xfs_buf *bp, int xfs_attr_leaf_clearflag(struct xfs_da_args *args); int xfs_attr_leaf_setflag(struct xfs_da_args *args); int xfs_attr_leaf_flipflags(xfs_da_args_t *args); -int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mappedbno, - struct xfs_buf **bpp); /* * Routines used for growing the Btree. 
@@ -264,4 +261,9 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp); int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local); +int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp); +void xfs_attr_leaf_verify(struct xfs_buf *bp); + #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index f9e9149de009..1b84fc50a053 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -91,6 +91,68 @@ STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *save_blk); STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); +static void +__xfs_da_node_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_node_hdr *hdr = bp->b_addr; + int block_ok = 0; + + block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC); + block_ok = block_ok && + be16_to_cpu(hdr->level) > 0 && + be16_to_cpu(hdr->count) > 0 ; + if (!block_ok) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +static void +xfs_da_node_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + __xfs_da_node_verify(bp); + return; + case XFS_ATTR_LEAF_MAGIC: + xfs_attr_leaf_verify(bp); + return; + case XFS_DIR2_LEAFN_MAGIC: + xfs_dir2_leafn_verify(bp); + return; + default: + break; + } + + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, info); + xfs_buf_ioerror(bp, EFSCORRUPTED); + + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} + +int +xfs_da_node_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp, + int which_fork) +{ + return xfs_da_read_buf(tp, dp, bno, 
mappedbno, bpp, + which_fork, xfs_da_node_verify); +} + /*======================================================================== * Routines used for growing the Btree. *========================================================================*/ @@ -746,8 +808,8 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) */ child = be32_to_cpu(oldroot->btree[0].before); ASSERT(child != 0); - error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, - args->whichfork, NULL); + error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp, + args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -837,9 +899,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) blkno = be32_to_cpu(info->back); if (blkno == 0) continue; - error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, state->args->whichfork, - NULL); + error = xfs_da_node_read(state->args->trans, state->args->dp, + blkno, -1, &bp, state->args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -1084,8 +1145,8 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) * Read the next node down in the tree. 
*/ blk->blkno = blkno; - error = xfs_da_read_buf(args->trans, args->dp, blkno, - -1, &blk->bp, args->whichfork, NULL); + error = xfs_da_node_read(args->trans, args->dp, blkno, + -1, &blk->bp, args->whichfork); if (error) { blk->blkno = 0; state->path.active--; @@ -1246,9 +1307,9 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = cpu_to_be32(old_blk->blkno); new_info->back = old_info->back; if (old_info->back) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(old_info->back), - -1, &bp, args->whichfork, NULL); + -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -1267,9 +1328,9 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, new_info->forw = old_info->forw; new_info->back = cpu_to_be32(old_blk->blkno); if (old_info->forw) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(old_info->forw), - -1, &bp, args->whichfork, NULL); + -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -1367,9 +1428,9 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_unlink_back(args); save_info->back = drop_info->back; if (drop_info->back) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(drop_info->back), - -1, &bp, args->whichfork, NULL); + -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -1384,9 +1445,9 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, trace_xfs_da_unlink_forward(args); save_info->forw = drop_info->forw; if (drop_info->forw) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da_node_read(args->trans, args->dp, be32_to_cpu(drop_info->forw), - -1, &bp, args->whichfork, NULL); + -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); @@ -1470,8 +1531,8 @@ 
xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, * Read the next child block. */ blk->blkno = blkno; - error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, - &blk->bp, args->whichfork, NULL); + error = xfs_da_node_read(args->trans, args->dp, blkno, -1, + &blk->bp, args->whichfork); if (error) return(error); ASSERT(blk->bp != NULL); @@ -1734,7 +1795,7 @@ xfs_da_swap_lastblock( * Read the last block in the btree space. */ last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; - error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w, NULL); + error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w); if (error) return error; /* @@ -1761,8 +1822,7 @@ xfs_da_swap_lastblock( * If the moved block has a left sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->back))) { - error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w, - NULL); + error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; @@ -1784,8 +1844,7 @@ xfs_da_swap_lastblock( * If the moved block has a right sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->forw))) { - error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w, - NULL); + error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); if (error) goto done; sib_info = sib_buf->b_addr; @@ -1809,8 +1868,7 @@ xfs_da_swap_lastblock( * Walk down the tree looking for the parent of the moved block. 
*/ for (;;) { - error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w, - NULL); + error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; @@ -1861,8 +1919,7 @@ xfs_da_swap_lastblock( error = XFS_ERROR(EFSCORRUPTED); goto done; } - error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w, - NULL); + error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); if (error) goto done; par_node = par_buf->b_addr; diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index bf8bfaa0d356..2d1bec4b7595 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -213,6 +213,9 @@ int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, */ int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, xfs_da_state_blk_t *new_blk); +int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp, int which_fork); /* * Utility routines. diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 97408e3287ed..67cc21c2a45d 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -74,7 +74,7 @@ xfs_dir2_leaf1_verify( xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); } -static void +void xfs_dir2_leafn_verify( struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 4560825d099c..e0b96e7693ea 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -70,6 +70,7 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ +extern void xfs_dir2_leafn_verify(struct xfs_buf *bp); extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, From da6958c873ecd846d71fafbfe0f6168bb9c2c99e Mon Sep 17 00:00:00 2001 From: Dave 
Chinner Date: Mon, 12 Nov 2012 22:54:18 +1100 Subject: [PATCH 66/78] xfs: Add verifiers to dir2 data readahead. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_da_btree.c | 4 ++-- fs/xfs/xfs_da_btree.h | 4 ++-- fs/xfs/xfs_dir2_data.c | 13 ++++++++++++- fs/xfs/xfs_dir2_leaf.c | 11 +++++------ fs/xfs/xfs_dir2_priv.h | 2 ++ fs/xfs/xfs_file.c | 4 +++- 6 files changed, 26 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 1b84fc50a053..93ebc0fc6dd9 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2285,10 +2285,10 @@ xfs_da_reada_buf( struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, + xfs_daddr_t mappedbno, int whichfork, xfs_buf_iodone_t verifier) { - xfs_daddr_t mappedbno = -1; struct xfs_buf_map map; struct xfs_buf_map *mapp; int nmap; @@ -2296,7 +2296,7 @@ xfs_da_reada_buf( mapp = &map; nmap = 1; - error = xfs_dabuf_map(trans, dp, bno, -1, whichfork, + error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, &mapp, &nmap); if (error) { /* mapping a hole is not an error, but we don't continue */ diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 2d1bec4b7595..521b008445ab 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -231,8 +231,8 @@ int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, struct xfs_buf **bpp, int whichfork, xfs_buf_iodone_t verifier); xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, - xfs_dablk_t bno, int whichfork, - xfs_buf_iodone_t verifier); + xfs_dablk_t bno, xfs_daddr_t mapped_bno, + int whichfork, xfs_buf_iodone_t verifier); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 0ef04f1bf511..1a43c8593c00 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -185,7 +185,7 @@ __xfs_dir2_data_check( return 0; } -static void +void
xfs_dir2_data_verify( struct xfs_buf *bp) { @@ -217,6 +217,17 @@ xfs_dir2_data_read( XFS_DATA_FORK, xfs_dir2_data_verify); } +int +xfs_dir2_data_readahead( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mapped_bno) +{ + return xfs_da_reada_buf(tp, dp, bno, mapped_bno, + XFS_DATA_FORK, xfs_dir2_data_verify); +} + /* * Given a data block and an unused entry from that block, * return the bestfree entry if any that corresponds to it. diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 67cc21c2a45d..8a95547d42ac 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -972,11 +972,11 @@ xfs_dir2_leaf_readbuf( */ if (i > mip->ra_current && map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { - xfs_buf_readahead(mp->m_ddev_targp, + xfs_dir2_data_readahead(NULL, dp, + map[mip->ra_index].br_startoff + mip->ra_offset, XFS_FSB_TO_DADDR(mp, map[mip->ra_index].br_startblock + - mip->ra_offset), - (int)BTOBB(mp->m_dirblksize), NULL); + mip->ra_offset)); mip->ra_current = i; } @@ -985,10 +985,9 @@ xfs_dir2_leaf_readbuf( * use our mapping, but this is a very rare case. 
*/ else if (i > mip->ra_current) { - xfs_da_reada_buf(NULL, dp, + xfs_dir2_data_readahead(NULL, dp, map[mip->ra_index].br_startoff + - mip->ra_offset, - XFS_DATA_FORK, NULL); + mip->ra_offset, -1); mip->ra_current = i; } diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index e0b96e7693ea..daf5d0fc6165 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -48,6 +48,8 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); +extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mapped_bno); extern struct xfs_dir2_data_free * xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f6dab7da7bcc..400b187595bb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -31,6 +31,8 @@ #include "xfs_error.h" #include "xfs_vnodeops.h" #include "xfs_da_btree.h" +#include "xfs_dir2_format.h" +#include "xfs_dir2_priv.h" #include "xfs_ioctl.h" #include "xfs_trace.h" @@ -891,7 +893,7 @@ xfs_dir_open( */ mode = xfs_ilock_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK, NULL); + xfs_dir2_data_readahead(NULL, ip, 0, -1); xfs_iunlock(ip, mode); return 0; } From cfb02852226aa449fe27075caffe88726507668c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 12 Nov 2012 22:54:19 +1100 Subject: [PATCH 67/78] xfs: add buffer pre-write callback Add a callback to the buffer write path to enable verification of the buffer and CRC calculation prior to issuing the write to the underlying storage. If the callback function detects some kind of failure or error condition, it must mark the buffer with an error so that the caller can take appropriate action. 
In the case of xfs_buf_ioapply(), a corrupt metadata buffer will trigger a shutdown of the filesystem, because something is clearly wrong and we can't allow corrupt metadata to be written to disk. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 16 ++++++++++++++++ fs/xfs/xfs_buf.h | 3 +++ 2 files changed, 19 insertions(+) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index fbc965fc075a..bd1a948ee39c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -569,7 +569,9 @@ found: */ if (bp->b_flags & XBF_STALE) { ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); + ASSERT(bp->b_iodone == NULL); bp->b_flags &= _XBF_KMEM | _XBF_PAGES; + bp->b_pre_io = NULL; } trace_xfs_buf_find(bp, flags, _RET_IP_); @@ -1323,6 +1325,20 @@ _xfs_buf_ioapply( /* we only use the buffer cache for meta-data */ rw |= REQ_META; + /* + * run the pre-io callback function if it exists. If this function + * fails it will mark the buffer with an error and the IO should + * not be dispatched. + */ + if (bp->b_pre_io) { + bp->b_pre_io(bp); + if (bp->b_error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_CORRUPT_INCORE); + return; + } + } + /* * Walk all the vectors issuing IO on them.
Set up the initial offset * into the buffer and the desired IO size before we start - diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 677b1dc822f4..51bc16a1cd9c 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -155,6 +155,9 @@ typedef struct xfs_buf { unsigned int b_offset; /* page offset in first page */ unsigned short b_error; /* error code on I/O */ + void (*b_pre_io)(struct xfs_buf *); + /* pre-io callback function */ + #ifdef XFS_BUF_LOCK_TRACKING int b_last_holder; #endif From 612cfbfe174a89d565363fff7f3961a2dda5fb71 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:52:32 +1100 Subject: [PATCH 68/78] xfs: add pre-write metadata buffer verifier callbacks These verifiers are essentially the same code as the read verifiers, but do not require ioend processing. Hence factor the read verifier functions and add a new write verifier wrapper that is used as the callback. This is done as one large patch for all verifiers rather than one patch per verifier as the change is largely mechanical. This includes hooking up the write verifier via the read verifier function. Hooking up the write verifier for buffers obtained via xfs_trans_get_buf() will be done in a separate patch as that touches code in many different places rather than just the verifier functions. 
Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 35 ++++++++++++++++++++++++++++++++--- fs/xfs/xfs_alloc_btree.c | 21 +++++++++++++++++---- fs/xfs/xfs_attr_leaf.c | 19 +++++++++++++++++-- fs/xfs/xfs_attr_leaf.h | 2 +- fs/xfs/xfs_bmap_btree.c | 21 +++++++++++++++++---- fs/xfs/xfs_da_btree.c | 37 +++++++++++++++++++++++++------------ fs/xfs/xfs_dir2_block.c | 16 +++++++++++++++- fs/xfs/xfs_dir2_data.c | 19 +++++++++++++++++-- fs/xfs/xfs_dir2_leaf.c | 31 ++++++++++++++++++++++++------- fs/xfs/xfs_dir2_node.c | 17 ++++++++++++++++- fs/xfs/xfs_dir2_priv.h | 2 +- fs/xfs/xfs_dquot.c | 27 +++++++++++++++++++++------ fs/xfs/xfs_dquot.h | 2 +- fs/xfs/xfs_ialloc.c | 17 ++++++++++++++++- fs/xfs/xfs_ialloc_btree.c | 19 ++++++++++++++++--- fs/xfs/xfs_inode.c | 19 +++++++++++++++++-- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_itable.c | 2 +- fs/xfs/xfs_mount.c | 19 +++++++++++++++++-- fs/xfs/xfs_qm.c | 2 +- 20 files changed, 273 insertions(+), 56 deletions(-) diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 38b4ab8957ff..d12bbedf6fe5 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -430,8 +430,8 @@ xfs_alloc_fixup_trees( return 0; } -void -xfs_agfl_read_verify( +static void +xfs_agfl_verify( struct xfs_buf *bp) { #ifdef WHEN_CRCS_COME_ALONG @@ -463,6 +463,21 @@ xfs_agfl_read_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); } #endif +} + +static void +xfs_agfl_write_verify( + struct xfs_buf *bp) +{ + xfs_agfl_verify(bp); +} + +void +xfs_agfl_read_verify( + struct xfs_buf *bp) +{ + xfs_agfl_verify(bp); + bp->b_pre_io = xfs_agfl_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -2129,7 +2144,7 @@ xfs_alloc_put_freelist( } static void -xfs_agf_read_verify( +xfs_agf_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -2164,7 +2179,21 @@ xfs_agf_read_verify( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} +static void 
+xfs_agf_write_verify( + struct xfs_buf *bp) +{ + xfs_agf_verify(bp); +} + +void +xfs_agf_read_verify( + struct xfs_buf *bp) +{ + xfs_agf_verify(bp); + bp->b_pre_io = xfs_agf_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 46961e52e9b8..6e98b22ebde0 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -272,8 +272,8 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } -void -xfs_allocbt_read_verify( +static void +xfs_allocbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -323,11 +323,24 @@ xfs_allocbt_read_verify( if (!sblock_ok) { trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_allocbt_read_verify", - XFS_ERRLEVEL_LOW, mp, block); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} +static void +xfs_allocbt_write_verify( + struct xfs_buf *bp) +{ + xfs_allocbt_verify(bp); +} + +void +xfs_allocbt_read_verify( + struct xfs_buf *bp) +{ + xfs_allocbt_verify(bp); + bp->b_pre_io = xfs_allocbt_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index efe170da2881..57729d71ab1a 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -88,7 +88,7 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, xfs_mount_t *mp); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); -void +static void xfs_attr_leaf_verify( struct xfs_buf *bp) { @@ -101,11 +101,26 @@ xfs_attr_leaf_verify( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} +static void +xfs_attr_leaf_write_verify( + struct xfs_buf *bp) +{ + xfs_attr_leaf_verify(bp); +} + +void +xfs_attr_leaf_read_verify( + struct xfs_buf *bp) +{ + xfs_attr_leaf_verify(bp); + bp->b_pre_io = xfs_attr_leaf_write_verify; bp->b_iodone = 
NULL; xfs_buf_ioend(bp, 0); } + int xfs_attr_leaf_read( struct xfs_trans *tp, @@ -115,7 +130,7 @@ xfs_attr_leaf_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - XFS_ATTR_FORK, xfs_attr_leaf_verify); + XFS_ATTR_FORK, xfs_attr_leaf_read_verify); } /*======================================================================== diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 098e9a58ad9f..3bbf6277e43c 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -264,6 +264,6 @@ int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); -void xfs_attr_leaf_verify(struct xfs_buf *bp); +void xfs_attr_leaf_read_verify(struct xfs_buf *bp); #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index bddca9b92869..17d7423e7503 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -708,8 +708,8 @@ xfs_bmbt_key_diff( cur->bc_rec.b.br_startoff; } -void -xfs_bmbt_read_verify( +static void +xfs_bmbt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -744,11 +744,24 @@ xfs_bmbt_read_verify( if (!lblock_ok) { trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_bmbt_read_verify", - XFS_ERRLEVEL_LOW, mp, block); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} +static void +xfs_bmbt_write_verify( + struct xfs_buf *bp) +{ + xfs_bmbt_verify(bp); +} + +void +xfs_bmbt_read_verify( + struct xfs_buf *bp) +{ + xfs_bmbt_verify(bp); + bp->b_pre_io = xfs_bmbt_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 93ebc0fc6dd9..6bb0a59eaaee 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -92,7 +92,7 @@ STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, STATIC 
void xfs_da_state_kill_altpath(xfs_da_state_t *state); static void -__xfs_da_node_verify( +xfs_da_node_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -108,12 +108,17 @@ __xfs_da_node_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); } - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } static void -xfs_da_node_verify( +xfs_da_node_write_verify( + struct xfs_buf *bp) +{ + xfs_da_node_verify(bp); +} + +static void +xfs_da_node_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -121,21 +126,22 @@ xfs_da_node_verify( switch (be16_to_cpu(info->magic)) { case XFS_DA_NODE_MAGIC: - __xfs_da_node_verify(bp); - return; + xfs_da_node_verify(bp); + break; case XFS_ATTR_LEAF_MAGIC: - xfs_attr_leaf_verify(bp); + xfs_attr_leaf_read_verify(bp); return; case XFS_DIR2_LEAFN_MAGIC: - xfs_dir2_leafn_verify(bp); + xfs_dir2_leafn_read_verify(bp); return; default: + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, + mp, info); + xfs_buf_ioerror(bp, EFSCORRUPTED); break; } - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, info); - xfs_buf_ioerror(bp, EFSCORRUPTED); - + bp->b_pre_io = xfs_da_node_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -150,7 +156,7 @@ xfs_da_node_read( int which_fork) { return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - which_fork, xfs_da_node_verify); + which_fork, xfs_da_node_read_verify); } /*======================================================================== @@ -816,7 +822,14 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) xfs_da_blkinfo_onlychild_validate(bp->b_addr, be16_to_cpu(oldroot->hdr.level)); + /* + * This could be copying a leaf back into the root block in the case of + * there only being a single leaf block left in the tree. Hence we have + * to update the pre_io pointer as well to match the buffer type change + * that could occur. 
+ */ memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); + root_blk->bp->b_pre_io = bp->b_pre_io; xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); error = xfs_da_shrink_inode(args, child, bp); return(error); diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index ca03b109772d..0f8793c74fe2 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -71,7 +71,21 @@ xfs_dir2_block_verify( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} +static void +xfs_dir2_block_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_block_verify(bp); +} + +void +xfs_dir2_block_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_block_verify(bp); + bp->b_pre_io = xfs_dir2_block_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -85,7 +99,7 @@ xfs_dir2_block_read( struct xfs_mount *mp = dp->i_mount; return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, - XFS_DATA_FORK, xfs_dir2_block_verify); + XFS_DATA_FORK, xfs_dir2_block_read_verify); } static void diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 1a43c8593c00..b555585f5ab6 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -200,11 +200,26 @@ xfs_dir2_data_verify( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} +static void +xfs_dir2_data_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_data_verify(bp); +} + +void +xfs_dir2_data_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_data_verify(bp); + bp->b_pre_io = xfs_dir2_data_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } + int xfs_dir2_data_read( struct xfs_trans *tp, @@ -214,7 +229,7 @@ xfs_dir2_data_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, - XFS_DATA_FORK, xfs_dir2_data_verify); + XFS_DATA_FORK, xfs_dir2_data_read_verify); } int @@ -225,7 +240,7 @@ xfs_dir2_data_readahead( xfs_daddr_t mapped_bno) { return xfs_da_reada_buf(tp, dp, 
bno, mapped_bno, - XFS_DATA_FORK, xfs_dir2_data_verify); + XFS_DATA_FORK, xfs_dir2_data_read_verify); } /* diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 8a95547d42ac..5b3bcab2a656 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -62,23 +62,40 @@ xfs_dir2_leaf_verify( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} +static void +xfs_dir2_leaf1_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); +} + +static void +xfs_dir2_leaf1_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + bp->b_pre_io = xfs_dir2_leaf1_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } static void -xfs_dir2_leaf1_verify( - struct xfs_buf *bp) +xfs_dir2_leafn_write_verify( + struct xfs_buf *bp) { - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); } void -xfs_dir2_leafn_verify( - struct xfs_buf *bp) +xfs_dir2_leafn_read_verify( + struct xfs_buf *bp) { xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); + bp->b_pre_io = xfs_dir2_leafn_write_verify; + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); } static int @@ -90,7 +107,7 @@ xfs_dir2_leaf_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_leaf1_verify); + XFS_DATA_FORK, xfs_dir2_leaf1_read_verify); } int @@ -102,7 +119,7 @@ xfs_dir2_leafn_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_leafn_verify); + XFS_DATA_FORK, xfs_dir2_leafn_read_verify); } /* diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 7c6f95697e28..a58abe1fc0d0 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -69,11 +69,26 @@ xfs_dir2_free_verify( XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} +static void 
+xfs_dir2_free_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_free_verify(bp); +} + +void +xfs_dir2_free_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_free_verify(bp); + bp->b_pre_io = xfs_dir2_free_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } + static int __xfs_dir2_free_read( struct xfs_trans *tp, @@ -83,7 +98,7 @@ __xfs_dir2_free_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_free_verify); + XFS_DATA_FORK, xfs_dir2_free_read_verify); } int diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index daf5d0fc6165..7ec61af8449f 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -72,7 +72,7 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ -extern void xfs_dir2_leafn_verify(struct xfs_buf *bp); +extern void xfs_dir2_leafn_read_verify(struct xfs_buf *bp); extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 0ba0f0992d6e..b38a10e6f2e0 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -360,8 +360,8 @@ xfs_qm_dqalloc( return (error); } -void -xfs_dquot_read_verify( +static void +xfs_dquot_buf_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -388,12 +388,26 @@ xfs_dquot_read_verify( error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, "xfs_dquot_read_verify"); if (error) { - XFS_CORRUPTION_ERROR("xfs_dquot_read_verify", - XFS_ERRLEVEL_LOW, mp, d); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d); xfs_buf_ioerror(bp, EFSCORRUPTED); break; } } +} + +static void +xfs_dquot_buf_write_verify( + struct xfs_buf *bp) +{ + xfs_dquot_buf_verify(bp); +} + +void +xfs_dquot_buf_read_verify( + struct xfs_buf *bp) 
+{ + xfs_dquot_buf_verify(bp); + bp->b_pre_io = xfs_dquot_buf_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -413,7 +427,7 @@ xfs_qm_dqrepair( /* * Read the buffer without verification so we get the corrupted - * buffer returned to us. + * buffer returned to us. make sure we verify it on write, though. */ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, @@ -423,6 +437,7 @@ xfs_qm_dqrepair( ASSERT(*bpp == NULL); return XFS_ERROR(error); } + (*bpp)->b_pre_io = xfs_dquot_buf_write_verify; ASSERT(xfs_buf_islocked(*bpp)); d = (struct xfs_dqblk *)(*bpp)->b_addr; @@ -521,7 +536,7 @@ xfs_qm_dqtobp( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, - 0, &bp, xfs_dquot_read_verify); + 0, &bp, xfs_dquot_buf_read_verify); if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index a08ba92d7da0..5438d883b628 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -140,7 +140,7 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, uint, struct xfs_dquot **); -extern void xfs_dquot_read_verify(struct xfs_buf *bp); +extern void xfs_dquot_buf_read_verify(struct xfs_buf *bp); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 5bd255e5f7b8..070f41845572 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1473,7 +1473,7 @@ xfs_check_agi_unlinked( #endif static void -xfs_agi_read_verify( +xfs_agi_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -1502,6 +1502,21 @@ xfs_agi_read_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); } xfs_check_agi_unlinked(agi); +} + +static void 
+xfs_agi_write_verify( + struct xfs_buf *bp) +{ + xfs_agi_verify(bp); +} + +void +xfs_agi_read_verify( + struct xfs_buf *bp) +{ + xfs_agi_verify(bp); + bp->b_pre_io = xfs_agi_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 11306c6d61c7..15a79f8ca03c 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -183,7 +183,7 @@ xfs_inobt_key_diff( } void -xfs_inobt_read_verify( +xfs_inobt_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -211,11 +211,24 @@ xfs_inobt_read_verify( if (!sblock_ok) { trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_inobt_read_verify", - XFS_ERRLEVEL_LOW, mp, block); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); xfs_buf_ioerror(bp, EFSCORRUPTED); } +} +static void +xfs_inobt_write_verify( + struct xfs_buf *bp) +{ + xfs_inobt_verify(bp); +} + +void +xfs_inobt_read_verify( + struct xfs_buf *bp) +{ + xfs_inobt_verify(bp); + bp->b_pre_io = xfs_inobt_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3a243d076950..910b2da01042 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -382,7 +382,7 @@ xfs_inobp_check( } #endif -void +static void xfs_inode_buf_verify( struct xfs_buf *bp) { @@ -418,6 +418,21 @@ xfs_inode_buf_verify( } } xfs_inobp_check(mp, bp); +} + +static void +xfs_inode_buf_write_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp); +} + +void +xfs_inode_buf_read_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp); + bp->b_pre_io = xfs_inode_buf_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -447,7 +462,7 @@ xfs_imap_to_bp( buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, (int)imap->im_len, buf_flags, &bp, - xfs_inode_buf_verify); + xfs_inode_buf_read_verify); if (error) { if (error == EAGAIN) { ASSERT(buf_flags & 
XBF_TRYLOCK); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1a892114792f..a322c19723a3 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -554,7 +554,7 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, struct xfs_buf **, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); -void xfs_inode_buf_verify(struct xfs_buf *); +void xfs_inode_buf_read_verify(struct xfs_buf *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 0f18d412e3e8..7f86fdaab7ae 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -397,7 +397,7 @@ xfs_bulkstat( & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, agbno, nbcluster, - xfs_inode_buf_verify); + xfs_inode_buf_read_verify); } irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bff18d73c610..c85da75e4a43 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -612,8 +612,8 @@ xfs_sb_to_disk( } } -void -xfs_sb_read_verify( +static void +xfs_sb_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; @@ -629,6 +629,21 @@ xfs_sb_read_verify( error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); if (error) xfs_buf_ioerror(bp, error); +} + +static void +xfs_sb_write_verify( + struct xfs_buf *bp) +{ + xfs_sb_verify(bp); +} + +void +xfs_sb_read_verify( + struct xfs_buf *bp) +{ + xfs_sb_verify(bp); + bp->b_pre_io = xfs_sb_write_verify; bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index a6dfb97490cc..bd40ae9624e5 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -893,7 +893,7 @@ xfs_qm_dqiter_bufs( error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), mp->m_quotainfo->qi_dqchunklen, 0, &bp, - xfs_dquot_read_verify); + xfs_dquot_buf_read_verify); if (error) 
break; From b0f539de9fcc543a3ffa40bc22bf51aca6ea6183 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:53:49 +1100 Subject: [PATCH 69/78] xfs: connect up write verifiers to new buffers Metadata buffers that are read from disk have write verifiers already attached to them, but newly allocated buffers do not. Add appropriate write verifiers to all new metadata buffers. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 8 +-- fs/xfs/xfs_alloc.h | 3 ++ fs/xfs/xfs_alloc_btree.c | 1 + fs/xfs/xfs_attr_leaf.c | 4 +- fs/xfs/xfs_bmap.c | 2 + fs/xfs/xfs_bmap_btree.c | 3 +- fs/xfs/xfs_bmap_btree.h | 1 + fs/xfs/xfs_btree.c | 1 + fs/xfs/xfs_btree.h | 2 + fs/xfs/xfs_da_btree.c | 3 ++ fs/xfs/xfs_dir2_block.c | 2 + fs/xfs/xfs_dir2_data.c | 11 ++-- fs/xfs/xfs_dir2_leaf.c | 19 ++++--- fs/xfs/xfs_dir2_node.c | 24 +++++---- fs/xfs/xfs_dir2_priv.h | 2 + fs/xfs/xfs_dquot.c | 104 +++++++++++++++++++------------------- fs/xfs/xfs_fsops.c | 8 ++- fs/xfs/xfs_ialloc.c | 5 +- fs/xfs/xfs_ialloc.h | 4 +- fs/xfs/xfs_ialloc_btree.c | 1 + fs/xfs/xfs_inode.c | 14 ++++- fs/xfs/xfs_inode.h | 1 + fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_mount.h | 1 + 24 files changed, 137 insertions(+), 89 deletions(-) diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index d12bbedf6fe5..545a6c4c2366 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -465,14 +465,14 @@ xfs_agfl_verify( #endif } -static void +void xfs_agfl_write_verify( struct xfs_buf *bp) { xfs_agfl_verify(bp); } -void +static void xfs_agfl_read_verify( struct xfs_buf *bp) { @@ -2181,14 +2181,14 @@ xfs_agf_verify( } } -static void +void xfs_agf_write_verify( struct xfs_buf *bp) { xfs_agf_verify(bp); } -void +static void xfs_agf_read_verify( struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index feacb061bab7..f32811f50f43 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -231,4 +231,7 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: 
length of extent */ int *stat); /* output: success/failure */ +void xfs_agf_write_verify(struct xfs_buf *bp); +void xfs_agfl_write_verify(struct xfs_buf *bp); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 6e98b22ebde0..b83396524913 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -401,6 +401,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, .read_verify = xfs_allocbt_read_verify, + .write_verify = xfs_allocbt_write_verify, #ifdef DEBUG .keys_inorder = xfs_allocbt_keys_inorder, .recs_inorder = xfs_allocbt_recs_inorder, diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 57729d71ab1a..5cd5b0c1d17a 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -924,7 +924,7 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) XFS_ATTR_FORK); if (error) goto out; - ASSERT(bp2 != NULL); + bp2->b_pre_io = bp1->b_pre_io; memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); bp1 = NULL; xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); @@ -978,7 +978,7 @@ xfs_attr_leaf_create( XFS_ATTR_FORK); if (error) return(error); - ASSERT(bp != NULL); + bp->b_pre_io = xfs_attr_leaf_write_verify; leaf = bp->b_addr; memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); hdr = &leaf->hdr; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 9ae7aba52e0f..6a0f3f9f39d3 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -3124,6 +3124,7 @@ xfs_bmap_extents_to_btree( /* * Fill in the child block. 
*/ + abp->b_pre_io = xfs_bmbt_write_verify; ablock = XFS_BUF_TO_BLOCK(abp); ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); ablock->bb_level = 0; @@ -3270,6 +3271,7 @@ xfs_bmap_local_to_extents( ASSERT(args.len == 1); *firstblock = args.fsbno; bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); + bp->b_pre_io = xfs_bmbt_write_verify; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); xfs_bmap_forkoff_reset(args.mp, ip, whichfork); diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 17d7423e7503..79758e1e4f74 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -749,7 +749,7 @@ xfs_bmbt_verify( } } -static void +void xfs_bmbt_write_verify( struct xfs_buf *bp) { @@ -806,6 +806,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, .read_verify = xfs_bmbt_read_verify, + .write_verify = xfs_bmbt_write_verify, #ifdef DEBUG .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 1d00fbe9dd79..938c85986549 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -233,6 +233,7 @@ extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern void xfs_bmbt_read_verify(struct xfs_buf *bp); +extern void xfs_bmbt_write_verify(struct xfs_buf *bp); extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index ef1066078c33..1e2d89eed2a4 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -996,6 +996,7 @@ xfs_btree_get_buf_block( if (!*bpp) return ENOMEM; + (*bpp)->b_pre_io = cur->bc_ops->write_verify; *block = XFS_BUF_TO_BLOCK(*bpp); 
return 0; } diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 3a4c314047a0..458ab3550898 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -189,6 +189,8 @@ struct xfs_btree_ops { union xfs_btree_key *key); void (*read_verify)(struct xfs_buf *bp); + void (*write_verify)(struct xfs_buf *bp); + #ifdef DEBUG /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 6bb0a59eaaee..087950fc2eb7 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -193,6 +193,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, xfs_trans_log_buf(tp, bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + bp->b_pre_io = xfs_da_node_write_verify; *bpp = bp; return(0); } @@ -392,6 +393,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, } memcpy(node, oldroot, size); xfs_trans_log_buf(tp, bp, 0, size - 1); + + bp->b_pre_io = blk1->bp->b_pre_io; blk1->bp = bp; blk1->blkno = blkno; diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 0f8793c74fe2..e2fdc6f03d8a 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -1010,6 +1010,7 @@ xfs_dir2_leaf_to_block( /* * Start converting it to block form. 
*/ + dbp->b_pre_io = xfs_dir2_block_write_verify; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); needlog = 1; needscan = 0; @@ -1139,6 +1140,7 @@ xfs_dir2_sf_to_block( kmem_free(sfp); return error; } + bp->b_pre_io = xfs_dir2_block_write_verify; hdr = bp->b_addr; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); /* diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index b555585f5ab6..dcb8a873ab92 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -185,7 +185,7 @@ __xfs_dir2_data_check( return 0; } -void +static void xfs_dir2_data_verify( struct xfs_buf *bp) { @@ -202,14 +202,14 @@ xfs_dir2_data_verify( } } -static void +void xfs_dir2_data_write_verify( struct xfs_buf *bp) { xfs_dir2_data_verify(bp); } -void +static void xfs_dir2_data_read_verify( struct xfs_buf *bp) { @@ -482,10 +482,9 @@ xfs_dir2_data_init( */ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, XFS_DATA_FORK); - if (error) { + if (error) return error; - } - ASSERT(bp != NULL); + bp->b_pre_io = xfs_dir2_data_write_verify; /* * Initialize the header. diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 5b3bcab2a656..3002ab7d54c3 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -81,7 +81,7 @@ xfs_dir2_leaf1_read_verify( xfs_buf_ioend(bp, 0); } -static void +void xfs_dir2_leafn_write_verify( struct xfs_buf *bp) { @@ -198,6 +198,7 @@ xfs_dir2_block_to_leaf( /* * Fix up the block header, make it a data block. */ + dbp->b_pre_io = xfs_dir2_data_write_verify; hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); if (needscan) xfs_dir2_data_freescan(mp, hdr, &needlog); @@ -1243,15 +1244,14 @@ xfs_dir2_leaf_init( * Get the buffer for the block. */ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, - XFS_DATA_FORK); - if (error) { + XFS_DATA_FORK); + if (error) return error; - } - ASSERT(bp != NULL); - leaf = bp->b_addr; + /* * Initialize the header. 
*/ + leaf = bp->b_addr; leaf->hdr.info.magic = cpu_to_be16(magic); leaf->hdr.info.forw = 0; leaf->hdr.info.back = 0; @@ -1264,10 +1264,12 @@ xfs_dir2_leaf_init( * the block. */ if (magic == XFS_DIR2_LEAF1_MAGIC) { + bp->b_pre_io = xfs_dir2_leaf1_write_verify; ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = 0; xfs_dir2_leaf_log_tail(tp, bp); - } + } else + bp->b_pre_io = xfs_dir2_leafn_write_verify; *bpp = bp; return 0; } @@ -1951,7 +1953,10 @@ xfs_dir2_node_to_leaf( xfs_dir2_leaf_compact(args, lbp); else xfs_dir2_leaf_log_header(tp, lbp); + + lbp->b_pre_io = xfs_dir2_leaf1_write_verify; leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); + /* * Set up the leaf tail from the freespace block. */ diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index a58abe1fc0d0..da90a91f4420 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -197,11 +197,12 @@ xfs_dir2_leaf_to_node( /* * Get the buffer for the new freespace block. */ - if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, - XFS_DATA_FORK))) { + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, + XFS_DATA_FORK); + if (error) return error; - } - ASSERT(fbp != NULL); + fbp->b_pre_io = xfs_dir2_free_write_verify; + free = fbp->b_addr; leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(mp, leaf); @@ -223,7 +224,10 @@ xfs_dir2_leaf_to_node( *to = cpu_to_be16(off); } free->hdr.nused = cpu_to_be32(n); + + lbp->b_pre_io = xfs_dir2_leafn_write_verify; leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); + /* * Log everything. 
*/ @@ -632,6 +636,7 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = (int)((char *)dep - (char *)curbp->b_addr); state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + curbp->b_pre_io = xfs_dir2_data_write_verify; if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); } @@ -646,6 +651,7 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = -1; state->extrablk.blkno = curdb; state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + curbp->b_pre_io = xfs_dir2_data_write_verify; } else { /* If the curbp is not the CI match block, drop it */ if (state->extrablk.bp != curbp) @@ -1638,12 +1644,12 @@ xfs_dir2_node_addname_int( /* * Get a buffer for the new block. */ - if ((error = xfs_da_get_buf(tp, dp, - xfs_dir2_db_to_da(mp, fbno), - -1, &fbp, XFS_DATA_FORK))) { + error = xfs_da_get_buf(tp, dp, + xfs_dir2_db_to_da(mp, fbno), + -1, &fbp, XFS_DATA_FORK); + if (error) return error; - } - ASSERT(fbp != NULL); + fbp->b_pre_io = xfs_dir2_free_write_verify; /* * Initialize the new block to be empty, and remember diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 7ec61af8449f..01b82dcddc3e 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -45,6 +45,7 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, #else #define xfs_dir2_data_check(dp,bp) #endif +extern void xfs_dir2_data_write_verify(struct xfs_buf *bp); extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); @@ -73,6 +74,7 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, /* xfs_dir2_leaf.c */ extern void xfs_dir2_leafn_read_verify(struct xfs_buf *bp); +extern void xfs_dir2_leafn_write_verify(struct xfs_buf *bp); extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct 
xfs_da_args *args, diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index b38a10e6f2e0..1b06aa051074 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -248,7 +248,57 @@ xfs_qm_init_dquot_blk( xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } +static void +xfs_dquot_buf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + struct xfs_disk_dquot *ddq; + xfs_dqid_t id = 0; + int i; + /* + * On the first read of the buffer, verify that each dquot is valid. + * We don't know what the id of the dquot is supposed to be, just that + * they should be increasing monotonically within the buffer. If the + * first id is corrupt, then it will fail on the second dquot in the + * buffer so corruptions could point to the wrong dquot in this case. + */ + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { + int error; + + ddq = &d[i].dd_diskdq; + + if (i == 0) + id = be32_to_cpu(ddq->d_id); + + error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, + "xfs_dquot_read_verify"); + if (error) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d); + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } + } +} + +static void +xfs_dquot_buf_write_verify( + struct xfs_buf *bp) +{ + xfs_dquot_buf_verify(bp); +} + +void +xfs_dquot_buf_read_verify( + struct xfs_buf *bp) +{ + xfs_dquot_buf_verify(bp); + bp->b_pre_io = xfs_dquot_buf_write_verify; + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} /* * Allocate a block and fill it with dquots. 
@@ -315,6 +365,7 @@ xfs_qm_dqalloc( error = xfs_buf_geterror(bp); if (error) goto error1; + bp->b_pre_io = xfs_dquot_buf_write_verify; /* * Make a chunk of dquots out of this buffer and log @@ -359,59 +410,6 @@ xfs_qm_dqalloc( return (error); } - -static void -xfs_dquot_buf_verify( - struct xfs_buf *bp) -{ - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; - struct xfs_disk_dquot *ddq; - xfs_dqid_t id = 0; - int i; - - /* - * On the first read of the buffer, verify that each dquot is valid. - * We don't know what the id of the dquot is supposed to be, just that - * they should be increasing monotonically within the buffer. If the - * first id is corrupt, then it will fail on the second dquot in the - * buffer so corruptions could point to the wrong dquot in this case. - */ - for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { - int error; - - ddq = &d[i].dd_diskdq; - - if (i == 0) - id = be32_to_cpu(ddq->d_id); - - error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, - "xfs_dquot_read_verify"); - if (error) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d); - xfs_buf_ioerror(bp, EFSCORRUPTED); - break; - } - } -} - -static void -xfs_dquot_buf_write_verify( - struct xfs_buf *bp) -{ - xfs_dquot_buf_verify(bp); -} - -void -xfs_dquot_buf_read_verify( - struct xfs_buf *bp) -{ - xfs_dquot_buf_verify(bp); - bp->b_pre_io = xfs_dquot_buf_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); -} - STATIC int xfs_qm_dqrepair( struct xfs_mount *mp, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index cb65b067ed31..5d6d6b9d369d 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -222,6 +222,7 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } + bp->b_pre_io = xfs_agf_write_verify; agf = XFS_BUF_TO_AGF(bp); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); @@ -259,6 +260,7 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } + bp->b_pre_io = xfs_agfl_write_verify; agfl = 
XFS_BUF_TO_AGFL(bp); for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) @@ -279,6 +281,7 @@ xfs_growfs_data_private( error = ENOMEM; goto error0; } + bp->b_pre_io = xfs_agi_write_verify; agi = XFS_BUF_TO_AGI(bp); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); @@ -450,9 +453,10 @@ xfs_growfs_data_private( bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0); - if (bp) + if (bp) { xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - else + bp->b_pre_io = xfs_sb_write_verify; + } else error = ENOMEM; } diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 070f41845572..faf68600d3a6 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -210,6 +210,7 @@ xfs_ialloc_inode_init( * to log a whole cluster of inodes instead of all the * individual transactions causing a lot of log traffic. */ + fbuf->b_pre_io = xfs_inode_buf_write_verify; xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); for (i = 0; i < ninodes; i++) { int ioffset = i << mp->m_sb.sb_inodelog; @@ -1504,14 +1505,14 @@ xfs_agi_verify( xfs_check_agi_unlinked(agi); } -static void +void xfs_agi_write_verify( struct xfs_buf *bp) { xfs_agi_verify(bp); } -void +static void xfs_agi_read_verify( struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 1fd6ea4e9c91..7a169e34e30e 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -147,7 +147,9 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, /* * Get the data from the pointed-to record. 
*/ -extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, +int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *stat); +void xfs_agi_write_verify(struct xfs_buf *bp); + #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 15a79f8ca03c..7761e1ebeff7 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -271,6 +271,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, .read_verify = xfs_inobt_read_verify, + .write_verify = xfs_inobt_write_verify, #ifdef DEBUG .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 910b2da01042..dfcbe73f1db4 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -420,7 +420,7 @@ xfs_inode_buf_verify( xfs_inobp_check(mp, bp); } -static void +void xfs_inode_buf_write_verify( struct xfs_buf *bp) { @@ -1782,6 +1782,18 @@ xfs_ifree_cluster( if (!bp) return ENOMEM; + + /* + * This buffer may not have been correctly initialised as we + * didn't read it from disk. That's not important because we are + * only using to mark the buffer as stale in the log, and to + * attach stale cached inodes on it. That means it will never be + * dispatched for IO. If it is, we want to know about it, and we + * want it to fail. We can acheive this by adding a write + * verifier to the buffer. + */ + bp->b_pre_io = xfs_inode_buf_write_verify; + /* * Walk the inodes already attached to the buffer and mark them * stale. 
These will all have the flush locks held, so an diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index a322c19723a3..482214d120a7 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -555,6 +555,7 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); void xfs_inode_buf_read_verify(struct xfs_buf *); +void xfs_inode_buf_write_verify(struct xfs_buf *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c85da75e4a43..152a7fc843f9 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -631,7 +631,7 @@ xfs_sb_verify( xfs_buf_ioerror(bp, error); } -static void +void xfs_sb_write_verify( struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index de9089acc610..29c1b3ac920e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -386,6 +386,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); #endif /* __KERNEL__ */ extern void xfs_sb_read_verify(struct xfs_buf *); +extern void xfs_sb_write_verify(struct xfs_buf *bp); extern void xfs_mod_sb(struct xfs_trans *, __int64_t); extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, xfs_agnumber_t *); From 1813dd64057490e7a0678a885c4fe6d02f78bdc1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 14 Nov 2012 17:54:40 +1100 Subject: [PATCH 70/78] xfs: convert buffer verifiers to an ops structure. To separate the verifiers from iodone functions and associate read and write verifiers at the same time, introduce a buffer verifier operations structure to the xfs_buf. This avoids the need for assigning the write verifier, clearing the iodone function and re-running ioend processing in the read verifier, and gets rid of the nasty "b_pre_io" name for the write verifier function pointer. 
If we ever need to, it will also be easier to add further content-specific callbacks to a buffer with an ops structure in place. We also avoid needing to export verifier functions; instead, we can simply export the ops structures for those that are needed outside the file they are defined in. This patch also fixes a directory block readahead verifier issue it exposed. This patch also adds ops callbacks to the inode/alloc btree blocks initialised by growfs. These will need more work before they will work with CRCs. Signed-off-by: Dave Chinner Reviewed-by: Phil White Signed-off-by: Ben Myers --- fs/xfs/xfs_ag.h | 4 +++ fs/xfs/xfs_alloc.c | 28 +++++++++-------- fs/xfs/xfs_alloc.h | 4 +-- fs/xfs/xfs_alloc_btree.c | 24 ++++++++------- fs/xfs/xfs_alloc_btree.h | 2 ++ fs/xfs/xfs_attr_leaf.c | 27 +++++++++-------- fs/xfs/xfs_attr_leaf.h | 3 +- fs/xfs/xfs_bmap.c | 22 +++++++------- fs/xfs/xfs_bmap_btree.c | 26 ++++++++-------- fs/xfs/xfs_bmap_btree.h | 3 +- fs/xfs/xfs_btree.c | 26 ++++++++-------- fs/xfs/xfs_btree.h | 9 +++--- fs/xfs/xfs_buf.c | 63 +++++++++++++++++++++++---------------- fs/xfs/xfs_buf.h | 24 ++++++++------- fs/xfs/xfs_da_btree.c | 40 +++++++++++++++---------- fs/xfs/xfs_da_btree.h | 4 +-- fs/xfs/xfs_dir2_block.c | 26 ++++++++-------- fs/xfs/xfs_dir2_data.c | 54 ++++++++++++++++++++++++++------- fs/xfs/xfs_dir2_leaf.c | 44 ++++++++++++++------------- fs/xfs/xfs_dir2_node.c | 32 ++++++++++---------- fs/xfs/xfs_dir2_priv.h | 10 +++++-- fs/xfs/xfs_dquot.c | 18 ++++++----- fs/xfs/xfs_dquot.h | 3 +- fs/xfs/xfs_fsops.c | 29 ++++++++++-------- fs/xfs/xfs_ialloc.c | 18 ++++++----- fs/xfs/xfs_ialloc.h | 2 +- fs/xfs/xfs_ialloc_btree.c | 23 +++++++------- fs/xfs/xfs_ialloc_btree.h | 2 ++ fs/xfs/xfs_inode.c | 28 +++++++++-------- fs/xfs/xfs_inode.h | 3 +- fs/xfs/xfs_itable.c | 2 +- fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_mount.c | 35 +++++++++++++--------- fs/xfs/xfs_mount.h | 4 +-- fs/xfs/xfs_qm.c | 2 +- fs/xfs/xfs_trans.h | 6 ++-- fs/xfs/xfs_trans_buf.c | 8
++--- 37 files changed, 384 insertions(+), 276 deletions(-) diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 22bd4db011c8..f2aeedb6a579 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -108,6 +108,8 @@ typedef struct xfs_agf { extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); +extern const struct xfs_buf_ops xfs_agf_buf_ops; + /* * Size of the unlinked inode hash table in the agi. */ @@ -161,6 +163,8 @@ typedef struct xfs_agi { extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); +extern const struct xfs_buf_ops xfs_agi_buf_ops; + /* * The third a.g. block contains the a.g. freelist, an array * of block pointers to blocks owned by the allocation btree code. diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 545a6c4c2366..393055fe3aef 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -465,7 +465,7 @@ xfs_agfl_verify( #endif } -void +static void xfs_agfl_write_verify( struct xfs_buf *bp) { @@ -477,11 +477,13 @@ xfs_agfl_read_verify( struct xfs_buf *bp) { xfs_agfl_verify(bp); - bp->b_pre_io = xfs_agfl_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_agfl_buf_ops = { + .verify_read = xfs_agfl_read_verify, + .verify_write = xfs_agfl_write_verify, +}; + /* * Read in the allocation group free block array. 
*/ @@ -499,7 +501,7 @@ xfs_alloc_read_agfl( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp, xfs_agfl_read_verify); + XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -2181,23 +2183,25 @@ xfs_agf_verify( } } -void -xfs_agf_write_verify( +static void +xfs_agf_read_verify( struct xfs_buf *bp) { xfs_agf_verify(bp); } static void -xfs_agf_read_verify( +xfs_agf_write_verify( struct xfs_buf *bp) { xfs_agf_verify(bp); - bp->b_pre_io = xfs_agf_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_agf_buf_ops = { + .verify_read = xfs_agf_read_verify, + .verify_write = xfs_agf_write_verify, +}; + /* * Read in the allocation group header (free/alloc section). */ @@ -2215,7 +2219,7 @@ xfs_read_agf( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp, xfs_agf_read_verify); + XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); if (error) return error; if (!*bpp) diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index f32811f50f43..99d0a6101558 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -231,7 +231,7 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ -void xfs_agf_write_verify(struct xfs_buf *bp); -void xfs_agfl_write_verify(struct xfs_buf *bp); +extern const struct xfs_buf_ops xfs_agf_buf_ops; +extern const struct xfs_buf_ops xfs_agfl_buf_ops; #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index b83396524913..b1ddef6b2689 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -328,6 +328,13 @@ xfs_allocbt_verify( } } +static void +xfs_allocbt_read_verify( + struct xfs_buf *bp) +{ + xfs_allocbt_verify(bp); +} + static void xfs_allocbt_write_verify( struct xfs_buf *bp) @@ -335,15 
+342,11 @@ xfs_allocbt_write_verify( xfs_allocbt_verify(bp); } -void -xfs_allocbt_read_verify( - struct xfs_buf *bp) -{ - xfs_allocbt_verify(bp); - bp->b_pre_io = xfs_allocbt_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); -} +const struct xfs_buf_ops xfs_allocbt_buf_ops = { + .verify_read = xfs_allocbt_read_verify, + .verify_write = xfs_allocbt_write_verify, +}; + #ifdef DEBUG STATIC int @@ -400,8 +403,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, - .read_verify = xfs_allocbt_read_verify, - .write_verify = xfs_allocbt_write_verify, + .buf_ops = &xfs_allocbt_buf_ops, #ifdef DEBUG .keys_inorder = xfs_allocbt_keys_inorder, .recs_inorder = xfs_allocbt_recs_inorder, diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index 359fb86ed876..7e89a2b429dd 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -93,4 +93,6 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, xfs_agnumber_t, xfs_btnum_t); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); +extern const struct xfs_buf_ops xfs_allocbt_buf_ops; + #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 5cd5b0c1d17a..ee24993c7d12 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -103,6 +103,13 @@ xfs_attr_leaf_verify( } } +static void +xfs_attr_leaf_read_verify( + struct xfs_buf *bp) +{ + xfs_attr_leaf_verify(bp); +} + static void xfs_attr_leaf_write_verify( struct xfs_buf *bp) @@ -110,16 +117,10 @@ xfs_attr_leaf_write_verify( xfs_attr_leaf_verify(bp); } -void -xfs_attr_leaf_read_verify( - struct xfs_buf *bp) -{ - xfs_attr_leaf_verify(bp); - bp->b_pre_io = xfs_attr_leaf_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); -} - +const struct xfs_buf_ops xfs_attr_leaf_buf_ops = { + .verify_read = xfs_attr_leaf_read_verify, 
+ .verify_write = xfs_attr_leaf_write_verify, +}; int xfs_attr_leaf_read( @@ -130,7 +131,7 @@ xfs_attr_leaf_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - XFS_ATTR_FORK, xfs_attr_leaf_read_verify); + XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops); } /*======================================================================== @@ -924,7 +925,7 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) XFS_ATTR_FORK); if (error) goto out; - bp2->b_pre_io = bp1->b_pre_io; + bp2->b_ops = bp1->b_ops; memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); bp1 = NULL; xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); @@ -978,7 +979,7 @@ xfs_attr_leaf_create( XFS_ATTR_FORK); if (error) return(error); - bp->b_pre_io = xfs_attr_leaf_write_verify; + bp->b_ops = &xfs_attr_leaf_buf_ops; leaf = bp->b_addr; memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); hdr = &leaf->hdr; diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 3bbf6277e43c..77de139a58f0 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -264,6 +264,7 @@ int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); -void xfs_attr_leaf_read_verify(struct xfs_buf *bp); + +extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops; #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 6a0f3f9f39d3..0e92d12765d2 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2663,7 +2663,7 @@ xfs_bmap_btree_to_extents( return error; #endif error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) return error; cblock = XFS_BUF_TO_BLOCK(cbp); @@ -3124,7 +3124,7 @@ xfs_bmap_extents_to_btree( /* * Fill in the child block. 
*/ - abp->b_pre_io = xfs_bmbt_write_verify; + abp->b_ops = &xfs_bmbt_buf_ops; ablock = XFS_BUF_TO_BLOCK(abp); ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); ablock->bb_level = 0; @@ -3271,7 +3271,7 @@ xfs_bmap_local_to_extents( ASSERT(args.len == 1); *firstblock = args.fsbno; bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); - bp->b_pre_io = xfs_bmbt_write_verify; + bp->b_ops = &xfs_bmbt_buf_ops; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); xfs_bmap_forkoff_reset(args.mp, ip, whichfork); @@ -4082,7 +4082,7 @@ xfs_bmap_read_extents( */ while (level-- > 0) { error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, xfs_bmbt_read_verify); + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) return error; block = XFS_BUF_TO_BLOCK(bp); @@ -4129,7 +4129,7 @@ xfs_bmap_read_extents( nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); if (nextbno != NULLFSBLOCK) xfs_btree_reada_bufl(mp, nextbno, 1, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); /* * Copy records into the extent records. 
*/ @@ -4162,7 +4162,7 @@ xfs_bmap_read_extents( if (bno == NULLFSBLOCK) break; error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF, xfs_bmbt_read_verify); + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) return error; block = XFS_BUF_TO_BLOCK(bp); @@ -5880,7 +5880,7 @@ xfs_bmap_check_leaf_extents( bp_release = 1; error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) goto error_norelse; } @@ -5966,7 +5966,7 @@ xfs_bmap_check_leaf_extents( bp_release = 1; error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) goto error_norelse; } @@ -6061,7 +6061,7 @@ xfs_bmap_count_tree( int numrecs; error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) return error; *count += 1; @@ -6073,7 +6073,7 @@ xfs_bmap_count_tree( while (nextbno != NULLFSBLOCK) { error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) return error; *count += 1; @@ -6105,7 +6105,7 @@ xfs_bmap_count_tree( bno = nextbno; error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, - xfs_bmbt_read_verify); + &xfs_bmbt_buf_ops); if (error) return error; *count += 1; diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 79758e1e4f74..061b45cbe614 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -749,22 +749,25 @@ xfs_bmbt_verify( } } -void +static void +xfs_bmbt_read_verify( + struct xfs_buf *bp) +{ + xfs_bmbt_verify(bp); +} + +static void xfs_bmbt_write_verify( struct xfs_buf *bp) { xfs_bmbt_verify(bp); } -void -xfs_bmbt_read_verify( - struct xfs_buf *bp) -{ - xfs_bmbt_verify(bp); - bp->b_pre_io = xfs_bmbt_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); -} +const struct xfs_buf_ops xfs_bmbt_buf_ops = { + .verify_read = xfs_bmbt_read_verify, + 
.verify_write = xfs_bmbt_write_verify, +}; + #ifdef DEBUG STATIC int @@ -805,8 +808,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, - .read_verify = xfs_bmbt_read_verify, - .write_verify = xfs_bmbt_write_verify, + .buf_ops = &xfs_bmbt_buf_ops, #ifdef DEBUG .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 938c85986549..88469ca08696 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -232,11 +232,10 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); -extern void xfs_bmbt_read_verify(struct xfs_buf *bp); -extern void xfs_bmbt_write_verify(struct xfs_buf *bp); extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); +extern const struct xfs_buf_ops xfs_bmbt_buf_ops; #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 1e2d89eed2a4..db010408d701 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -271,7 +271,7 @@ xfs_btree_dup_cursor( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp, - cur->bc_ops->read_verify); + cur->bc_ops->buf_ops); if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; @@ -621,7 +621,7 @@ xfs_btree_read_bufl( uint lock, /* lock flags for read_buf */ struct xfs_buf **bpp, /* buffer for fsbno */ int refval, /* ref count value for buffer */ - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; /* return value */ xfs_daddr_t d; /* real disk block address */ @@ -630,7 +630,7 @@ 
xfs_btree_read_bufl( ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp, verify); + mp->m_bsize, lock, &bp, ops); if (error) return error; ASSERT(!xfs_buf_geterror(bp)); @@ -650,13 +650,13 @@ xfs_btree_reada_bufl( struct xfs_mount *mp, /* file system mount point */ xfs_fsblock_t fsbno, /* file system block number */ xfs_extlen_t count, /* count of filesystem blocks */ - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { xfs_daddr_t d; ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, verify); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); } /* @@ -670,14 +670,14 @@ xfs_btree_reada_bufs( xfs_agnumber_t agno, /* allocation group number */ xfs_agblock_t agbno, /* allocation group block number */ xfs_extlen_t count, /* count of filesystem blocks */ - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { xfs_daddr_t d; ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, verify); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); } STATIC int @@ -692,13 +692,13 @@ xfs_btree_readahead_lblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { xfs_btree_reada_bufl(cur->bc_mp, left, 1, - cur->bc_ops->read_verify); + cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { xfs_btree_reada_bufl(cur->bc_mp, right, 1, - cur->bc_ops->read_verify); + cur->bc_ops->buf_ops); rval++; } @@ -718,13 +718,13 @@ xfs_btree_readahead_sblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - left, 1, cur->bc_ops->read_verify); + left, 1, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { xfs_btree_reada_bufs(cur->bc_mp, 
cur->bc_private.a.agno, - right, 1, cur->bc_ops->read_verify); + right, 1, cur->bc_ops->buf_ops); rval++; } @@ -996,7 +996,7 @@ xfs_btree_get_buf_block( if (!*bpp) return ENOMEM; - (*bpp)->b_pre_io = cur->bc_ops->write_verify; + (*bpp)->b_ops = cur->bc_ops->buf_ops; *block = XFS_BUF_TO_BLOCK(*bpp); return 0; } @@ -1024,7 +1024,7 @@ xfs_btree_read_buf_block( d = xfs_btree_ptr_to_daddr(cur, ptr); error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, flags, bpp, - cur->bc_ops->read_verify); + cur->bc_ops->buf_ops); if (error) return error; diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 458ab3550898..f932897194eb 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -188,8 +188,7 @@ struct xfs_btree_ops { __int64_t (*key_diff)(struct xfs_btree_cur *cur, union xfs_btree_key *key); - void (*read_verify)(struct xfs_buf *bp); - void (*write_verify)(struct xfs_buf *bp); + const struct xfs_buf_ops *buf_ops; #ifdef DEBUG /* check that k1 is lower than k2 */ @@ -359,7 +358,7 @@ xfs_btree_read_bufl( uint lock, /* lock flags for read_buf */ struct xfs_buf **bpp, /* buffer for fsbno */ int refval, /* ref count value for buffer */ - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); /* * Read-ahead the block, don't wait for it, don't return a buffer. @@ -370,7 +369,7 @@ xfs_btree_reada_bufl( struct xfs_mount *mp, /* file system mount point */ xfs_fsblock_t fsbno, /* file system block number */ xfs_extlen_t count, /* count of filesystem blocks */ - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); /* * Read-ahead the block, don't wait for it, don't return a buffer. 
@@ -382,7 +381,7 @@ xfs_btree_reada_bufs( xfs_agnumber_t agno, /* allocation group number */ xfs_agblock_t agbno, /* allocation group block number */ xfs_extlen_t count, /* count of filesystem blocks */ - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); /* * Initialise a new btree block header diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index bd1a948ee39c..26673a0b20e7 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -571,7 +571,7 @@ found: ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); ASSERT(bp->b_iodone == NULL); bp->b_flags &= _XBF_KMEM | _XBF_PAGES; - bp->b_pre_io = NULL; + bp->b_ops = NULL; } trace_xfs_buf_find(bp, flags, _RET_IP_); @@ -657,7 +657,7 @@ xfs_buf_read_map( struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; @@ -669,7 +669,7 @@ xfs_buf_read_map( if (!XFS_BUF_ISDONE(bp)) { XFS_STATS_INC(xb_get_read); - bp->b_iodone = verify; + bp->b_ops = ops; _xfs_buf_read(bp, flags); } else if (flags & XBF_ASYNC) { /* @@ -696,13 +696,13 @@ xfs_buf_readahead_map( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { if (bdi_read_congested(target->bt_bdi)) return; xfs_buf_read_map(target, map, nmaps, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, verify); + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops); } /* @@ -715,7 +715,7 @@ xfs_buf_read_uncached( xfs_daddr_t daddr, size_t numblks, int flags, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; @@ -728,7 +728,7 @@ xfs_buf_read_uncached( bp->b_bn = daddr; bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; - bp->b_iodone = verify; + bp->b_ops = ops; xfsbdstrat(target->bt_mount, bp); xfs_buf_iowait(bp); @@ -1001,27 +1001,37 @@ STATIC void xfs_buf_iodone_work( struct work_struct *work) { - xfs_buf_t *bp = + struct xfs_buf *bp = container_of(work, xfs_buf_t, b_iodone_work); + bool read = !!(bp->b_flags & XBF_READ); + 
+ bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); + if (read && bp->b_ops) + bp->b_ops->verify_read(bp); if (bp->b_iodone) (*(bp->b_iodone))(bp); else if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); + else { + ASSERT(read && bp->b_ops); + complete(&bp->b_iowait); + } } void xfs_buf_ioend( - xfs_buf_t *bp, - int schedule) + struct xfs_buf *bp, + int schedule) { + bool read = !!(bp->b_flags & XBF_READ); + trace_xfs_buf_iodone(bp, _RET_IP_); - bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); if (bp->b_error == 0) bp->b_flags |= XBF_DONE; - if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { + if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) { if (schedule) { INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); queue_work(xfslogd_workqueue, &bp->b_iodone_work); @@ -1029,6 +1039,7 @@ xfs_buf_ioend( xfs_buf_iodone_work(&bp->b_iodone_work); } } else { + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); complete(&bp->b_iowait); } } @@ -1316,6 +1327,20 @@ _xfs_buf_ioapply( rw |= REQ_FUA; if (bp->b_flags & XBF_FLUSH) rw |= REQ_FLUSH; + + /* + * Run the write verifier callback function if it exists. If + * this function fails it will mark the buffer with an error and + * the IO should not be dispatched. + */ + if (bp->b_ops) { + bp->b_ops->verify_write(bp); + if (bp->b_error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_CORRUPT_INCORE); + return; + } + } } else if (bp->b_flags & XBF_READ_AHEAD) { rw = READA; } else { @@ -1325,20 +1350,6 @@ _xfs_buf_ioapply( /* we only use the buffer cache for meta-data */ rw |= REQ_META; - /* - * run the pre-io callback function if it exists. If this function - * fails it will mark the buffer with an error and the IO should - * not be dispatched. - */ - if (bp->b_pre_io) { - bp->b_pre_io(bp); - if (bp->b_error) { - xfs_force_shutdown(bp->b_target->bt_mount, - SHUTDOWN_CORRUPT_INCORE); - return; - } - } - /* * Walk all the vectors issuing IO on them. 
Set up the initial offset * into the buffer and the desired IO size before we start - diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 51bc16a1cd9c..23f5642480bb 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -111,6 +111,11 @@ struct xfs_buf_map { #define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; +struct xfs_buf_ops { + void (*verify_read)(struct xfs_buf *); + void (*verify_write)(struct xfs_buf *); +}; + typedef struct xfs_buf { /* * first cacheline holds all the fields needed for an uncontended cache @@ -154,9 +159,7 @@ typedef struct xfs_buf { unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset in first page */ unsigned short b_error; /* error code on I/O */ - - void (*b_pre_io)(struct xfs_buf *); - /* pre-io callback function */ + const struct xfs_buf_ops *b_ops; #ifdef XFS_BUF_LOCK_TRACKING int b_last_holder; @@ -199,10 +202,11 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, xfs_buf_flags_t flags); struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags, xfs_buf_iodone_t verify); + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops); void xfs_buf_readahead_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); static inline struct xfs_buf * xfs_buf_get( @@ -221,10 +225,10 @@ xfs_buf_read( xfs_daddr_t blkno, size_t numblks, xfs_buf_flags_t flags, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_read_map(target, &map, 1, flags, verify); + return xfs_buf_read_map(target, &map, 1, flags, ops); } static inline void @@ -232,10 +236,10 @@ xfs_buf_readahead( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, 
numblks); - return xfs_buf_readahead_map(target, &map, 1, verify); + return xfs_buf_readahead_map(target, &map, 1, ops); } struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); @@ -246,7 +250,7 @@ struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags); struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, int flags, - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 087950fc2eb7..4d7696a02418 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -117,6 +117,12 @@ xfs_da_node_write_verify( xfs_da_node_verify(bp); } +/* + * leaf/node format detection on trees is sketchy, so a node read can be done on + * leaf level blocks when detection identifies the tree as a node format tree + * incorrectly. In this case, we need to swap the verifier to match the correct + * format of the block being read. 
+ */ static void xfs_da_node_read_verify( struct xfs_buf *bp) @@ -129,10 +135,12 @@ xfs_da_node_read_verify( xfs_da_node_verify(bp); break; case XFS_ATTR_LEAF_MAGIC: - xfs_attr_leaf_read_verify(bp); + bp->b_ops = &xfs_attr_leaf_buf_ops; + bp->b_ops->verify_read(bp); return; case XFS_DIR2_LEAFN_MAGIC: - xfs_dir2_leafn_read_verify(bp); + bp->b_ops = &xfs_dir2_leafn_buf_ops; + bp->b_ops->verify_read(bp); return; default: XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, @@ -140,12 +148,14 @@ xfs_da_node_read_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); break; } - - bp->b_pre_io = xfs_da_node_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_da_node_buf_ops = { + .verify_read = xfs_da_node_read_verify, + .verify_write = xfs_da_node_write_verify, +}; + + int xfs_da_node_read( struct xfs_trans *tp, @@ -156,7 +166,7 @@ xfs_da_node_read( int which_fork) { return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, - which_fork, xfs_da_node_read_verify); + which_fork, &xfs_da_node_buf_ops); } /*======================================================================== @@ -193,7 +203,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, xfs_trans_log_buf(tp, bp, XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); - bp->b_pre_io = xfs_da_node_write_verify; + bp->b_ops = &xfs_da_node_buf_ops; *bpp = bp; return(0); } @@ -394,7 +404,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, memcpy(node, oldroot, size); xfs_trans_log_buf(tp, bp, 0, size - 1); - bp->b_pre_io = blk1->bp->b_pre_io; + bp->b_ops = blk1->bp->b_ops; blk1->bp = bp; blk1->blkno = blkno; @@ -828,11 +838,11 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) /* * This could be copying a leaf back into the root block in the case of * there only being a single leaf block left in the tree. 
Hence we have - * to update the pre_io pointer as well to match the buffer type change + * to update the b_ops pointer as well to match the buffer type change * that could occur. */ memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); - root_blk->bp->b_pre_io = bp->b_pre_io; + root_blk->bp->b_ops = bp->b_ops; xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); error = xfs_da_shrink_inode(args, child, bp); return(error); @@ -2223,7 +2233,7 @@ xfs_da_read_buf( xfs_daddr_t mappedbno, struct xfs_buf **bpp, int whichfork, - xfs_buf_iodone_t verifier) + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; struct xfs_buf_map map; @@ -2245,7 +2255,7 @@ xfs_da_read_buf( error = xfs_trans_read_buf_map(dp->i_mount, trans, dp->i_mount->m_ddev_targp, - mapp, nmap, 0, &bp, verifier); + mapp, nmap, 0, &bp, ops); if (error) goto out_free; @@ -2303,7 +2313,7 @@ xfs_da_reada_buf( xfs_dablk_t bno, xfs_daddr_t mappedbno, int whichfork, - xfs_buf_iodone_t verifier) + const struct xfs_buf_ops *ops) { struct xfs_buf_map map; struct xfs_buf_map *mapp; @@ -2322,7 +2332,7 @@ xfs_da_reada_buf( } mappedbno = mapp[0].bm_bn; - xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, NULL); + xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops); out_free: if (mapp != &map) diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 521b008445ab..ee5170c46ae1 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -229,10 +229,10 @@ int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, int whichfork, - xfs_buf_iodone_t verifier); + const struct xfs_buf_ops *ops); xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, - int whichfork, xfs_buf_iodone_t verifier); + int whichfork, const struct xfs_buf_ops *ops); int 
xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index e2fdc6f03d8a..7536faaa61e7 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -73,6 +73,13 @@ xfs_dir2_block_verify( } } +static void +xfs_dir2_block_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_block_verify(bp); +} + static void xfs_dir2_block_write_verify( struct xfs_buf *bp) @@ -80,15 +87,10 @@ xfs_dir2_block_write_verify( xfs_dir2_block_verify(bp); } -void -xfs_dir2_block_read_verify( - struct xfs_buf *bp) -{ - xfs_dir2_block_verify(bp); - bp->b_pre_io = xfs_dir2_block_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); -} +const struct xfs_buf_ops xfs_dir2_block_buf_ops = { + .verify_read = xfs_dir2_block_read_verify, + .verify_write = xfs_dir2_block_write_verify, +}; static int xfs_dir2_block_read( @@ -99,7 +101,7 @@ xfs_dir2_block_read( struct xfs_mount *mp = dp->i_mount; return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, - XFS_DATA_FORK, xfs_dir2_block_read_verify); + XFS_DATA_FORK, &xfs_dir2_block_buf_ops); } static void @@ -1010,7 +1012,7 @@ xfs_dir2_leaf_to_block( /* * Start converting it to block form. 
*/ - dbp->b_pre_io = xfs_dir2_block_write_verify; + dbp->b_ops = &xfs_dir2_block_buf_ops; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); needlog = 1; needscan = 0; @@ -1140,7 +1142,7 @@ xfs_dir2_sf_to_block( kmem_free(sfp); return error; } - bp->b_pre_io = xfs_dir2_block_write_verify; + bp->b_ops = &xfs_dir2_block_buf_ops; hdr = bp->b_addr; hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); /* diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index dcb8a873ab92..ffcf1774152e 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -202,11 +202,31 @@ xfs_dir2_data_verify( } } -void -xfs_dir2_data_write_verify( - struct xfs_buf *bp) +/* + * Readahead of the first block of the directory when it is opened is completely + * oblivious to the format of the directory. Hence we can either get a block + * format buffer or a data format buffer on readahead. + */ +static void +xfs_dir2_data_reada_verify( + struct xfs_buf *bp) { - xfs_dir2_data_verify(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): + bp->b_ops = &xfs_dir2_block_buf_ops; + bp->b_ops->verify_read(bp); + return; + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): + xfs_dir2_data_verify(bp); + return; + default: + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } } static void @@ -214,11 +234,25 @@ xfs_dir2_data_read_verify( struct xfs_buf *bp) { xfs_dir2_data_verify(bp); - bp->b_pre_io = xfs_dir2_data_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +static void +xfs_dir2_data_write_verify( + struct xfs_buf *bp) +{ + xfs_dir2_data_verify(bp); +} + +const struct xfs_buf_ops xfs_dir2_data_buf_ops = { + .verify_read = xfs_dir2_data_read_verify, + .verify_write = xfs_dir2_data_write_verify, +}; + +static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = { + .verify_read = xfs_dir2_data_reada_verify, + .verify_write 
= xfs_dir2_data_write_verify, +}; + int xfs_dir2_data_read( @@ -229,7 +263,7 @@ xfs_dir2_data_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, - XFS_DATA_FORK, xfs_dir2_data_read_verify); + XFS_DATA_FORK, &xfs_dir2_data_buf_ops); } int @@ -240,7 +274,7 @@ xfs_dir2_data_readahead( xfs_daddr_t mapped_bno) { return xfs_da_reada_buf(tp, dp, bno, mapped_bno, - XFS_DATA_FORK, xfs_dir2_data_read_verify); + XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops); } /* @@ -484,7 +518,7 @@ xfs_dir2_data_init( XFS_DATA_FORK); if (error) return error; - bp->b_pre_io = xfs_dir2_data_write_verify; + bp->b_ops = &xfs_dir2_data_buf_ops; /* * Initialize the header. diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 3002ab7d54c3..60cd2fa4e047 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -65,20 +65,24 @@ xfs_dir2_leaf_verify( } static void -xfs_dir2_leaf1_write_verify( +xfs_dir2_leaf1_read_verify( struct xfs_buf *bp) { xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); } static void -xfs_dir2_leaf1_read_verify( +xfs_dir2_leaf1_write_verify( struct xfs_buf *bp) { xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); - bp->b_pre_io = xfs_dir2_leaf1_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); +} + +void +xfs_dir2_leafn_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); } void @@ -88,15 +92,15 @@ xfs_dir2_leafn_write_verify( xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); } -void -xfs_dir2_leafn_read_verify( - struct xfs_buf *bp) -{ - xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); - bp->b_pre_io = xfs_dir2_leafn_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); -} +static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = { + .verify_read = xfs_dir2_leaf1_read_verify, + .verify_write = xfs_dir2_leaf1_write_verify, +}; + +const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = { + .verify_read = 
xfs_dir2_leafn_read_verify, + .verify_write = xfs_dir2_leafn_write_verify, +}; static int xfs_dir2_leaf_read( @@ -107,7 +111,7 @@ xfs_dir2_leaf_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_leaf1_read_verify); + XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops); } int @@ -119,7 +123,7 @@ xfs_dir2_leafn_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_leafn_read_verify); + XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops); } /* @@ -198,7 +202,7 @@ xfs_dir2_block_to_leaf( /* * Fix up the block header, make it a data block. */ - dbp->b_pre_io = xfs_dir2_data_write_verify; + dbp->b_ops = &xfs_dir2_data_buf_ops; hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); if (needscan) xfs_dir2_data_freescan(mp, hdr, &needlog); @@ -1264,12 +1268,12 @@ xfs_dir2_leaf_init( * the block. */ if (magic == XFS_DIR2_LEAF1_MAGIC) { - bp->b_pre_io = xfs_dir2_leaf1_write_verify; + bp->b_ops = &xfs_dir2_leaf1_buf_ops; ltp = xfs_dir2_leaf_tail_p(mp, leaf); ltp->bestcount = 0; xfs_dir2_leaf_log_tail(tp, bp); } else - bp->b_pre_io = xfs_dir2_leafn_write_verify; + bp->b_ops = &xfs_dir2_leafn_buf_ops; *bpp = bp; return 0; } @@ -1954,7 +1958,7 @@ xfs_dir2_node_to_leaf( else xfs_dir2_leaf_log_header(tp, lbp); - lbp->b_pre_io = xfs_dir2_leaf1_write_verify; + lbp->b_ops = &xfs_dir2_leaf1_buf_ops; leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); /* diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index da90a91f4420..5980f9b7fa9b 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -71,6 +71,13 @@ xfs_dir2_free_verify( } } +static void +xfs_dir2_free_read_verify( + struct xfs_buf *bp) +{ + xfs_dir2_free_verify(bp); +} + static void xfs_dir2_free_write_verify( struct xfs_buf *bp) @@ -78,15 +85,10 @@ xfs_dir2_free_write_verify( xfs_dir2_free_verify(bp); } -void -xfs_dir2_free_read_verify( - struct xfs_buf *bp) -{ - xfs_dir2_free_verify(bp); - bp->b_pre_io = 
xfs_dir2_free_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); -} +static const struct xfs_buf_ops xfs_dir2_free_buf_ops = { + .verify_read = xfs_dir2_free_read_verify, + .verify_write = xfs_dir2_free_write_verify, +}; static int @@ -98,7 +100,7 @@ __xfs_dir2_free_read( struct xfs_buf **bpp) { return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, - XFS_DATA_FORK, xfs_dir2_free_read_verify); + XFS_DATA_FORK, &xfs_dir2_free_buf_ops); } int @@ -201,7 +203,7 @@ xfs_dir2_leaf_to_node( XFS_DATA_FORK); if (error) return error; - fbp->b_pre_io = xfs_dir2_free_write_verify; + fbp->b_ops = &xfs_dir2_free_buf_ops; free = fbp->b_addr; leaf = lbp->b_addr; @@ -225,7 +227,7 @@ xfs_dir2_leaf_to_node( } free->hdr.nused = cpu_to_be32(n); - lbp->b_pre_io = xfs_dir2_leafn_write_verify; + lbp->b_ops = &xfs_dir2_leafn_buf_ops; leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); /* @@ -636,7 +638,7 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = (int)((char *)dep - (char *)curbp->b_addr); state->extrablk.magic = XFS_DIR2_DATA_MAGIC; - curbp->b_pre_io = xfs_dir2_data_write_verify; + curbp->b_ops = &xfs_dir2_data_buf_ops; if (cmp == XFS_CMP_EXACT) return XFS_ERROR(EEXIST); } @@ -651,7 +653,7 @@ xfs_dir2_leafn_lookup_for_entry( state->extrablk.index = -1; state->extrablk.blkno = curdb; state->extrablk.magic = XFS_DIR2_DATA_MAGIC; - curbp->b_pre_io = xfs_dir2_data_write_verify; + curbp->b_ops = &xfs_dir2_data_buf_ops; } else { /* If the curbp is not the CI match block, drop it */ if (state->extrablk.bp != curbp) @@ -1649,7 +1651,7 @@ xfs_dir2_node_addname_int( -1, &fbp, XFS_DATA_FORK); if (error) return error; - fbp->b_pre_io = xfs_dir2_free_write_verify; + fbp->b_ops = &xfs_dir2_free_buf_ops; /* * Initialize the new block to be empty, and remember diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 01b82dcddc3e..7da79f6515fd 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -30,6 +30,8 @@ extern int xfs_dir_cilookup_result(struct 
xfs_da_args *args, const unsigned char *name, int len); /* xfs_dir2_block.c */ +extern const struct xfs_buf_ops xfs_dir2_block_buf_ops; + extern int xfs_dir2_block_addname(struct xfs_da_args *args); extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, xfs_off_t *offset, filldir_t filldir); @@ -45,7 +47,9 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, #else #define xfs_dir2_data_check(dp,bp) #endif -extern void xfs_dir2_data_write_verify(struct xfs_buf *bp); + +extern const struct xfs_buf_ops xfs_dir2_data_buf_ops; + extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); @@ -73,8 +77,8 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); /* xfs_dir2_leaf.c */ -extern void xfs_dir2_leafn_read_verify(struct xfs_buf *bp); -extern void xfs_dir2_leafn_write_verify(struct xfs_buf *bp); +extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops; + extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 1b06aa051074..9e1bf5294c91 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -284,22 +284,24 @@ xfs_dquot_buf_verify( } static void -xfs_dquot_buf_write_verify( +xfs_dquot_buf_read_verify( struct xfs_buf *bp) { xfs_dquot_buf_verify(bp); } void -xfs_dquot_buf_read_verify( +xfs_dquot_buf_write_verify( struct xfs_buf *bp) { xfs_dquot_buf_verify(bp); - bp->b_pre_io = xfs_dquot_buf_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_dquot_buf_ops = { + .verify_read = xfs_dquot_buf_read_verify, + .verify_write = xfs_dquot_buf_write_verify, +}; + /* * Allocate a 
block and fill it with dquots. * This is called when the bmapi finds a hole. @@ -365,7 +367,7 @@ xfs_qm_dqalloc( error = xfs_buf_geterror(bp); if (error) goto error1; - bp->b_pre_io = xfs_dquot_buf_write_verify; + bp->b_ops = &xfs_dquot_buf_ops; /* * Make a chunk of dquots out of this buffer and log @@ -435,7 +437,7 @@ xfs_qm_dqrepair( ASSERT(*bpp == NULL); return XFS_ERROR(error); } - (*bpp)->b_pre_io = xfs_dquot_buf_write_verify; + (*bpp)->b_ops = &xfs_dquot_buf_ops; ASSERT(xfs_buf_islocked(*bpp)); d = (struct xfs_dqblk *)(*bpp)->b_addr; @@ -534,7 +536,7 @@ xfs_qm_dqtobp( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, - 0, &bp, xfs_dquot_buf_read_verify); + 0, &bp, &xfs_dquot_buf_ops); if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 5438d883b628..c694a8469c4a 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -140,7 +140,6 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, uint, struct xfs_dquot **); -extern void xfs_dquot_buf_read_verify(struct xfs_buf *bp); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); @@ -162,4 +161,6 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } +extern const struct xfs_buf_ops xfs_dquot_buf_ops; + #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 5d6d6b9d369d..94eaeedc5498 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -119,7 +119,8 @@ xfs_growfs_get_hdr_buf( struct xfs_mount *mp, xfs_daddr_t blkno, size_t numblks, - int flags) + int flags, + const struct xfs_buf_ops *ops) { struct xfs_buf *bp; @@ -130,6 +131,7 @@ xfs_growfs_get_hdr_buf( xfs_buf_zero(bp, 0, 
BBTOB(bp->b_length)); bp->b_bn = blkno; bp->b_maps[0].bm_bn = blkno; + bp->b_ops = ops; return bp; } @@ -217,12 +219,12 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agf_buf_ops); if (!bp) { error = ENOMEM; goto error0; } - bp->b_pre_io = xfs_agf_write_verify; agf = XFS_BUF_TO_AGF(bp); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); @@ -255,12 +257,12 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agfl_buf_ops); if (!bp) { error = ENOMEM; goto error0; } - bp->b_pre_io = xfs_agfl_write_verify; agfl = XFS_BUF_TO_AGFL(bp); for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) @@ -276,12 +278,12 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agi_buf_ops); if (!bp) { error = ENOMEM; goto error0; } - bp->b_pre_io = xfs_agi_write_verify; agi = XFS_BUF_TO_AGI(bp); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); @@ -306,7 +308,8 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_allocbt_buf_ops); if (!bp) { error = ENOMEM; @@ -329,7 +332,8 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_allocbt_buf_ops); if (!bp) { error = ENOMEM; goto error0; @@ -352,7 +356,8 @@ xfs_growfs_data_private( */ bp = xfs_growfs_get_hdr_buf(mp, XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_inobt_buf_ops); if (!bp) { error = ENOMEM; goto error0; @@ -448,14 +453,14 @@ 
xfs_growfs_data_private( error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, - xfs_sb_read_verify); + &xfs_sb_buf_ops); } else { bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0); if (bp) { + bp->b_ops = &xfs_sb_buf_ops; xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - bp->b_pre_io = xfs_sb_write_verify; } else error = ENOMEM; } diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index faf68600d3a6..2d6495eaaa34 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -210,7 +210,7 @@ xfs_ialloc_inode_init( * to log a whole cluster of inodes instead of all the * individual transactions causing a lot of log traffic. */ - fbuf->b_pre_io = xfs_inode_buf_write_verify; + fbuf->b_ops = &xfs_inode_buf_ops; xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); for (i = 0; i < ninodes; i++) { int ioffset = i << mp->m_sb.sb_inodelog; @@ -1505,23 +1505,25 @@ xfs_agi_verify( xfs_check_agi_unlinked(agi); } -void -xfs_agi_write_verify( +static void +xfs_agi_read_verify( struct xfs_buf *bp) { xfs_agi_verify(bp); } static void -xfs_agi_read_verify( +xfs_agi_write_verify( struct xfs_buf *bp) { xfs_agi_verify(bp); - bp->b_pre_io = xfs_agi_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } +const struct xfs_buf_ops xfs_agi_buf_ops = { + .verify_read = xfs_agi_read_verify, + .verify_write = xfs_agi_write_verify, +}; + /* * Read in the allocation group header (inode allocation section) */ @@ -1538,7 +1540,7 @@ xfs_read_agi( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, bpp, xfs_agi_read_verify); + XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); if (error) return error; diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 7a169e34e30e..c8da3df271e6 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -150,6 +150,6 @@ int 
xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *stat); -void xfs_agi_write_verify(struct xfs_buf *bp); +extern const struct xfs_buf_ops xfs_agi_buf_ops; #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 7761e1ebeff7..bec344b36507 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -216,6 +216,13 @@ xfs_inobt_verify( } } +static void +xfs_inobt_read_verify( + struct xfs_buf *bp) +{ + xfs_inobt_verify(bp); +} + static void xfs_inobt_write_verify( struct xfs_buf *bp) @@ -223,15 +230,10 @@ xfs_inobt_write_verify( xfs_inobt_verify(bp); } -void -xfs_inobt_read_verify( - struct xfs_buf *bp) -{ - xfs_inobt_verify(bp); - bp->b_pre_io = xfs_inobt_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); -} +const struct xfs_buf_ops xfs_inobt_buf_ops = { + .verify_read = xfs_inobt_read_verify, + .verify_write = xfs_inobt_write_verify, +}; #ifdef DEBUG STATIC int @@ -270,8 +272,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, - .read_verify = xfs_inobt_read_verify, - .write_verify = xfs_inobt_write_verify, + .buf_ops = &xfs_inobt_buf_ops, #ifdef DEBUG .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index f782ad0c4769..25c0239a8eab 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -109,4 +109,6 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); +extern const struct xfs_buf_ops xfs_inobt_buf_ops; + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 
dfcbe73f1db4..66282dcb821b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -420,22 +420,26 @@ xfs_inode_buf_verify( xfs_inobp_check(mp, bp); } -void + +static void +xfs_inode_buf_read_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp); +} + +static void xfs_inode_buf_write_verify( struct xfs_buf *bp) { xfs_inode_buf_verify(bp); } -void -xfs_inode_buf_read_verify( - struct xfs_buf *bp) -{ - xfs_inode_buf_verify(bp); - bp->b_pre_io = xfs_inode_buf_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); -} +const struct xfs_buf_ops xfs_inode_buf_ops = { + .verify_read = xfs_inode_buf_read_verify, + .verify_write = xfs_inode_buf_write_verify, +}; + /* * This routine is called to map an inode to the buffer containing the on-disk @@ -462,7 +466,7 @@ xfs_imap_to_bp( buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, (int)imap->im_len, buf_flags, &bp, - xfs_inode_buf_read_verify); + &xfs_inode_buf_ops); if (error) { if (error == EAGAIN) { ASSERT(buf_flags & XBF_TRYLOCK); @@ -1792,7 +1796,7 @@ xfs_ifree_cluster( * want it to fail. We can acheive this by adding a write * verifier to the buffer. 
*/ - bp->b_pre_io = xfs_inode_buf_write_verify; + bp->b_ops = &xfs_inode_buf_ops; /* * Walk the inodes already attached to the buffer and mark them diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 482214d120a7..22baf6ea4fac 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -554,8 +554,6 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, struct xfs_buf **, uint, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, uint); -void xfs_inode_buf_read_verify(struct xfs_buf *); -void xfs_inode_buf_write_verify(struct xfs_buf *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); @@ -600,5 +598,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); extern struct kmem_zone *xfs_ifork_zone; extern struct kmem_zone *xfs_inode_zone; extern struct kmem_zone *xfs_ili_zone; +extern const struct xfs_buf_ops xfs_inode_buf_ops; #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 7f86fdaab7ae..2ea7d402188d 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -397,7 +397,7 @@ xfs_bulkstat( & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, agbno, nbcluster, - xfs_inode_buf_read_verify); + &xfs_inode_buf_ops); } irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 924a4bc3d49a..931e8e23f192 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3699,7 +3699,7 @@ xlog_do_recover( ASSERT(!(XFS_BUF_ISWRITE(bp))); XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); - bp->b_iodone = xfs_sb_read_verify; + bp->b_ops = &xfs_sb_buf_ops; xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); if (error) { diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 152a7fc843f9..da508463ff10 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -631,21 +631,11 @@ xfs_sb_verify( xfs_buf_ioerror(bp, error); } -void 
-xfs_sb_write_verify( - struct xfs_buf *bp) -{ - xfs_sb_verify(bp); -} - -void +static void xfs_sb_read_verify( struct xfs_buf *bp) { xfs_sb_verify(bp); - bp->b_pre_io = xfs_sb_write_verify; - bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); } /* @@ -654,7 +644,7 @@ xfs_sb_read_verify( * If we find an XFS superblock, the run a normal, noisy mount because we are * really going to mount it and want to know about errors. */ -void +static void xfs_sb_quiet_read_verify( struct xfs_buf *bp) { @@ -671,6 +661,23 @@ xfs_sb_quiet_read_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); } +static void +xfs_sb_write_verify( + struct xfs_buf *bp) +{ + xfs_sb_verify(bp); +} + +const struct xfs_buf_ops xfs_sb_buf_ops = { + .verify_read = xfs_sb_read_verify, + .verify_write = xfs_sb_write_verify, +}; + +static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { + .verify_read = xfs_sb_quiet_read_verify, + .verify_write = xfs_sb_write_verify, +}; + /* * xfs_readsb * @@ -697,8 +704,8 @@ xfs_readsb(xfs_mount_t *mp, int flags) reread: bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), 0, - loud ? xfs_sb_read_verify - : xfs_sb_quiet_read_verify); + loud ? 
&xfs_sb_buf_ops + : &xfs_sb_quiet_buf_ops); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 29c1b3ac920e..bab8314507e4 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -385,12 +385,12 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); #endif /* __KERNEL__ */ -extern void xfs_sb_read_verify(struct xfs_buf *); -extern void xfs_sb_write_verify(struct xfs_buf *bp); extern void xfs_mod_sb(struct xfs_trans *, __int64_t); extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, xfs_agnumber_t *); extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); +extern const struct xfs_buf_ops xfs_sb_buf_ops; + #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index bd40ae9624e5..e6a0af0ba007 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -893,7 +893,7 @@ xfs_qm_dqiter_bufs( error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), mp->m_quotainfo->qi_dqchunklen, 0, &bp, - xfs_dquot_buf_read_verify); + &xfs_dquot_buf_ops); if (error) break; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index f02d40296506..c6c0601abd7a 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -474,7 +474,7 @@ int xfs_trans_read_buf_map(struct xfs_mount *mp, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp, - xfs_buf_iodone_t verify); + const struct xfs_buf_ops *ops); static inline int xfs_trans_read_buf( @@ -485,11 +485,11 @@ xfs_trans_read_buf( int numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_trans_read_buf_map(mp, tp, target, &map, 1, - flags, bpp, verify); + flags, bpp, ops); } struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); diff --git a/fs/xfs/xfs_trans_buf.c 
b/fs/xfs/xfs_trans_buf.c index 977628207b45..4fc17d479d42 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -258,7 +258,7 @@ xfs_trans_read_buf_map( int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp, - xfs_buf_iodone_t verify) + const struct xfs_buf_ops *ops) { xfs_buf_t *bp; xfs_buf_log_item_t *bip; @@ -266,7 +266,7 @@ xfs_trans_read_buf_map( *bpp = NULL; if (!tp) { - bp = xfs_buf_read_map(target, map, nmaps, flags, verify); + bp = xfs_buf_read_map(target, map, nmaps, flags, ops); if (!bp) return (flags & XBF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); @@ -315,7 +315,7 @@ xfs_trans_read_buf_map( ASSERT(!XFS_BUF_ISASYNC(bp)); ASSERT(bp->b_iodone == NULL); XFS_BUF_READ(bp); - bp->b_iodone = verify; + bp->b_ops = ops; xfsbdstrat(tp->t_mountp, bp); error = xfs_buf_iowait(bp); if (error) { @@ -352,7 +352,7 @@ xfs_trans_read_buf_map( return 0; } - bp = xfs_buf_read_map(target, map, nmaps, flags, verify); + bp = xfs_buf_read_map(target, map, nmaps, flags, ops); if (bp == NULL) { *bpp = NULL; return (flags & XBF_TRYLOCK) ? From bc02e8693d875c2a9b0037cfd37fe0b726d26403 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Nov 2012 09:20:37 +1100 Subject: [PATCH 71/78] xfs: add CRC infrastructure - add a mount feature bit for CRC enabled filesystems - add some helpers for generating and verifying the CRCs - add a copy_uuid helper The checksumming helpers are loosely based on similar ones in sctp, all other bits come from Dave Chinner. 
Signed-off-by: Christoph Hellwig Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/Kconfig | 1 + fs/xfs/uuid.h | 6 +++++ fs/xfs/xfs_cksum.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_linux.h | 1 + fs/xfs/xfs_sb.h | 7 ++++++ 5 files changed, 78 insertions(+) create mode 100644 fs/xfs/xfs_cksum.h diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 6100ec0fa1d4..5a7ffe54f5d5 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -2,6 +2,7 @@ config XFS_FS tristate "XFS filesystem support" depends on BLOCK select EXPORTFS + select LIBCRC32C help XFS is a high performance journaling filesystem which originated on the SGI IRIX platform. It is completely multi-threaded, can diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h index 4732d71262cc..104db0f3bed6 100644 --- a/fs/xfs/uuid.h +++ b/fs/xfs/uuid.h @@ -26,4 +26,10 @@ extern int uuid_is_nil(uuid_t *uuid); extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); +static inline void +uuid_copy(uuid_t *dst, uuid_t *src) +{ + memcpy(dst, src, sizeof(uuid_t)); +} + #endif /* __XFS_SUPPORT_UUID_H__ */ diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h new file mode 100644 index 000000000000..fad1676ad8cd --- /dev/null +++ b/fs/xfs/xfs_cksum.h @@ -0,0 +1,63 @@ +#ifndef _XFS_CKSUM_H +#define _XFS_CKSUM_H 1 + +#define XFS_CRC_SEED (~(__uint32_t)0) + +/* + * Calculate the intermediate checksum for a buffer that has the CRC field + * inside it. The offset of the 32bit crc fields is passed as the + * cksum_offset parameter. + */ +static inline __uint32_t +xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t zero = 0; + __uint32_t crc; + + /* Calculate CRC up to the checksum. */ + crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset); + + /* Skip checksum field */ + crc = crc32c(crc, &zero, sizeof(__u32)); + + /* Calculate the rest of the CRC. 
*/ + return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)], + length - (cksum_offset + sizeof(__be32))); +} + +/* + * Convert the intermediate checksum to the final ondisk format. + * + * The CRC32c calculation uses LE format even on BE machines, but returns the + * result in host endian format. Hence we need to byte swap it back to LE format + * so that it is consistent on disk. + */ +static inline __le32 +xfs_end_cksum(__uint32_t crc) +{ + return ~cpu_to_le32(crc); +} + +/* + * Helper to generate the checksum for a buffer. + */ +static inline void +xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + + *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); +} + +/* + * Helper to verify the checksum for a buffer. + */ +static inline int +xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + + return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); +} + +#endif /* _XFS_CKSUM_H */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 0a134ca5211c..fe7e4df85a7b 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index f429d9d5d325..a05b45175fb0 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -81,6 +81,7 @@ struct xfs_mount; #define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ #define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ #define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ +#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ #define XFS_SB_VERSION2_OKREALFBITS \ (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ @@ -503,6 +504,12 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); } +static inline int 
xfs_sb_version_hascrc(xfs_sb_t *sbp) +{ + return (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT)); +} + /* * end of superblock version macros */ From 0e446be44806240c779666591bb9e8cb0e86a50d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Nov 2012 22:54:24 +1100 Subject: [PATCH 72/78] xfs: add CRC checks to the log Implement CRCs for the log buffers. We re-use a field in struct xlog_rec_header that was used for a weak checksum of the log buffer payload in debug builds before. The new checksumming uses the crc32c checksum we will use elsewhere in XFS, and also protects the record header and additional cycle data. Due to this there are some interesting changes in xlog_sync, as we need to do the cycle wrapping for the split buffer case much earlier, as we would touch the buffer after generating the checksum otherwise. The CRC calculation is always enabled, even for non-CRC filesystems, as adding this CRC does not change the log format. On non-CRC filesystems, only issue an alert if a CRC mismatch is found and allow recovery to continue - this will act as an indicator that log recovery problems are a result of log corruption. On CRC enabled filesystems, however, log recovery will fail. Note that existing debug kernels will write a simple checksum value to the log, so the first time this is run on a filesystem that was last used on a debug kernel it will throw CRC mismatch warning errors. These can be ignored. Initially based on a patch from Dave Chinner, then modified significantly by Christoph Hellwig. Modified again by Dave Chinner to get to this version.
Signed-off-by: Christoph Hellwig Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 132 ++++++++++++++++++++++++++++++++------- fs/xfs/xfs_log_priv.h | 11 ++-- fs/xfs/xfs_log_recover.c | 130 ++++++++++++++++++-------------------- 3 files changed, 175 insertions(+), 98 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 1d6d2ee08495..c6d6e136ba77 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -35,6 +35,7 @@ #include "xfs_inode.h" #include "xfs_trace.h" #include "xfs_fsops.h" +#include "xfs_cksum.h" kmem_zone_t *xfs_log_ticket_zone; @@ -1489,6 +1490,84 @@ xlog_grant_push_ail( xfs_ail_push(log->l_ailp, threshold_lsn); } +/* + * Stamp cycle number in every block + */ +STATIC void +xlog_pack_data( + struct xlog *log, + struct xlog_in_core *iclog, + int roundoff) +{ + int i, j, k; + int size = iclog->ic_offset + roundoff; + __be32 cycle_lsn; + xfs_caddr_t dp; + + cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); + + dp = iclog->ic_datap; + for (i = 0; i < BTOBB(size); i++) { + if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) + break; + iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = iclog->ic_data; + + for ( ; i < BTOBB(size); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + for (i = 1; i < log->l_iclog_heads; i++) + xhdr[i].hic_xheader.xh_cycle = cycle_lsn; + } +} + +/* + * Calculate the checksum for a log buffer. + * + * This is a little more complicated than it should be because the various + * headers and the actual data are non-contiguous. 
+ */ +__be32 +xlog_cksum( + struct xlog *log, + struct xlog_rec_header *rhead, + char *dp, + int size) +{ + __uint32_t crc; + + /* first generate the crc for the record header ... */ + crc = xfs_start_cksum((char *)rhead, + sizeof(struct xlog_rec_header), + offsetof(struct xlog_rec_header, h_crc)); + + /* ... then for additional cycle data for v2 logs ... */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; + int i; + + for (i = 1; i < log->l_iclog_heads; i++) { + crc = crc32c(crc, &xhdr[i].hic_xheader, + sizeof(struct xlog_rec_ext_header)); + } + } + + /* ... and finally for the payload */ + crc = crc32c(crc, dp, size); + + return xfs_end_cksum(crc); +} + /* * The bdstrat callback function for log bufs. This gives us a central * place to trap bufs in case we get hit by a log I/O error and need to @@ -1549,7 +1628,6 @@ xlog_sync( struct xlog *log, struct xlog_in_core *iclog) { - xfs_caddr_t dptr; /* pointer to byte sized element */ xfs_buf_t *bp; int i; uint count; /* byte count of bwrite */ @@ -1558,6 +1636,7 @@ xlog_sync( int split = 0; /* split write into two regions */ int error; int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); + int size; XFS_STATS_INC(xs_log_writes); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); @@ -1588,13 +1667,10 @@ xlog_sync( xlog_pack_data(log, iclog, roundoff); /* real byte length */ - if (v2) { - iclog->ic_header.h_len = - cpu_to_be32(iclog->ic_offset + roundoff); - } else { - iclog->ic_header.h_len = - cpu_to_be32(iclog->ic_offset); - } + size = iclog->ic_offset; + if (v2) + size += roundoff; + iclog->ic_header.h_len = cpu_to_be32(size); bp = iclog->ic_bp; XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); @@ -1603,12 +1679,36 @@ xlog_sync( /* Do we need to split this write into 2 parts? 
*/ if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { + char *dptr; + split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); - iclog->ic_bwritecnt = 2; /* split into 2 writes */ + iclog->ic_bwritecnt = 2; + + /* + * Bump the cycle numbers at the start of each block in the + * part of the iclog that ends up in the buffer that gets + * written to the start of the log. + * + * Watch out for the header magic number case, though. + */ + dptr = (char *)&iclog->ic_header + count; + for (i = 0; i < split; i += BBSIZE) { + __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); + if (++cycle == XLOG_HEADER_MAGIC_NUM) + cycle++; + *(__be32 *)dptr = cpu_to_be32(cycle); + + dptr += BBSIZE; + } } else { iclog->ic_bwritecnt = 1; } + + /* calculcate the checksum */ + iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, + iclog->ic_datap, size); + bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); @@ -1662,19 +1762,6 @@ xlog_sync( bp->b_flags |= XBF_SYNCIO; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) bp->b_flags |= XBF_FUA; - dptr = bp->b_addr; - /* - * Bump the cycle numbers at the start of each block - * since this part of the buffer is at the start of - * a new cycle. Watch out for the header magic number - * case, though. 
- */ - for (i = 0; i < split; i += BBSIZE) { - be32_add_cpu((__be32 *)dptr, 1); - if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM) - be32_add_cpu((__be32 *)dptr, 1); - dptr += BBSIZE; - } ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); @@ -1691,7 +1778,6 @@ xlog_sync( return 0; } /* xlog_sync */ - /* * Deallocate a log structure */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 9a4e0e5ec322..dc3498bf17c2 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -139,7 +139,6 @@ static inline uint xlog_get_client_id(__be32 i) /* * Flags for log structure */ -#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */ #define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being @@ -291,7 +290,7 @@ typedef struct xlog_rec_header { __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ __be64 h_lsn; /* lsn of this LR : 8 */ __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ - __be32 h_chksum; /* may not be used; non-zero if used : 4 */ + __le32 h_crc; /* crc of log record : 4 */ __be32 h_prev_block; /* block number to previous LR : 4 */ __be32 h_num_logops; /* number of log operations in this LR : 4 */ __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; @@ -555,11 +554,9 @@ xlog_recover( extern int xlog_recover_finish( struct xlog *log); -extern void -xlog_pack_data( - struct xlog *log, - struct xlog_in_core *iclog, - int); + +extern __be32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, + char *dp, int size); extern kmem_zone_t *xfs_log_ticket_zone; struct xlog_ticket * diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 931e8e23f192..9c3651c9e75b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -41,6 +41,7 @@ #include "xfs_trans_priv.h" #include "xfs_quota.h" #include 
"xfs_utils.h" +#include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -3216,80 +3217,58 @@ xlog_recover_process_iunlinks( mp->m_dmevmask = mp_dmevmask; } - -#ifdef DEBUG -STATIC void -xlog_pack_data_checksum( - struct xlog *log, - struct xlog_in_core *iclog, - int size) -{ - int i; - __be32 *up; - uint chksum = 0; - - up = (__be32 *)iclog->ic_datap; - /* divide length by 4 to get # words */ - for (i = 0; i < (size >> 2); i++) { - chksum ^= be32_to_cpu(*up); - up++; - } - iclog->ic_header.h_chksum = cpu_to_be32(chksum); -} -#else -#define xlog_pack_data_checksum(log, iclog, size) -#endif - /* - * Stamp cycle number in every block + * Upack the log buffer data and crc check it. If the check fails, issue a + * warning if and only if the CRC in the header is non-zero. This makes the + * check an advisory warning, and the zero CRC check will prevent failure + * warnings from being emitted when upgrading the kernel from one that does not + * add CRCs by default. + * + * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log + * corruption failure */ -void -xlog_pack_data( - struct xlog *log, - struct xlog_in_core *iclog, - int roundoff) +STATIC int +xlog_unpack_data_crc( + struct xlog_rec_header *rhead, + xfs_caddr_t dp, + struct xlog *log) { - int i, j, k; - int size = iclog->ic_offset + roundoff; - __be32 cycle_lsn; - xfs_caddr_t dp; + __be32 crc; - xlog_pack_data_checksum(log, iclog, size); - - cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); - - dp = iclog->ic_datap; - for (i = 0; i < BTOBB(size) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; - } - - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - xlog_in_core_2_t *xhdr = iclog->ic_data; - - for ( ; i < BTOBB(size); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; - *(__be32 
*)dp = cycle_lsn; - dp += BBSIZE; + crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + if (crc != rhead->h_crc) { + if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + xfs_alert(log->l_mp, + "log record CRC mismatch: found 0x%x, expected 0x%x.\n", + be32_to_cpu(rhead->h_crc), + be32_to_cpu(crc)); + xfs_hex_dump(dp, 32); } - for (i = 1; i < log->l_iclog_heads; i++) { - xhdr[i].hic_xheader.xh_cycle = cycle_lsn; - } + /* + * If we've detected a log record corruption, then we can't + * recover past this point. Abort recovery if we are enforcing + * CRC protection by punting an error back up the stack. + */ + if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) + return EFSCORRUPTED; } + + return 0; } -STATIC void +STATIC int xlog_unpack_data( struct xlog_rec_header *rhead, xfs_caddr_t dp, struct xlog *log) { int i, j, k; + int error; + + error = xlog_unpack_data_crc(rhead, dp, log); + if (error) + return error; for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { @@ -3306,6 +3285,8 @@ xlog_unpack_data( dp += BBSIZE; } } + + return 0; } STATIC int @@ -3437,9 +3418,13 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, - rhash, rhead, offset, pass))) + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, + rhash, rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } @@ -3549,9 +3534,14 @@ xlog_do_recovery_pass( if (error) goto bread_err2; } - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks; } @@ -3576,9 +3566,13 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - 
xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } From 0acba3cd018c51e529a6f008c092629ad2e9e0a4 Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Mon, 26 Nov 2012 15:25:05 +0900 Subject: [PATCH 73/78] xfs: Remove the description of nodelaylog mount option from xfs.txt nodelaylog mount option is removed by commit 93b8a585. But there still be the description about it in the xfs document. This patch removes it. Signed-off-by: Satoru Takeuchi Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- Documentation/filesystems/xfs.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt index 17187750270a..3e4b3dd1e046 100644 --- a/Documentation/filesystems/xfs.txt +++ b/Documentation/filesystems/xfs.txt @@ -43,7 +43,7 @@ When mounting an XFS filesystem, the following options are accepted. Issue command to let the block device reclaim space freed by the filesystem. This is useful for SSD devices, thinly provisioned LUNs and virtual machine images, but may have a performance - impact. This option is incompatible with the nodelaylog option. + impact. dmapi Enable the DMAPI (Data Management API) event callouts. From 7c4cebe8e02dd0b0e655605442bbe9268db9ed4f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 23 Nov 2012 14:24:23 +1100 Subject: [PATCH 74/78] xfs: inode allocation should use unmapped buffers. Inode buffers do not need to be mapped as inodes are read or written directly from/to the pages underlying the buffer. This fixes a regression introduced by commit 611c994 ("xfs: make XBF_MAPPED the default behaviour"). 
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_ialloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 2d6495eaaa34..a815412eab80 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -200,7 +200,8 @@ xfs_ialloc_inode_init( */ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize * blks_per_cluster, 0); + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); if (!fbuf) return ENOMEM; /* From ef9d873344ff9f5084eacb9f3735982314dfda9e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 29 Nov 2012 15:26:33 +1100 Subject: [PATCH 75/78] xfs: byte range granularity for XFS_IOC_ZERO_RANGE XFS_IOC_ZERO_RANGE simply does not work properly for non page cache aligned ranges. Neither test 242 or 290 exercise this correctly, so the behaviour is completely busted even though the tests pass. Fix it to support full byte range granularity as was originally intended for this ioctl. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_vnodeops.c | 96 +++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_vnodeops.h | 1 + 3 files changed, 77 insertions(+), 22 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 400b187595bb..67284edb84d7 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -86,7 +86,7 @@ xfs_rw_ilock_demote( * valid before the operation, it will be read from disk before * being partially zeroed. 
*/ -STATIC int +int xfs_iozero( struct xfs_inode *ip, /* inode */ loff_t pos, /* offset in file */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 26880793feca..d95f565a390e 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2095,6 +2095,73 @@ xfs_free_file_space( return error; } + +STATIC int +xfs_zero_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len, + int attr_flags) +{ + struct xfs_mount *mp = ip->i_mount; + uint granularity; + xfs_off_t start_boundary; + xfs_off_t end_boundary; + int error; + + granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + + /* + * Round the range of extents we are going to convert inwards. If the + * offset is aligned, then it doesn't get changed so we zero from the + * start of the block offset points to. + */ + start_boundary = round_up(offset, granularity); + end_boundary = round_down(offset + len, granularity); + + ASSERT(start_boundary >= offset); + ASSERT(end_boundary <= offset + len); + + if (!(attr_flags & XFS_ATTR_NOLOCK)) + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + if (start_boundary < end_boundary - 1) { + /* punch out the page cache over the conversion range */ + truncate_pagecache_range(VFS_I(ip), start_boundary, + end_boundary - 1); + /* convert the blocks */ + error = xfs_alloc_file_space(ip, start_boundary, + end_boundary - start_boundary - 1, + XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT, + attr_flags); + if (error) + goto out_unlock; + + /* We've handled the interior of the range, now for the edges */ + if (start_boundary != offset) + error = xfs_iozero(ip, offset, start_boundary - offset); + if (error) + goto out_unlock; + + if (end_boundary != offset + len) + error = xfs_iozero(ip, end_boundary, + offset + len - end_boundary); + + } else { + /* + * It's either a sub-granularity range or the range spanned lies + * partially across two adjacent blocks. 
+ */ + error = xfs_iozero(ip, offset, len); + } + +out_unlock: + if (!(attr_flags & XFS_ATTR_NOLOCK)) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + +} + /* * xfs_change_file_space() * This routine allocates or frees disk space for the given file. @@ -2120,10 +2187,8 @@ xfs_change_file_space( xfs_fsize_t fsize; int setprealloc; xfs_off_t startoffset; - xfs_off_t end; xfs_trans_t *tp; struct iattr iattr; - int prealloc_type; if (!S_ISREG(ip->i_d.di_mode)) return XFS_ERROR(EINVAL); @@ -2172,31 +2237,20 @@ xfs_change_file_space( startoffset = bf->l_start; fsize = XFS_ISIZE(ip); - /* - * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve - * file space. - * These calls do NOT zero the data space allocated to the file, - * nor do they change the file size. - * - * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file - * space. - * These calls cause the new file data to be zeroed and the file - * size to be changed. - */ setprealloc = clrprealloc = 0; - prealloc_type = XFS_BMAPI_PREALLOC; - switch (cmd) { case XFS_IOC_ZERO_RANGE: - prealloc_type |= XFS_BMAPI_CONVERT; - end = round_down(startoffset + bf->l_len, PAGE_SIZE) - 1; - if (startoffset <= end) - truncate_pagecache_range(VFS_I(ip), startoffset, end); - /* FALLTHRU */ + error = xfs_zero_file_space(ip, startoffset, bf->l_len, + attr_flags); + if (error) + return error; + setprealloc = 1; + break; + case XFS_IOC_RESVSP: case XFS_IOC_RESVSP64: error = xfs_alloc_file_space(ip, startoffset, bf->l_len, - prealloc_type, attr_flags); + XFS_BMAPI_PREALLOC, attr_flags); if (error) return error; setprealloc = 1; diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 91a03fa3814f..5163022d9808 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -49,6 +49,7 @@ int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); +int xfs_iozero(struct 
xfs_inode *, loff_t, size_t); int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); From 437a255aa23766666aec78af63be4c253faa8d57 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Nov 2012 13:01:00 +1100 Subject: [PATCH 76/78] xfs: fix direct IO nested transaction deadlock. The direct IO path can do a nested transaction reservation when writing past the EOF. The first transaction is the append transaction for setting the filesize at IO completion, but we can also need a transaction for allocation of blocks. If the log is low on space due to reservations and a small log, the append transaction can be granted after waiting for space as the only active transaction in the system. This then attempts a reservation for an allocation, which there isn't space in the log for, and the reservation sleeps. The result is that there is nothing left in the system to wake up all the processes waiting for log space to come free. The stack trace that shows this deadlock is relatively innocuous: xlog_grant_head_wait xlog_grant_head_check xfs_log_reserve xfs_trans_reserve xfs_iomap_write_direct __xfs_get_blocks xfs_get_blocks_direct do_blockdev_direct_IO __blockdev_direct_IO xfs_vm_direct_IO generic_file_direct_write xfs_file_dio_aio_write xfs_file_aio_write do_sync_write vfs_write This was discovered on a filesystem with a log of only 10MB, and a log stripe unit of 256k which increased the base reservations by 512k. Hence an allocation transaction requires 1.2MB of log space to be available instead of only 260k, and so greatly increased the chance that there wouldn't be enough log space available for the nested transaction to succeed. The key to reproducing it is this mkfs command: mkfs.xfs -f -d agcount=16,su=256k,sw=12 -l su=256k,size=2560b $SCRATCH_DEV The test case was 1000 fsstress processes running with random freezes and unfreezes every few seconds.
Thanks to Eryu Guan (eguan@redhat.com) for writing the test that found this on a system with a somewhat unique default configuration.... cc: Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Andrew Dahl Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 83 +++++++++++++++++------------------------------ fs/xfs/xfs_log.c | 3 +- 2 files changed, 32 insertions(+), 54 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 71361da1f77c..4111a40ebe1a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc( ioend->io_append_trans = tp; /* - * We will pass freeze protection with a transaction. So tell lockdep + * We may pass freeze protection with a transaction. So tell lockdep * we released it. */ rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], @@ -149,11 +149,13 @@ xfs_setfilesize( xfs_fsize_t isize; /* - * The transaction was allocated in the I/O submission thread, - * thus we need to mark ourselves as beeing in a transaction - * manually. + * The transaction may have been allocated in the I/O submission thread, + * thus we need to mark ourselves as beeing in a transaction manually. + * Similarly for freeze protection. 
*/ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); xfs_ilock(ip, XFS_ILOCK_EXCL); isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); @@ -187,7 +189,8 @@ xfs_finish_ioend( if (ioend->io_type == XFS_IO_UNWRITTEN) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); - else if (ioend->io_append_trans) + else if (ioend->io_append_trans || + (ioend->io_isdirect && xfs_ioend_is_append(ioend))) queue_work(mp->m_data_workqueue, &ioend->io_work); else xfs_destroy_ioend(ioend); @@ -205,15 +208,6 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; - if (ioend->io_append_trans) { - /* - * We've got freeze protection passed with the transaction. - * Tell lockdep about it. - */ - rwsem_acquire_read( - &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); - } if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { ioend->io_error = -EIO; goto done; @@ -226,35 +220,31 @@ xfs_end_io( * range to normal written extens after the data I/O has finished. */ if (ioend->io_type == XFS_IO_UNWRITTEN) { - /* - * For buffered I/O we never preallocate a transaction when - * doing the unwritten extent conversion, but for direct I/O - * we do not know if we are converting an unwritten extent - * or not at the point where we preallocate the transaction. - */ - if (ioend->io_append_trans) { - ASSERT(ioend->io_isdirect); - - current_set_flags_nested( - &ioend->io_append_trans->t_pflags, PF_FSTRANS); - xfs_trans_cancel(ioend->io_append_trans, 0); - } - error = xfs_iomap_write_unwritten(ip, ioend->io_offset, - ioend->io_size); - if (error) { - ioend->io_error = -error; + ioend->io_size); + } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) { + /* + * For direct I/O we do not know if we need to allocate blocks + * or not so we can't preallocate an append transaction as that + * results in nested reservations and log space deadlocks. 
Hence + * allocate the transaction here. While this is sub-optimal and + * can block IO completion for some time, we're stuck with doing + * it this way until we can pass the ioend to the direct IO + * allocation callbacks and avoid nesting that way. + */ + error = xfs_setfilesize_trans_alloc(ioend); + if (error) goto done; - } + error = xfs_setfilesize(ioend); } else if (ioend->io_append_trans) { error = xfs_setfilesize(ioend); - if (error) - ioend->io_error = -error; } else { ASSERT(!xfs_ioend_is_append(ioend)); } done: + if (error) + ioend->io_error = -error; xfs_destroy_ioend(ioend); } @@ -1432,25 +1422,21 @@ xfs_vm_direct_IO( size_t size = iov_length(iov, nr_segs); /* - * We need to preallocate a transaction for a size update - * here. In the case that this write both updates the size - * and converts at least on unwritten extent we will cancel - * the still clean transaction after the I/O has finished. + * We cannot preallocate a size update transaction here as we + * don't know whether allocation is necessary or not. Hence we + * can only tell IO completion that one is necessary if we are + * not doing unwritten extent conversion. 
*/ iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); - if (offset + size > XFS_I(inode)->i_d.di_size) { - ret = xfs_setfilesize_trans_alloc(ioend); - if (ret) - goto out_destroy_ioend; + if (offset + size > XFS_I(inode)->i_d.di_size) ioend->io_isdirect = 1; - } ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, xfs_get_blocks_direct, xfs_end_io_direct_write, NULL, 0); if (ret != -EIOCBQUEUED && iocb->private) - goto out_trans_cancel; + goto out_destroy_ioend; } else { ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, @@ -1460,15 +1446,6 @@ xfs_vm_direct_IO( return ret; -out_trans_cancel: - if (ioend->io_append_trans) { - current_set_flags_nested(&ioend->io_append_trans->t_pflags, - PF_FSTRANS); - rwsem_acquire_read( - &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); - xfs_trans_cancel(ioend->io_append_trans, 0); - } out_destroy_ioend: xfs_destroy_ioend(ioend); return ret; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c6d6e136ba77..c49e2c12dba4 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -460,7 +460,8 @@ xfs_log_reserve( tic->t_trans_type = t_type; *ticp = tic; - xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt); + xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt + : tic->t_unit_res); trace_xfs_log_reserve(log, tic); From b870553cdecb26d5291af09602352b763e323df2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Nov 2012 13:01:02 +1100 Subject: [PATCH 77/78] xfs: fix stray dquot unlock when reclaiming dquots When we fail to get a dquot lock during reclaim, we jump to an error handler that unlocks the dquot. This is wrong as we didn't lock the dquot, and unlocking it means who-ever is holding the lock has had it silently taken away, and hence it results in a lock imbalance. Found by inspection while modifying the code for the numa-lru patchset. This fixes a random hang I've been seeing on xfstest 232 for the past several months. 
cc: Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_qm.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index e6a0af0ba007..60eff4763156 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1456,7 +1456,7 @@ xfs_qm_dqreclaim_one( int error; if (!xfs_dqlock_nowait(dqp)) - goto out_busy; + goto out_move_tail; /* * This dquot has acquired a reference in the meantime remove it from @@ -1479,7 +1479,7 @@ xfs_qm_dqreclaim_one( * getting flushed to disk, we don't want to reclaim it. */ if (!xfs_dqflock_nowait(dqp)) - goto out_busy; + goto out_unlock_move_tail; if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; @@ -1490,7 +1490,7 @@ xfs_qm_dqreclaim_one( if (error) { xfs_warn(mp, "%s: dquot %p flush failed", __func__, dqp); - goto out_busy; + goto out_unlock_move_tail; } xfs_buf_delwri_queue(bp, buffer_list); @@ -1499,7 +1499,7 @@ xfs_qm_dqreclaim_one( * Give the dquot another try on the freelist, as the * flushing will take some time. */ - goto out_busy; + goto out_unlock_move_tail; } xfs_dqfunlock(dqp); @@ -1518,14 +1518,13 @@ xfs_qm_dqreclaim_one( XFS_STATS_INC(xs_qm_dqreclaims); return; -out_busy: - xfs_dqunlock(dqp); - /* * Move the dquot to the tail of the list so that we don't spin on it. */ +out_unlock_move_tail: + xfs_dqunlock(dqp); +out_move_tail: list_move_tail(&dqp->q_lru, &qi->qi_lru_list); - trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(xs_qm_dqreclaim_misses); } From f9668a09e32ac6d2aa22f44cc310e430a8f4a40f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 28 Nov 2012 13:01:03 +1100 Subject: [PATCH 78/78] xfs: fix sparse reported log CRC endian issue Not a bug as such, just warning noise from the xlog_cksum() returning a __be32 type when it should be returning a __le32 type. 
On Wed, Nov 28, 2012 at 08:30:59AM -0500, Christoph Hellwig wrote: > But why are we storing the crc field little endian while all other on > disk formats are big endian? (And yes I realize it might as well have > been me who did that back in the idea, but I still have no idea why) Because the CRC always returns the calculation in LE format, even on BE systems. So rather than always having to byte swap it everywhere and have all the force casts and annotations for sparse, it seems simpler to just make it a __le32 everywhere.... Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log_priv.h | 2 +- fs/xfs/xfs_log_recover.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c49e2c12dba4..46bd9d52ab51 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1538,7 +1538,7 @@ xlog_pack_data( * This is a little more complicated than it should be because the various * headers and the actual data are non-contiguous.
*/ -__be32 +__le32 xlog_cksum( struct xlog *log, struct xlog_rec_header *rhead, diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index dc3498bf17c2..16d8d12ea3b4 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -555,7 +555,7 @@ extern int xlog_recover_finish( struct xlog *log); -extern __be32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, +extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, char *dp, int size); extern kmem_zone_t *xfs_log_ticket_zone; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9c3651c9e75b..96fcbb85ff83 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3233,15 +3233,15 @@ xlog_unpack_data_crc( xfs_caddr_t dp, struct xlog *log) { - __be32 crc; + __le32 crc; crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); if (crc != rhead->h_crc) { if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.\n", - be32_to_cpu(rhead->h_crc), - be32_to_cpu(crc)); + le32_to_cpu(rhead->h_crc), + le32_to_cpu(crc)); xfs_hex_dump(dp, 32); }