Merge tag 'xfs-4.11-merge-7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs updates from Darrick Wong:
 "Here are the XFS changes for 4.11. We aren't introducing any major
  features in this release cycle except for this being the first merge
  window I've managed on my own. :)

  Changes since last update:

   - Various cleanups

   - Livelock fixes for eofblocks scanning

   - Improved input verification for on-disk metadata

   - Fix races in the copy on write remap mechanism

   - Fix buffer io error timeout controls

   - Streamlining of directio copy on write

   - Asynchronous discard support

   - Fix asserts when splitting delalloc reservations

   - Don't bloat bmbt when right shifting extents

   - Inode alignment fixes for 32k block sizes"
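The delalloc reservation split fix above lends itself to a small worked
example. The sketch below mirrors the proportional-split arithmetic this
merge adds to xfs_bmap_split_indlen() (the resfactor/do_div hunk in the
bmap diff further down), rewritten as a standalone user-space program
with made-up block counts: plain uint64_t stands in for xfs_filblks_t,
ordinary division for do_div(), and the block-stealing and zero-side
special cases of the kernel version are omitted.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of the proportional indlen split: when the old reservation
 * (ores) cannot cover the worst-case needs of both new extents
 * (*len1 + *len2), scale each request by ores/nres instead of
 * draining one side to zero.
 */
static void split_indlen(uint64_t *len1, uint64_t *len2, uint64_t ores)
{
        uint64_t nres = *len1 + *len2;  /* combined worst-case need */

        if (ores >= nres)
                return;                 /* fully covered, nothing to trim */

        /* Scale both requests by the same percentage of what we have. */
        uint64_t resfactor = (ores * 100) / nres;
        uint64_t l1 = (*len1 * resfactor) / 100;
        uint64_t l2 = (*len2 * resfactor) / 100;
        assert(l1 + l2 <= ores);

        /* Hand the rounding remainder back out one block at a time. */
        ores -= (l1 + l2);
        while (ores) {
                if (l1 < *len1) { l1++; ores--; }
                if (!ores)
                        break;
                if (l2 < *len2) { l2++; ores--; }
        }
        *len1 = l1;
        *len2 = l2;
}

int main(void)
{
        uint64_t len1 = 7, len2 = 3;    /* worst-case indlen of each half */

        split_indlen(&len1, &len2, 5);  /* only 5 blocks actually reserved */
        printf("len1=%llu len2=%llu\n",
               (unsigned long long)len1, (unsigned long long)len2);
        /* Prints len1=4 len2=1: both halves keep a share of the shortage. */
        return 0;
}

The old code skimmed blocks off one reservation first, so repeated
splits could leave one side empty and trip the asserts this merge fixes;
scaling both sides by the same factor avoids that.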

* tag 'xfs-4.11-merge-7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (39 commits)
  xfs: remove XFS_ALLOCTYPE_ANY_AG and XFS_ALLOCTYPE_START_AG
  xfs: simplify xfs_rtallocate_extent
  xfs: tune down agno asserts in the bmap code
  xfs: Use xfs_icluster_size_fsb() to calculate inode chunk alignment
  xfs: don't reserve blocks for right shift transactions
  xfs: fix len comparison in xfs_extent_busy_trim
  xfs: fix uninitialized variable in _reflink_convert_cow
  xfs: split indlen reservations fairly when under reserved
  xfs: handle indlen shortage on delalloc extent merge
  xfs: resurrect debug mode drop buffered writes mechanism
  xfs: clear delalloc and cache on buffered write failure
  xfs: don't block the log commit handler for discards
  xfs: improve busy extent sorting
  xfs: improve handling of busy extents in the low-level allocator
  xfs: don't fail xfs_extent_busy allocation
  xfs: correct null checks and error processing in xfs_initialize_perag
  xfs: update ctime and mtime on clone destinatation inodes
  xfs: allocate direct I/O COW blocks in iomap_begin
  xfs: go straight to real allocations for direct I/O COW writes
  xfs: return the converted extent in __xfs_reflink_convert_cow
  ...
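Much of the non-XFS churn in the diff below is mechanical: every iomap
entry point now takes const struct iomap_ops *, so filesystems can keep
their ops tables in read-only data. A minimal sketch of the pattern,
with a simplified stand-in for the kernel's struct iomap_ops and
hypothetical myfs_* callbacks (the real definitions live in
include/linux/iomap.h):

/* Simplified stand-in for struct iomap_ops; fields elided. */
struct iomap_ops {
        int (*iomap_begin)(void *inode, long long pos, long long length);
        int (*iomap_end)(void *inode, long long pos, long long length);
};

static int myfs_iomap_begin(void *inode, long long pos, long long length)
{
        return 0;       /* look up or allocate the mapping; elided */
}

static int myfs_iomap_end(void *inode, long long pos, long long length)
{
        return 0;       /* release resources; elided */
}

/* const: never written at runtime, so the table can live in .rodata. */
static const struct iomap_ops myfs_iomap_ops = {
        .iomap_begin    = myfs_iomap_begin,
        .iomap_end      = myfs_iomap_end,
};

/* Helpers now accept the table as a const pointer, as in the diff. */
static int apply_sketch(void *inode, long long pos, long long length,
                const struct iomap_ops *ops)
{
        int ret = ops->iomap_begin(inode, pos, length);

        if (ret)
                return ret;
        /* ... the actor would run against the mapping here ... */
        return ops->iomap_end(inode, pos, length);
}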
Linus Torvalds, 2017-02-22 18:05:23 -08:00
commit a27fcb0cd1
51 changed files with 928 additions and 646 deletions

fs/dax.c

@@ -1079,7 +1079,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  */
 ssize_t
 dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
-                struct iomap_ops *ops)
+                const struct iomap_ops *ops)
 {
         struct address_space *mapping = iocb->ki_filp->f_mapping;
         struct inode *inode = mapping->host;
@@ -1127,7 +1127,7 @@ static int dax_fault_return(int error)
  * necessary locking for the page fault to proceed successfully.
  */
 int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-                struct iomap_ops *ops)
+                const struct iomap_ops *ops)
 {
         struct address_space *mapping = vma->vm_file->f_mapping;
         struct inode *inode = mapping->host;
@@ -1326,7 +1326,7 @@ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
 }

 int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-                pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
+                pmd_t *pmd, unsigned int flags, const struct iomap_ops *ops)
 {
         struct address_space *mapping = vma->vm_file->f_mapping;
         unsigned long pmd_addr = address & PMD_MASK;

fs/ext2/ext2.h

@@ -814,7 +814,7 @@ extern const struct file_operations ext2_file_operations;
 /* inode.c */
 extern const struct address_space_operations ext2_aops;
 extern const struct address_space_operations ext2_nobh_aops;
-extern struct iomap_ops ext2_iomap_ops;
+extern const struct iomap_ops ext2_iomap_ops;

 /* namei.c */
 extern const struct inode_operations ext2_dir_inode_operations;

fs/ext2/inode.c

@@ -842,13 +842,13 @@ ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
         return 0;
 }

-struct iomap_ops ext2_iomap_ops = {
+const struct iomap_ops ext2_iomap_ops = {
         .iomap_begin            = ext2_iomap_begin,
         .iomap_end              = ext2_iomap_end,
 };
 #else
 /* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */
-struct iomap_ops ext2_iomap_ops;
+const struct iomap_ops ext2_iomap_ops;
 #endif /* CONFIG_FS_DAX */

 int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,

fs/ext4/ext4.h

@@ -3244,7 +3244,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
         }
 }

-extern struct iomap_ops ext4_iomap_ops;
+extern const struct iomap_ops ext4_iomap_ops;

 #endif  /* __KERNEL__ */

fs/ext4/inode.c

@@ -3450,7 +3450,7 @@ orphan_del:
         return ret;
 }

-struct iomap_ops ext4_iomap_ops = {
+const struct iomap_ops ext4_iomap_ops = {
         .iomap_begin            = ext4_iomap_begin,
         .iomap_end              = ext4_iomap_end,
 };

fs/internal.h

@@ -182,7 +182,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
                 void *data, struct iomap *iomap);

 loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
-                unsigned flags, struct iomap_ops *ops, void *data,
+                unsigned flags, const struct iomap_ops *ops, void *data,
                 iomap_actor_t actor);

 /* direct-io.c: */

fs/iomap.c

@@ -41,7 +41,7 @@
  */
 loff_t
 iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
-                struct iomap_ops *ops, void *data, iomap_actor_t actor)
+                const struct iomap_ops *ops, void *data, iomap_actor_t actor)
 {
         struct iomap iomap = { 0 };
         loff_t written = 0, ret;
@@ -235,7 +235,7 @@ again:
 ssize_t
 iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
-                struct iomap_ops *ops)
+                const struct iomap_ops *ops)
 {
         struct inode *inode = iocb->ki_filp->f_mapping->host;
         loff_t pos = iocb->ki_pos, ret = 0, written = 0;
@@ -318,7 +318,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 int
 iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
-                struct iomap_ops *ops)
+                const struct iomap_ops *ops)
 {
         loff_t ret;
@@ -398,7 +398,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
 int
 iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
-                struct iomap_ops *ops)
+                const struct iomap_ops *ops)
 {
         loff_t ret;
@@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range);
 int
 iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
-                struct iomap_ops *ops)
+                const struct iomap_ops *ops)
 {
         unsigned blocksize = (1 << inode->i_blkbits);
         unsigned off = pos & (blocksize - 1);
@@ -446,7 +446,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
 }

 int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-                struct iomap_ops *ops)
+                const struct iomap_ops *ops)
 {
         struct page *page = vmf->page;
         struct inode *inode = file_inode(vma->vm_file);
@@ -545,7 +545,7 @@ iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 }

 int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
-                loff_t start, loff_t len, struct iomap_ops *ops)
+                loff_t start, loff_t len, const struct iomap_ops *ops)
 {
         struct fiemap_ctx ctx;
         loff_t ret;
@@ -839,8 +839,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 }

 ssize_t
-iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops,
-                iomap_dio_end_io_t end_io)
+iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+                const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
 {
         struct address_space *mapping = iocb->ki_filp->f_mapping;
         struct inode *inode = file_inode(iocb->ki_filp);

fs/xfs/libxfs/xfs_alloc.c

@@ -221,20 +221,22 @@ xfs_alloc_get_rec(
  * Compute aligned version of the found extent.
  * Takes alignment and min length into account.
  */
-STATIC void
+STATIC bool
 xfs_alloc_compute_aligned(
         xfs_alloc_arg_t *args,          /* allocation argument structure */
         xfs_agblock_t   foundbno,       /* starting block in found extent */
         xfs_extlen_t    foundlen,       /* length in found extent */
         xfs_agblock_t   *resbno,        /* result block number */
-        xfs_extlen_t    *reslen)        /* result length */
+        xfs_extlen_t    *reslen,        /* result length */
+        unsigned        *busy_gen)
 {
-        xfs_agblock_t   bno;
-        xfs_extlen_t    len;
+        xfs_agblock_t   bno = foundbno;
+        xfs_extlen_t    len = foundlen;
         xfs_extlen_t    diff;
+        bool            busy;

         /* Trim busy sections out of found extent */
-        xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
+        busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);

         /*
          * If we have a largish extent that happens to start before min_agbno,
@@ -259,6 +261,8 @@ xfs_alloc_compute_aligned(
                 *resbno = bno;
                 *reslen = len;
         }
+
+        return busy;
 }

 /*
@@ -737,10 +741,11 @@ xfs_alloc_ag_vextent_exact(
         int             error;
         xfs_agblock_t   fbno;   /* start block of found extent */
         xfs_extlen_t    flen;   /* length of found extent */
-        xfs_agblock_t   tbno;   /* start block of trimmed extent */
-        xfs_extlen_t    tlen;   /* length of trimmed extent */
-        xfs_agblock_t   tend;   /* end block of trimmed extent */
+        xfs_agblock_t   tbno;   /* start block of busy extent */
+        xfs_extlen_t    tlen;   /* length of busy extent */
+        xfs_agblock_t   tend;   /* end block of busy extent */
         int             i;      /* success/failure of operation */
+        unsigned        busy_gen;

         ASSERT(args->alignment == 1);
@@ -773,7 +778,9 @@ xfs_alloc_ag_vextent_exact(
         /*
          * Check for overlapping busy extents.
          */
-        xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
+        tbno = fbno;
+        tlen = flen;
+        xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);

         /*
          * Give up if the start of the extent is busy, or the freespace isn't
@@ -853,6 +860,7 @@ xfs_alloc_find_best_extent(
         xfs_agblock_t   sdiff;
         int             error;
         int             i;
+        unsigned        busy_gen;

         /* The good extent is perfect, no need to search. */
         if (!gdiff)
@@ -866,7 +874,8 @@ xfs_alloc_find_best_extent(
                 if (error)
                         goto error0;
                 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-                xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
+                xfs_alloc_compute_aligned(args, *sbno, *slen,
+                                sbnoa, slena, &busy_gen);

                 /*
                  * The good extent is closer than this one.
@@ -955,7 +964,8 @@ xfs_alloc_ag_vextent_near(
         xfs_extlen_t    ltlena;         /* aligned ... */
         xfs_agblock_t   ltnew;          /* useful start bno of left side */
         xfs_extlen_t    rlen;           /* length of returned extent */
-        int             forced = 0;
+        bool            busy;
+        unsigned        busy_gen;
 #ifdef DEBUG
         /*
          * Randomly don't execute the first algorithm.
@@ -982,6 +992,7 @@ restart:
         ltlen = 0;
         gtlena = 0;
         ltlena = 0;
+        busy = false;

         /*
          * Get a cursor for the by-size btree.
@@ -1064,8 +1075,8 @@ restart:
                         if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
                                 goto error0;
                         XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-                        xfs_alloc_compute_aligned(args, ltbno, ltlen,
-                                        &ltbnoa, &ltlena);
+                        busy = xfs_alloc_compute_aligned(args, ltbno, ltlen,
+                                        &ltbnoa, &ltlena, &busy_gen);
                         if (ltlena < args->minlen)
                                 continue;
                         if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno)
@@ -1183,8 +1194,8 @@ restart:
                         if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
                                 goto error0;
                         XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-                        xfs_alloc_compute_aligned(args, ltbno, ltlen,
-                                        &ltbnoa, &ltlena);
+                        busy |= xfs_alloc_compute_aligned(args, ltbno, ltlen,
+                                        &ltbnoa, &ltlena, &busy_gen);
                         if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
                                 break;
                         if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
@@ -1199,8 +1210,8 @@ restart:
                         if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
                                 goto error0;
                         XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-                        xfs_alloc_compute_aligned(args, gtbno, gtlen,
-                                        &gtbnoa, &gtlena);
+                        busy |= xfs_alloc_compute_aligned(args, gtbno, gtlen,
+                                        &gtbnoa, &gtlena, &busy_gen);
                         if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
                                 break;
                         if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
@@ -1261,9 +1272,9 @@ restart:
         if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
                 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);

-                if (!forced++) {
+                if (busy) {
                         trace_xfs_alloc_near_busy(args);
-                        xfs_log_force(args->mp, XFS_LOG_SYNC);
+                        xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
                         goto restart;
                 }
                 trace_xfs_alloc_size_neither(args);
@@ -1344,7 +1355,8 @@ xfs_alloc_ag_vextent_size(
         int             i;              /* temp status variable */
         xfs_agblock_t   rbno;           /* returned block number */
         xfs_extlen_t    rlen;           /* length of returned extent */
-        int             forced = 0;
+        bool            busy;
+        unsigned        busy_gen;

 restart:
         /*
@@ -1353,6 +1365,7 @@ restart:
         cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
                 args->agno, XFS_BTNUM_CNT);
         bno_cur = NULL;
+        busy = false;

         /*
          * Look for an entry >= maxlen+alignment-1 blocks.
@@ -1362,14 +1375,13 @@ restart:
                 goto error0;

         /*
-         * If none or we have busy extents that we cannot allocate from, then
-         * we have to settle for a smaller extent. In the case that there are
-         * no large extents, this will return the last entry in the tree unless
-         * the tree is empty. In the case that there are only busy large
-         * extents, this will return the largest small extent unless there
+         * If none then we have to settle for a smaller extent. In the case that
+         * there are no large extents, this will return the last entry in the
+         * tree unless the tree is empty. In the case that there are only busy
+         * large extents, this will return the largest small extent unless there
          * are no smaller extents available.
          */
-        if (!i || forced > 1) {
+        if (!i) {
                 error = xfs_alloc_ag_vextent_small(args, cnt_cur,
                                 &fbno, &flen, &i);
                 if (error)
@@ -1380,13 +1392,11 @@ restart:
                         return 0;
                 }
                 ASSERT(i == 1);
-                xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
+                busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
+                                &rlen, &busy_gen);
         } else {
                 /*
                  * Search for a non-busy extent that is large enough.
-                 * If we are at low space, don't check, or if we fall of
-                 * the end of the btree, turn off the busy check and
-                 * restart.
                  */
                 for (;;) {
                         error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
@@ -1394,8 +1404,8 @@ restart:
                                 goto error0;
                         XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);

-                        xfs_alloc_compute_aligned(args, fbno, flen,
-                                        &rbno, &rlen);
+                        busy = xfs_alloc_compute_aligned(args, fbno, flen,
+                                        &rbno, &rlen, &busy_gen);

                         if (rlen >= args->maxlen)
                                 break;
@@ -1407,18 +1417,13 @@ restart:
                         /*
                          * Our only valid extents must have been busy.
                          * Make it unbusy by forcing the log out and
-                         * retrying. If we've been here before, forcing
-                         * the log isn't making the extents available,
-                         * which means they have probably been freed in
-                         * this transaction. In that case, we have to
-                         * give up on them and we'll attempt a minlen
-                         * allocation the next time around.
+                         * retrying.
                          */
                         xfs_btree_del_cursor(cnt_cur,
                                         XFS_BTREE_NOERROR);
                         trace_xfs_alloc_size_busy(args);
-                        if (!forced++)
-                                xfs_log_force(args->mp, XFS_LOG_SYNC);
+                        xfs_extent_busy_flush(args->mp,
+                                        args->pag, busy_gen);
                         goto restart;
                 }
         }
@@ -1454,8 +1459,8 @@ restart:
                         XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
                         if (flen < bestrlen)
                                 break;
-                        xfs_alloc_compute_aligned(args, fbno, flen,
-                                        &rbno, &rlen);
+                        busy = xfs_alloc_compute_aligned(args, fbno, flen,
+                                        &rbno, &rlen, &busy_gen);
                         rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
                         XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
                                 (rlen <= flen && rbno + rlen <= fbno + flen),
@@ -1484,10 +1489,10 @@ restart:
          */
         args->len = rlen;
         if (rlen < args->minlen) {
-                if (!forced++) {
+                if (busy) {
                         xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
                         trace_xfs_alloc_size_busy(args);
-                        xfs_log_force(args->mp, XFS_LOG_SYNC);
+                        xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
                         goto restart;
                 }
                 goto out_nominleft;
@@ -2659,21 +2664,11 @@ xfs_alloc_vextent(
                 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
                 args->type = XFS_ALLOCTYPE_NEAR_BNO;
                 /* FALLTHROUGH */
-        case XFS_ALLOCTYPE_ANY_AG:
-        case XFS_ALLOCTYPE_START_AG:
         case XFS_ALLOCTYPE_FIRST_AG:
                 /*
                  * Rotate through the allocation groups looking for a winner.
                  */
-                if (type == XFS_ALLOCTYPE_ANY_AG) {
-                        /*
-                         * Start with the last place we left off.
-                         */
-                        args->agno = sagno = (mp->m_agfrotor / rotorstep) %
-                                        mp->m_sb.sb_agcount;
-                        args->type = XFS_ALLOCTYPE_THIS_AG;
-                        flags = XFS_ALLOC_FLAG_TRYLOCK;
-                } else if (type == XFS_ALLOCTYPE_FIRST_AG) {
+                if (type == XFS_ALLOCTYPE_FIRST_AG) {
                         /*
                          * Start with allocation group given by bno.
                          */
@@ -2682,8 +2677,6 @@
                         sagno = 0;
                         flags = 0;
                 } else {
-                        if (type == XFS_ALLOCTYPE_START_AG)
-                                args->type = XFS_ALLOCTYPE_THIS_AG;
                         /*
                          * Start with the given allocation group.
                          */
@@ -2751,7 +2744,7 @@
                 }
                 xfs_perag_put(args->pag);
         }
-        if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
+        if (bump_rotor) {
                 if (args->agno == sagno)
                         mp->m_agfrotor = (mp->m_agfrotor + 1) %
                                 (mp->m_sb.sb_agcount * rotorstep);

fs/xfs/libxfs/xfs_alloc.h

@@ -29,9 +29,7 @@ extern struct workqueue_struct *xfs_alloc_wq;
 /*
  * Freespace allocation types.  Argument to xfs_alloc_[v]extent.
  */
-#define XFS_ALLOCTYPE_ANY_AG    0x01    /* allocate anywhere, use rotor */
 #define XFS_ALLOCTYPE_FIRST_AG  0x02    /* ... start at ag 0 */
-#define XFS_ALLOCTYPE_START_AG  0x04    /* anywhere, start in this a.g. */
 #define XFS_ALLOCTYPE_THIS_AG   0x08    /* anywhere in this a.g. */
 #define XFS_ALLOCTYPE_START_BNO 0x10    /* near this block else anywhere */
 #define XFS_ALLOCTYPE_NEAR_BNO  0x20    /* in this a.g. and near this block */
@@ -41,9 +39,7 @@ extern struct workqueue_struct *xfs_alloc_wq;
 typedef unsigned int xfs_alloctype_t;

 #define XFS_ALLOC_TYPES \
-        { XFS_ALLOCTYPE_ANY_AG,         "ANY_AG" }, \
         { XFS_ALLOCTYPE_FIRST_AG,       "FIRST_AG" }, \
-        { XFS_ALLOCTYPE_START_AG,       "START_AG" }, \
         { XFS_ALLOCTYPE_THIS_AG,        "THIS_AG" }, \
         { XFS_ALLOCTYPE_START_BNO,      "START_BNO" }, \
         { XFS_ALLOCTYPE_NEAR_BNO,       "NEAR_BNO" }, \

fs/xfs/libxfs/xfs_bmap.c

@@ -740,15 +740,9 @@ xfs_bmap_extents_to_btree(
          * Fill in the root.
          */
         block = ifp->if_broot;
-        if (xfs_sb_version_hascrc(&mp->m_sb))
-                xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
-                                XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino,
-                                XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
-        else
-                xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
-                                XFS_BMAP_MAGIC, 1, 1, ip->i_ino,
-                                XFS_BTREE_LONG_PTRS);
+        xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+                        XFS_BTNUM_BMAP, 1, 1, ip->i_ino,
+                        XFS_BTREE_LONG_PTRS);

         /*
          * Need a cursor.  Can't allocate until bb_level is filled in.
          */
@@ -804,9 +798,7 @@ try_another_ag:
          */
         ASSERT(args.fsbno != NULLFSBLOCK);
         ASSERT(*firstblock == NULLFSBLOCK ||
-               args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
-               (dfops->dop_low &&
-                args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
+               args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock));
         *firstblock = cur->bc_private.b.firstblock = args.fsbno;
         cur->bc_private.b.allocated++;
         ip->i_d.di_nblocks++;
@@ -817,13 +809,8 @@ try_another_ag:
          */
         abp->b_ops = &xfs_bmbt_buf_ops;
         ablock = XFS_BUF_TO_BLOCK(abp);
-        if (xfs_sb_version_hascrc(&mp->m_sb))
-                xfs_btree_init_block_int(mp, ablock, abp->b_bn,
-                                XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
-                                XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
-        else
-                xfs_btree_init_block_int(mp, ablock, abp->b_bn,
-                                XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
-                                XFS_BTREE_LONG_PTRS);
+        xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+                        XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
+                        XFS_BTREE_LONG_PTRS);

         arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
@@ -1278,7 +1265,6 @@ xfs_bmap_read_extents(
         /* REFERENCED */
         xfs_extnum_t            room;   /* number of entries there's room for */

-        bno = NULLFSBLOCK;
         mp = ip->i_mount;
         ifp = XFS_IFORK_PTR(ip, whichfork);
         exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
@@ -1291,9 +1277,7 @@ xfs_bmap_read_extents(
         ASSERT(level > 0);
         pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
         bno = be64_to_cpu(*pp);
-        ASSERT(bno != NULLFSBLOCK);
-        ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
-        ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
         /*
          * Go down the tree until leaf level is reached, following the first
          * pointer (leftmost) at each level.
@@ -1864,6 +1848,7 @@ xfs_bmap_add_extent_delay_real(
                  */
                 trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
                 xfs_bmbt_set_startblock(ep, new->br_startblock);
+                xfs_bmbt_set_state(ep, new->br_state);
                 trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);

                 (*nextents)++;
@@ -2202,6 +2187,7 @@ STATIC int                              /* error */
 xfs_bmap_add_extent_unwritten_real(
         struct xfs_trans        *tp,
         xfs_inode_t             *ip,    /* incore inode pointer */
+        int                     whichfork,
         xfs_extnum_t            *idx,   /* extent number to update/insert */
         xfs_btree_cur_t         **curp, /* if *curp is null, not a btree */
         xfs_bmbt_irec_t         *new,   /* new data to add to file extents */
@@ -2221,12 +2207,14 @@ xfs_bmap_add_extent_unwritten_real(
                                         /* left is 0, right is 1, prev is 2 */
         int                     rval=0; /* return value (logging flags) */
         int                     state = 0;/* state bits, accessed thru macros */
-        struct xfs_mount        *mp = tp->t_mountp;
+        struct xfs_mount        *mp = ip->i_mount;

         *logflagsp = 0;

         cur = *curp;
-        ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+        ifp = XFS_IFORK_PTR(ip, whichfork);
+        if (whichfork == XFS_COW_FORK)
+                state |= BMAP_COWFORK;

         ASSERT(*idx >= 0);
         ASSERT(*idx <= xfs_iext_count(ifp));
@@ -2285,7 +2273,7 @@
          * Don't set contiguous if the combined extent would be too large.
          * Also check for all-three-contiguous being too large.
          */
-        if (*idx < xfs_iext_count(&ip->i_df) - 1) {
+        if (*idx < xfs_iext_count(ifp) - 1) {
                 state |= BMAP_RIGHT_VALID;
                 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
                 if (isnullstartblock(RIGHT.br_startblock))
@@ -2325,7 +2313,8 @@
                 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
                 xfs_iext_remove(ip, *idx + 1, 2, state);
-                ip->i_d.di_nextents -= 2;
+                XFS_IFORK_NEXT_SET(ip, whichfork,
+                                XFS_IFORK_NEXTENTS(ip, whichfork) - 2);
                 if (cur == NULL)
                         rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
                 else {
@@ -2368,7 +2357,8 @@
                 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
                 xfs_iext_remove(ip, *idx + 1, 1, state);
-                ip->i_d.di_nextents--;
+                XFS_IFORK_NEXT_SET(ip, whichfork,
+                                XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
                 if (cur == NULL)
                         rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
                 else {
@@ -2403,7 +2393,8 @@
                 xfs_bmbt_set_state(ep, newext);
                 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
                 xfs_iext_remove(ip, *idx + 1, 1, state);
-                ip->i_d.di_nextents--;
+                XFS_IFORK_NEXT_SET(ip, whichfork,
+                                XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
                 if (cur == NULL)
                         rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
                 else {
@@ -2515,7 +2506,8 @@
                 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
                 xfs_iext_insert(ip, *idx, 1, new, state);
-                ip->i_d.di_nextents++;
+                XFS_IFORK_NEXT_SET(ip, whichfork,
+                                XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
                 if (cur == NULL)
                         rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
                 else {
@@ -2593,7 +2585,8 @@
                 ++*idx;
                 xfs_iext_insert(ip, *idx, 1, new, state);
-                ip->i_d.di_nextents++;
+                XFS_IFORK_NEXT_SET(ip, whichfork,
+                                XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
                 if (cur == NULL)
                         rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
                 else {
@@ -2641,7 +2634,8 @@
                 ++*idx;
                 xfs_iext_insert(ip, *idx, 2, &r[0], state);
-                ip->i_d.di_nextents += 2;
+                XFS_IFORK_NEXT_SET(ip, whichfork,
+                                XFS_IFORK_NEXTENTS(ip, whichfork) + 2);
                 if (cur == NULL)
                         rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
                 else {
@@ -2695,17 +2689,17 @@
         }

         /* update reverse mappings */
-        error = xfs_rmap_convert_extent(mp, dfops, ip, XFS_DATA_FORK, new);
+        error = xfs_rmap_convert_extent(mp, dfops, ip, whichfork, new);
         if (error)
                 goto done;

         /* convert to a btree if necessary */
-        if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
+        if (xfs_bmap_needs_btree(ip, whichfork)) {
                 int     tmp_logflags;   /* partial log flag return val */

                 ASSERT(cur == NULL);
                 error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, &cur,
-                                0, &tmp_logflags, XFS_DATA_FORK);
+                                0, &tmp_logflags, whichfork);
                 *logflagsp |= tmp_logflags;
                 if (error)
                         goto done;
@@ -2717,7 +2711,7 @@
                 *curp = cur;
         }

-        xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK);
+        xfs_bmap_check_leaf_extents(*curp, ip, whichfork);
 done:
         *logflagsp |= rval;
         return error;
@@ -2809,7 +2803,8 @@ xfs_bmap_add_extent_hole_delay(
                 oldlen = startblockval(left.br_startblock) +
                         startblockval(new->br_startblock) +
                         startblockval(right.br_startblock);
-                newlen = xfs_bmap_worst_indlen(ip, temp);
+                newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+                                         oldlen);
                 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
                         nullstartblock((int)newlen));
                 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
@@ -2830,7 +2825,8 @@
                 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
                 oldlen = startblockval(left.br_startblock) +
                         startblockval(new->br_startblock);
-                newlen = xfs_bmap_worst_indlen(ip, temp);
+                newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+                                         oldlen);
                 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
                         nullstartblock((int)newlen));
                 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
@@ -2846,7 +2842,8 @@
                 temp = new->br_blockcount + right.br_blockcount;
                 oldlen = startblockval(new->br_startblock) +
                         startblockval(right.br_startblock);
-                newlen = xfs_bmap_worst_indlen(ip, temp);
+                newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+                                         oldlen);
                 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
                         new->br_startoff,
                         nullstartblock((int)newlen), temp, right.br_state);
@@ -2899,13 +2896,14 @@ xfs_bmap_add_extent_hole_real(
         ASSERT(!isnullstartblock(new->br_startblock));
         ASSERT(!bma->cur ||
                !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
-        ASSERT(whichfork != XFS_COW_FORK);

         XFS_STATS_INC(mp, xs_add_exlist);

         state = 0;
         if (whichfork == XFS_ATTR_FORK)
                 state |= BMAP_ATTRFORK;
+        if (whichfork == XFS_COW_FORK)
+                state |= BMAP_COWFORK;

         /*
          * Check and set flags if this segment has a left neighbor.
@@ -3822,17 +3820,13 @@ xfs_bmap_btalloc(
          * the first block that was allocated.
          */
         ASSERT(*ap->firstblock == NULLFSBLOCK ||
-               XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
-               XFS_FSB_TO_AGNO(mp, args.fsbno) ||
-               (ap->dfops->dop_low &&
-                XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
-                XFS_FSB_TO_AGNO(mp, args.fsbno)));
+               XFS_FSB_TO_AGNO(mp, *ap->firstblock) <=
+               XFS_FSB_TO_AGNO(mp, args.fsbno));
         ap->blkno = args.fsbno;
         if (*ap->firstblock == NULLFSBLOCK)
                 *ap->firstblock = args.fsbno;
-        ASSERT(nullfb || fb_agno == args.agno ||
-               (ap->dfops->dop_low && fb_agno < args.agno));
+        ASSERT(nullfb || fb_agno <= args.agno);
         ap->length = args.len;
         if (!(ap->flags & XFS_BMAPI_COWFORK))
                 ap->ip->i_d.di_nblocks += args.len;
@@ -4368,10 +4362,16 @@ xfs_bmapi_allocate(
         bma->got.br_state = XFS_EXT_NORM;

         /*
-         * A wasdelay extent has been initialized, so shouldn't be flagged
-         * as unwritten.
+         * In the data fork, a wasdelay extent has been initialized, so
+         * shouldn't be flagged as unwritten.
+         *
+         * For the cow fork, however, we convert delalloc reservations
+         * (extents allocated for speculative preallocation) to
+         * allocated unwritten extents, and only convert the unwritten
+         * extents to real extents when we're about to write the data.
          */
-        if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
+        if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) &&
+            (bma->flags & XFS_BMAPI_PREALLOC) &&
             xfs_sb_version_hasextflgbit(&mp->m_sb))
                 bma->got.br_state = XFS_EXT_UNWRITTEN;
@@ -4422,8 +4422,6 @@ xfs_bmapi_convert_unwritten(
                         (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
                 return 0;

-        ASSERT(whichfork != XFS_COW_FORK);
-
         /*
          * Modify (by adding) the state flag, if writing.
          */
@@ -4448,8 +4446,8 @@
                         return error;
         }

-        error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
-                        &bma->cur, mval, bma->firstblock, bma->dfops,
-                        &tmp_logflags);
+        error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork,
+                        &bma->idx, &bma->cur, mval, bma->firstblock, bma->dfops,
+                        &tmp_logflags);

         /*
          * Log the inode core unconditionally in the unwritten extent conversion
@@ -4458,8 +4456,12 @@
          * in the transaction for the sake of fsync(), even if nothing has
          * changed, because fsync() will not force the log for this transaction
          * unless it sees the inode pinned.
+         *
+         * Note: If we're only converting cow fork extents, there aren't
+         * any on-disk updates to make, so we don't need to log anything.
          */
-        bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
+        if (whichfork != XFS_COW_FORK)
+                bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
         if (error)
                 return error;
@@ -4533,15 +4535,15 @@ xfs_bmapi_write(
         ASSERT(*nmap >= 1);
         ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
         ASSERT(!(flags & XFS_BMAPI_IGSTATE));
-        ASSERT(tp != NULL);
+        ASSERT(tp != NULL ||
+               (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
+                        (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
         ASSERT(len > 0);
         ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
         ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK);
         ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP));
         ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP));
-        ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK);
-        ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK);

         /* zeroing is for currently only for data extents, not metadata */
         ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
@@ -4746,13 +4748,9 @@ error0:
         if (bma.cur) {
                 if (!error) {
                         ASSERT(*firstblock == NULLFSBLOCK ||
-                               XFS_FSB_TO_AGNO(mp, *firstblock) ==
-                               XFS_FSB_TO_AGNO(mp,
-                                       bma.cur->bc_private.b.firstblock) ||
-                               (dfops->dop_low &&
-                                XFS_FSB_TO_AGNO(mp, *firstblock) <
-                                XFS_FSB_TO_AGNO(mp,
-                                        bma.cur->bc_private.b.firstblock)));
+                               XFS_FSB_TO_AGNO(mp, *firstblock) <=
+                               XFS_FSB_TO_AGNO(mp,
+                                       bma.cur->bc_private.b.firstblock));
                         *firstblock = bma.cur->bc_private.b.firstblock;
                 }
                 xfs_btree_del_cursor(bma.cur,
@@ -4787,34 +4785,59 @@ xfs_bmap_split_indlen(
         xfs_filblks_t                   len2 = *indlen2;
         xfs_filblks_t                   nres = len1 + len2; /* new total res. */
         xfs_filblks_t                   stolen = 0;
+        xfs_filblks_t                   resfactor;

         /*
          * Steal as many blocks as we can to try and satisfy the worst case
          * indlen for both new extents.
          */
-        while (nres > ores && avail) {
-                nres--;
-                avail--;
-                stolen++;
-        }
+        if (ores < nres && avail)
+                stolen = XFS_FILBLKS_MIN(nres - ores, avail);
+        ores += stolen;
+
+        /* nothing else to do if we've satisfied the new reservation */
+        if (ores >= nres)
+                return stolen;

         /*
-         * The only blocks available are those reserved for the original
-         * extent and what we can steal from the extent being removed.
-         * If this still isn't enough to satisfy the combined
-         * requirements for the two new extents, skim blocks off of each
-         * of the new reservations until they match what is available.
+         * We can't meet the total required reservation for the two extents.
+         * Calculate the percent of the overall shortage between both extents
+         * and apply this percentage to each of the requested indlen values.
+         * This distributes the shortage fairly and reduces the chances that one
+         * of the two extents is left with nothing when extents are repeatedly
+         * split.
          */
-        while (nres > ores) {
-                if (len1) {
-                        len1--;
-                        nres--;
-                }
-                if (nres == ores)
-                        break;
-                if (len2) {
-                        len2--;
-                        nres--;
-                }
-        }
+        resfactor = (ores * 100);
+        do_div(resfactor, nres);
+        len1 *= resfactor;
+        do_div(len1, 100);
+        len2 *= resfactor;
+        do_div(len2, 100);
+        ASSERT(len1 + len2 <= ores);
+        ASSERT(len1 < *indlen1 && len2 < *indlen2);
+
+        /*
+         * Hand out the remainder to each extent. If one of the two reservations
+         * is zero, we want to make sure that one gets a block first. The loop
+         * below starts with len1, so hand len2 a block right off the bat if it
+         * is zero.
+         */
+        ores -= (len1 + len2);
+        ASSERT((*indlen1 - len1) + (*indlen2 - len2) >= ores);
+        if (ores && !len2 && *indlen2) {
+                len2++;
+                ores--;
+        }
+        while (ores) {
+                if (len1 < *indlen1) {
+                        len1++;
+                        ores--;
+                }
+                if (!ores)
+                        break;
+                if (len2 < *indlen2) {
+                        len2++;
+                        ores--;
+                }
+        }
@@ -5556,8 +5579,8 @@ __xfs_bunmapi(
                         }
                         del.br_state = XFS_EXT_UNWRITTEN;
                         error = xfs_bmap_add_extent_unwritten_real(tp, ip,
-                                        &lastx, &cur, &del, firstblock, dfops,
-                                        &logflags);
+                                        whichfork, &lastx, &cur, &del,
+                                        firstblock, dfops, &logflags);
                         if (error)
                                 goto error0;
                         goto nodelete;
@@ -5610,8 +5633,9 @@
                                 prev.br_state = XFS_EXT_UNWRITTEN;
                                 lastx--;
                                 error = xfs_bmap_add_extent_unwritten_real(tp,
-                                                ip, &lastx, &cur, &prev,
-                                                firstblock, dfops, &logflags);
+                                                ip, whichfork, &lastx, &cur,
+                                                &prev, firstblock, dfops,
+                                                &logflags);
                                 if (error)
                                         goto error0;
                                 goto nodelete;
@@ -5619,8 +5643,9 @@
                                 ASSERT(del.br_state == XFS_EXT_NORM);
                                 del.br_state = XFS_EXT_UNWRITTEN;
                                 error = xfs_bmap_add_extent_unwritten_real(tp,
-                                                ip, &lastx, &cur, &del,
-                                                firstblock, dfops, &logflags);
+                                                ip, whichfork, &lastx, &cur,
+                                                &del, firstblock, dfops,
+                                                &logflags);
                                 if (error)
                                         goto error0;
                                 goto nodelete;

fs/xfs/libxfs/xfs_bmap_btree.c

@@ -71,15 +71,9 @@ xfs_bmdr_to_bmbt(
         xfs_bmbt_key_t          *tkp;
         __be64                  *tpp;

-        if (xfs_sb_version_hascrc(&mp->m_sb))
-                xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
-                                XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
-                                XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
-        else
-                xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
-                                XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
-                                XFS_BTREE_LONG_PTRS);
+        xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+                        XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
+                        XFS_BTREE_LONG_PTRS);

         rblock->bb_level = dblock->bb_level;
         ASSERT(be16_to_cpu(rblock->bb_level) > 0);
         rblock->bb_numrecs = dblock->bb_numrecs;

fs/xfs/libxfs/xfs_btree.c

@@ -50,8 +50,18 @@ static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
           XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
           XFS_REFC_CRC_MAGIC }
 };
-#define xfs_btree_magic(cur) \
-        xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
+
+__uint32_t
+xfs_btree_magic(
+        int                     crc,
+        xfs_btnum_t             btnum)
+{
+        __uint32_t              magic = xfs_magics[crc][btnum];
+
+        /* Ensure we asked for crc for crc-only magics. */
+        ASSERT(magic != 0);
+        return magic;
+}

 STATIC int                              /* error (0 or EFSCORRUPTED) */
 xfs_btree_check_lblock(
@@ -62,10 +72,13 @@ xfs_btree_check_lblock(
 {
         int                     lblock_ok = 1;  /* block passes checks */
         struct xfs_mount        *mp;    /* file system mount point */
+        xfs_btnum_t             btnum = cur->bc_btnum;
+        int                     crc;

         mp = cur->bc_mp;
+        crc = xfs_sb_version_hascrc(&mp->m_sb);

-        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+        if (crc) {
                 lblock_ok = lblock_ok &&
                         uuid_equal(&block->bb_u.l.bb_uuid,
                                    &mp->m_sb.sb_meta_uuid) &&
@@ -74,7 +87,7 @@
         }

         lblock_ok = lblock_ok &&
-                be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
+                be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) &&
                 be16_to_cpu(block->bb_level) == level &&
                 be16_to_cpu(block->bb_numrecs) <=
                         cur->bc_ops->get_maxrecs(cur, level) &&
@@ -110,13 +123,16 @@ xfs_btree_check_sblock(
         struct xfs_agf          *agf;   /* ag. freespace structure */
         xfs_agblock_t           agflen; /* native ag. freespace length */
         int                     sblock_ok = 1; /* block passes checks */
+        xfs_btnum_t             btnum = cur->bc_btnum;
+        int                     crc;

         mp = cur->bc_mp;
+        crc = xfs_sb_version_hascrc(&mp->m_sb);
         agbp = cur->bc_private.a.agbp;
         agf = XFS_BUF_TO_AGF(agbp);
         agflen = be32_to_cpu(agf->agf_length);

-        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+        if (crc) {
                 sblock_ok = sblock_ok &&
                         uuid_equal(&block->bb_u.s.bb_uuid,
                                    &mp->m_sb.sb_meta_uuid) &&
@@ -125,7 +141,7 @@
         }

         sblock_ok = sblock_ok &&
-                be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
+                be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) &&
                 be16_to_cpu(block->bb_level) == level &&
                 be16_to_cpu(block->bb_numrecs) <=
                         cur->bc_ops->get_maxrecs(cur, level) &&
@@ -810,7 +826,8 @@ xfs_btree_read_bufl(
         xfs_daddr_t             d;      /* real disk block address */
         int                     error;

-        ASSERT(fsbno != NULLFSBLOCK);
+        if (!XFS_FSB_SANITY_CHECK(mp, fsbno))
+                return -EFSCORRUPTED;
         d = XFS_FSB_TO_DADDR(mp, fsbno);
         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
                                    mp->m_bsize, lock, &bp, ops);
@@ -1084,12 +1101,15 @@ xfs_btree_init_block_int(
         struct xfs_mount        *mp,
         struct xfs_btree_block  *buf,
         xfs_daddr_t             blkno,
-        __u32                   magic,
+        xfs_btnum_t             btnum,
         __u16                   level,
         __u16                   numrecs,
         __u64                   owner,
         unsigned int            flags)
 {
+        int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
+        __u32                   magic = xfs_btree_magic(crc, btnum);
+
         buf->bb_magic = cpu_to_be32(magic);
         buf->bb_level = cpu_to_be16(level);
         buf->bb_numrecs = cpu_to_be16(numrecs);
@@ -1097,7 +1117,7 @@ xfs_btree_init_block_int(
         if (flags & XFS_BTREE_LONG_PTRS) {
                 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
                 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
-                if (flags & XFS_BTREE_CRC_BLOCKS) {
+                if (crc) {
                         buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
                         buf->bb_u.l.bb_owner = cpu_to_be64(owner);
                         uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
@@ -1110,7 +1130,7 @@
                 buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
                 buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-                if (flags & XFS_BTREE_CRC_BLOCKS) {
+                if (crc) {
                         buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
                         buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
                         uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
@@ -1123,14 +1143,14 @@
 void
 xfs_btree_init_block(
         struct xfs_mount *mp,
         struct xfs_buf  *bp,
-        __u32           magic,
+        xfs_btnum_t     btnum,
         __u16           level,
         __u16           numrecs,
         __u64           owner,
         unsigned int    flags)
 {
         xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
-                                 magic, level, numrecs, owner, flags);
+                                 btnum, level, numrecs, owner, flags);
 }

 STATIC void
@@ -1140,7 +1160,7 @@ xfs_btree_init_block_cur(
         int             level,
         int             numrecs)
 {
-        __u64 owner;
+        __u64           owner;

         /*
          * we can pull the owner from the cursor right now as the different
@@ -1154,7 +1174,7 @@
                 owner = cur->bc_private.a.agno;

         xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
-                                xfs_btree_magic(cur), level, numrecs,
+                                cur->bc_btnum, level, numrecs,
                                 owner, cur->bc_flags);
 }

fs/xfs/libxfs/xfs_btree.h

@@ -76,6 +76,8 @@ union xfs_btree_rec {
 #define XFS_BTNUM_RMAP  ((xfs_btnum_t)XFS_BTNUM_RMAPi)
 #define XFS_BTNUM_REFC  ((xfs_btnum_t)XFS_BTNUM_REFCi)

+__uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
+
 /*
  * For logging record fields.
  */
@@ -378,7 +380,7 @@ void
 xfs_btree_init_block(
         struct xfs_mount *mp,
         struct xfs_buf  *bp,
-        __u32           magic,
+        xfs_btnum_t     btnum,
         __u16           level,
         __u16           numrecs,
         __u64           owner,
@@ -389,7 +391,7 @@ xfs_btree_init_block_int(
         struct xfs_mount *mp,
         struct xfs_btree_block  *buf,
         xfs_daddr_t     blkno,
-        __u32           magic,
+        xfs_btnum_t     btnum,
         __u16           level,
         __u16           numrecs,
         __u64           owner,
@@ -456,7 +458,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block)
 #define XFS_FILBLKS_MAX(a,b)    max_t(xfs_filblks_t, (a), (b))

 #define XFS_FSB_SANITY_CHECK(mp,fsb)    \
-        (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
+        (fsb && XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
                 XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks)

 /*

fs/xfs/libxfs/xfs_da_btree.c

@@ -2633,7 +2633,7 @@ out_free:
 /*
  * Readahead the dir/attr block.
  */
-xfs_daddr_t
+int
 xfs_da_reada_buf(
         struct xfs_inode        *dp,
         xfs_dablk_t             bno,
@@ -2664,7 +2664,5 @@ out_free:
         if (mapp != &map)
                 kmem_free(mapp);

-        if (error)
-                return -1;
-        return mappedbno;
+        return error;
 }

fs/xfs/libxfs/xfs_da_btree.h

@@ -201,7 +201,7 @@ int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
                        xfs_dablk_t bno, xfs_daddr_t mappedbno,
                        struct xfs_buf **bpp, int whichfork,
                        const struct xfs_buf_ops *ops);
-xfs_daddr_t     xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
+int     xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
                                 xfs_daddr_t mapped_bno, int whichfork,
                                 const struct xfs_buf_ops *ops);
 int     xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,

fs/xfs/libxfs/xfs_dir2_node.c

@@ -155,6 +155,42 @@ const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
         .verify_write = xfs_dir3_free_write_verify,
 };

+/* Everything ok in the free block header? */
+static bool
+xfs_dir3_free_header_check(
+        struct xfs_inode        *dp,
+        xfs_dablk_t             fbno,
+        struct xfs_buf          *bp)
+{
+        struct xfs_mount        *mp = dp->i_mount;
+        unsigned int            firstdb;
+        int                     maxbests;
+
+        maxbests = dp->d_ops->free_max_bests(mp->m_dir_geo);
+        firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) -
+                   xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) *
+                        maxbests;
+        if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
+
+                if (be32_to_cpu(hdr3->firstdb) != firstdb)
+                        return false;
+                if (be32_to_cpu(hdr3->nvalid) > maxbests)
+                        return false;
+                if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused))
+                        return false;
+        } else {
+                struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+
+                if (be32_to_cpu(hdr->firstdb) != firstdb)
+                        return false;
+                if (be32_to_cpu(hdr->nvalid) > maxbests)
+                        return false;
+                if (be32_to_cpu(hdr->nvalid) < be32_to_cpu(hdr->nused))
+                        return false;
+        }
+        return true;
+}
+
 static int
 __xfs_dir3_free_read(
@@ -168,11 +204,22 @@ __xfs_dir3_free_read(
         err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
                                 XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
+        if (err || !*bpp)
+                return err;
+
+        /* Check things that we can't do in the verifier. */
+        if (!xfs_dir3_free_header_check(dp, fbno, *bpp)) {
+                xfs_buf_ioerror(*bpp, -EFSCORRUPTED);
+                xfs_verifier_error(*bpp);
+                xfs_trans_brelse(tp, *bpp);
+                return -EFSCORRUPTED;
+        }

         /* try read returns without an error or *bpp if it lands in a hole */
-        if (!err && tp && *bpp)
+        if (tp)
                 xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
-        return err;
+
+        return 0;
 }

 int

View File

@@ -51,8 +51,7 @@ xfs_ialloc_cluster_alignment(
	struct xfs_mount	*mp)
{
	if (xfs_sb_version_hasalign(&mp->m_sb) &&
-	    mp->m_sb.sb_inoalignmt >=
-			XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
+	    mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp))
		return mp->m_sb.sb_inoalignmt;
	return 1;
}
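The fix works because xfs_icluster_size_fsb() never returns less than one block, unlike the open-coded XFS_B_TO_FSBT() conversion, which truncates to zero when the inode cluster is smaller than a large (for example 32k) block. A sketch of the helper as assumed here:

static inline xfs_extlen_t
xfs_icluster_size_fsb(
	struct xfs_mount	*mp)
{
	/* Clusters smaller than one block still occupy a whole block. */
	if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size)
		return 1;
	return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
}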

View File

@@ -26,6 +26,7 @@
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
+#include "xfs_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
@@ -429,11 +430,13 @@ xfs_iformat_btree(
	/* REFERENCED */
	int			nrecs;
	int			size;
+	int			level;

	ifp = XFS_IFORK_PTR(ip, whichfork);
	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
	size = XFS_BMAP_BROOT_SPACE(mp, dfp);
	nrecs = be16_to_cpu(dfp->bb_numrecs);
+	level = be16_to_cpu(dfp->bb_level);

	/*
	 * blow out if -- fork has less extents than can fit in
@@ -446,7 +449,8 @@ xfs_iformat_btree(
		    XFS_IFORK_MAXEXT(ip, whichfork) ||
		   XFS_BMDR_SPACE_CALC(nrecs) >
			XFS_DFORK_SIZE(dip, mp, whichfork) ||
-		   XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
+		   XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) ||
+		   level == 0 || level > XFS_BTREE_MAXLEVELS) {
		xfs_warn(mp, "corrupt inode %Lu (btree).",
			(unsigned long long) ip->i_ino);
		XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
@@ -497,15 +501,14 @@ xfs_iread_extents(
	 * We know that the size is valid (it's checked in iformat_btree)
	 */
	ifp->if_bytes = ifp->if_real_bytes = 0;
-	ifp->if_flags |= XFS_IFEXTENTS;
	xfs_iext_add(ifp, 0, nextents);
	error = xfs_bmap_read_extents(tp, ip, whichfork);
	if (error) {
		xfs_iext_destroy(ifp);
-		ifp->if_flags &= ~XFS_IFEXTENTS;
		return error;
	}
	xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
+	ifp->if_flags |= XFS_IFEXTENTS;
	return 0;
}

/*

View File

@@ -42,7 +42,6 @@ typedef struct xlog_recover_item {
	xfs_log_iovec_t		*ri_buf;	/* ptr to regions buffer */
} xlog_recover_item_t;

-struct xlog_tid;
typedef struct xlog_recover {
	struct hlist_node	r_list;
	xlog_tid_t		r_log_tid;	/* log's transaction id */

View File

@@ -481,6 +481,12 @@ xfs_submit_ioend(
	struct xfs_ioend	*ioend,
	int			status)
{
+	/* Convert CoW extents to regular */
+	if (!status && ioend->io_type == XFS_IO_COW) {
+		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
+				ioend->io_offset, ioend->io_size);
+	}
+
	/* Reserve log space if we might write beyond the on-disk inode size. */
	if (!status &&
	    ioend->io_type != XFS_IO_UNWRITTEN &&

View File

@@ -88,7 +88,6 @@ int
xfs_bmap_rtalloc(
	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
{
-	xfs_alloctype_t	atype = 0;	/* type for allocation routines */
	int		error;		/* error return value */
	xfs_mount_t	*mp;		/* mount point structure */
	xfs_extlen_t	prod = 0;	/* product factor for allocators */
@@ -155,18 +154,14 @@ xfs_bmap_rtalloc(
	/*
	 * Realtime allocation, done through xfs_rtallocate_extent.
	 */
-	atype = ap->blkno == 0 ?  XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
	do_div(ap->blkno, mp->m_sb.sb_rextsize);
	rtb = ap->blkno;
	ap->length = ralen;
-	if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
-				&ralen, atype, ap->wasdel, prod, &rtb)))
-		return error;
-	if (rtb == NULLFSBLOCK && prod > 1 &&
-	    (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
-					   ap->length, &ralen, atype,
-					   ap->wasdel, 1, &rtb)))
+	error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
+				&ralen, ap->wasdel, prod, &rtb);
+	if (error)
		return error;

	ap->blkno = rtb;
	if (ap->blkno != NULLFSBLOCK) {
		ap->blkno *= mp->m_sb.sb_rextsize;
@@ -787,11 +782,9 @@ xfs_getbmap(
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	for (i = 0; i < cur_ext; i++) {
-		int full = 0;	/* user array is full */
-
		/* format results & advance arg */
-		error = formatter(&arg, &out[i], &full);
-		if (error || full)
+		error = formatter(&arg, &out[i]);
+		if (error)
			break;
	}

@@ -917,17 +910,18 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
 */
int
xfs_free_eofblocks(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip,
-	bool		need_iolock)
+	struct xfs_inode	*ip)
{
-	xfs_trans_t	*tp;
+	struct xfs_trans	*tp;
	int			error;
	xfs_fileoff_t		end_fsb;
	xfs_fileoff_t		last_fsb;
	xfs_filblks_t		map_len;
	int			nimaps;
-	xfs_bmbt_irec_t	imap;
+	struct xfs_bmbt_irec	imap;
+	struct xfs_mount	*mp = ip->i_mount;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));

	/*
	 * Figure out if there are any blocks beyond the end
@@ -944,6 +938,10 @@ xfs_free_eofblocks(
	error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

+	/*
+	 * If there are blocks after the end of file, truncate the file to its
+	 * current size to free them up.
+	 */
	if (!error && (nimaps != 0) &&
	    (imap.br_startblock != HOLESTARTBLOCK ||
	     ip->i_delayed_blks)) {
@@ -954,22 +952,13 @@ xfs_free_eofblocks(
		if (error)
			return error;

-		/*
-		 * There are blocks after the end of file.
-		 * Free them up now by truncating the file to
-		 * its current size.
-		 */
-		if (need_iolock) {
-			if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
-				return -EAGAIN;
-		}
+		/* wait on dio to ensure i_size has settled */
+		inode_dio_wait(VFS_I(ip));

		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
				&tp);
		if (error) {
			ASSERT(XFS_FORCED_SHUTDOWN(mp));
-			if (need_iolock)
-				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
			return error;
		}
@@ -997,8 +986,6 @@ xfs_free_eofblocks(
		}

		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		if (need_iolock)
-			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	}
	return error;
}
@@ -1393,10 +1380,16 @@ xfs_shift_file_space(
	xfs_fileoff_t		stop_fsb;
	xfs_fileoff_t		next_fsb;
	xfs_fileoff_t		shift_fsb;
+	uint			resblks;

	ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);

	if (direction == SHIFT_LEFT) {
+		/*
+		 * Reserve blocks to cover potential extent merges after left
+		 * shift operations.
+		 */
+		resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
		next_fsb = XFS_B_TO_FSB(mp, offset + len);
		stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
	} else {
@@ -1404,6 +1397,7 @@ xfs_shift_file_space(
		 * If right shift, delegate the work of initialization of
		 * next_fsb to xfs_bmap_shift_extent as it has ilock held.
		 */
+		resblks = 0;
		next_fsb = NULLFSBLOCK;
		stop_fsb = XFS_B_TO_FSB(mp, offset);
	}
@@ -1415,7 +1409,7 @@ xfs_shift_file_space(
	 * into the accessible region of the file.
	 */
	if (xfs_can_free_eofblocks(ip, true)) {
-		error = xfs_free_eofblocks(mp, ip, false);
+		error = xfs_free_eofblocks(ip);
		if (error)
			return error;
	}
@@ -1445,21 +1439,14 @@ xfs_shift_file_space(
	}

	while (!error && !done) {
-		/*
-		 * We would need to reserve permanent block for transaction.
-		 * This will come into picture when after shifting extent into
-		 * hole we found that adjacent extents can be merged which
-		 * may lead to freeing of a block during record update.
-		 */
-		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
-				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
+					&tp);
		if (error)
			break;

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
-				ip->i_gdquot, ip->i_pdquot,
-				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
+				ip->i_gdquot, ip->i_pdquot, resblks, 0,
				XFS_QMOPT_RES_REGBLKS);
		if (error)
			goto out_trans_cancel;
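For a sense of scale, the reservation macro used for left shifts expands to the worst-case number of bmbt blocks a single extent add or merge can dirty. Paraphrased from xfs_trans_space.h (verify against the exact tree):

/* Paraphrased reservation macros; check xfs_trans_space.h in this tree. */
#define XFS_EXTENTADD_SPACE_RES(mp, w)	(XFS_BM_MAXLEVELS(mp, w) - 1)
#define XFS_DIOSTRAT_SPACE_RES(mp, v)	\
	(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))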

View File

@@ -35,7 +35,7 @@ int	xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
		xfs_fileoff_t start_fsb, xfs_fileoff_t length);

/* bmap to userspace formatter - copy to user & advance pointer */
-typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
+typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *);
int	xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
		xfs_bmap_format_t formatter, void *arg);
@@ -63,8 +63,7 @@ int	xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,

/* EOF block manipulation functions */
bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
-int	xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
-			   bool need_iolock);
+int	xfs_free_eofblocks(struct xfs_inode *ip);

int	xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
		struct xfs_swapext *sx);

View File

@@ -1162,6 +1162,7 @@ xfs_buf_iodone_callbacks(
	 */
	bp->b_last_error = 0;
	bp->b_retries = 0;
+	bp->b_first_retry_time = 0;

	xfs_buf_do_callbacks(bp);
	bp->b_fspriv = NULL;

View File

@@ -208,32 +208,3 @@ xfs_ioc_trim(
		return -EFAULT;
	return 0;
}
-
-int
-xfs_discard_extents(
-	struct xfs_mount	*mp,
-	struct list_head	*list)
-{
-	struct xfs_extent_busy	*busyp;
-	int			error = 0;
-
-	list_for_each_entry(busyp, list, list) {
-		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
-					 busyp->length);
-
-		error = blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
-				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
-				XFS_FSB_TO_BB(mp, busyp->length),
-				GFP_NOFS, 0);
-		if (error && error != -EOPNOTSUPP) {
-			xfs_info(mp,
-	 "discard failed for extent [0x%llx,%u], error %d",
-				(unsigned long long)busyp->bno,
-				busyp->length,
-				error);
-			return error;
-		}
-	}
-
-	return 0;
-}

View File

@@ -5,6 +5,5 @@ struct fstrim_range;
struct list_head;

extern int	xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
-extern int	xfs_discard_extents(struct xfs_mount *, struct list_head *);

#endif	/* XFS_DISCARD_H */

View File

@@ -45,18 +45,7 @@ xfs_extent_busy_insert(
	struct rb_node		**rbp;
	struct rb_node		*parent = NULL;

-	new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL);
-	if (!new) {
-		/*
-		 * No Memory!  Since it is now not possible to track the free
-		 * block, make this a synchronous transaction to insure that
-		 * the block is not reused before this transaction commits.
-		 */
-		trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len);
-		xfs_trans_set_sync(tp);
-		return;
-	}
-
+	new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP);
	new->agno = agno;
	new->bno = bno;
	new->length = len;
@@ -345,25 +334,31 @@ restart:
 * subset of the extent that is not busy.  If *rlen is smaller than
 * args->minlen no suitable extent could be found, and the higher level
 * code needs to force out the log and retry the allocation.
+ *
+ * Return the current busy generation for the AG if the extent is busy. This
+ * value can be used to wait for at least one of the currently busy extents
+ * to be cleared. Note that the busy list is not guaranteed to be empty after
+ * the gen is woken. The state of a specific extent must always be confirmed
+ * with another call to xfs_extent_busy_trim() before it can be used.
 */
-void
+bool
xfs_extent_busy_trim(
	struct xfs_alloc_arg	*args,
-	xfs_agblock_t		bno,
-	xfs_extlen_t		len,
-	xfs_agblock_t		*rbno,
-	xfs_extlen_t		*rlen)
+	xfs_agblock_t		*bno,
+	xfs_extlen_t		*len,
+	unsigned		*busy_gen)
{
	xfs_agblock_t		fbno;
	xfs_extlen_t		flen;
	struct rb_node		*rbp;
+	bool			ret = false;

-	ASSERT(len > 0);
+	ASSERT(*len > 0);

	spin_lock(&args->pag->pagb_lock);
restart:
-	fbno = bno;
-	flen = len;
+	fbno = *bno;
+	flen = *len;
	rbp = args->pag->pagb_tree.rb_node;
	while (rbp && flen >= args->minlen) {
		struct xfs_extent_busy *busyp =
@@ -515,24 +510,25 @@ restart:
		flen = fend - fbno;
	}
-	spin_unlock(&args->pag->pagb_lock);
+out:

-	if (fbno != bno || flen != len) {
-		trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
+	if (fbno != *bno || flen != *len) {
+		trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
					  fbno, flen);
+		*bno = fbno;
+		*len = flen;
+		*busy_gen = args->pag->pagb_gen;
+		ret = true;
	}
-	*rbno = fbno;
-	*rlen = flen;
-	return;
+	spin_unlock(&args->pag->pagb_lock);
+	return ret;
fail:
	/*
	 * Return a zero extent length as failure indications.  All callers
	 * re-check if the trimmed extent satisfies the minlen requirement.
	 */
-	spin_unlock(&args->pag->pagb_lock);
-	trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
-	*rbno = fbno;
-	*rlen = 0;
+	flen = 0;
+	goto out;
}

STATIC void
@@ -551,6 +547,21 @@ xfs_extent_busy_clear_one(
	kmem_free(busyp);
}

+static void
+xfs_extent_busy_put_pag(
+	struct xfs_perag	*pag,
+	bool			wakeup)
+		__releases(pag->pagb_lock)
+{
+	if (wakeup) {
+		pag->pagb_gen++;
+		wake_up_all(&pag->pagb_wait);
+	}
+
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
+}
+
/*
 * Remove all extents on the passed in list from the busy extents tree.
 * If do_discard is set skip extents that need to be discarded, and mark
@@ -565,27 +576,76 @@ xfs_extent_busy_clear(
	struct xfs_extent_busy	*busyp, *n;
	struct xfs_perag	*pag = NULL;
	xfs_agnumber_t		agno = NULLAGNUMBER;
+	bool			wakeup = false;

	list_for_each_entry_safe(busyp, n, list, list) {
		if (busyp->agno != agno) {
-			if (pag) {
-				spin_unlock(&pag->pagb_lock);
-				xfs_perag_put(pag);
-			}
-			pag = xfs_perag_get(mp, busyp->agno);
-			spin_lock(&pag->pagb_lock);
+			if (pag)
+				xfs_extent_busy_put_pag(pag, wakeup);
			agno = busyp->agno;
+			pag = xfs_perag_get(mp, agno);
+			spin_lock(&pag->pagb_lock);
+			wakeup = false;
		}

		if (do_discard && busyp->length &&
-		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD))
+		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
			busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
-		else
+		} else {
			xfs_extent_busy_clear_one(mp, pag, busyp);
+			wakeup = true;
+		}
	}

-	if (pag) {
-		spin_unlock(&pag->pagb_lock);
-		xfs_perag_put(pag);
-	}
+	if (pag)
+		xfs_extent_busy_put_pag(pag, wakeup);
}
+
+/*
+ * Flush out all busy extents for this AG.
+ */
+void
+xfs_extent_busy_flush(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	unsigned		busy_gen)
+{
+	DEFINE_WAIT		(wait);
+	int			log_flushed = 0, error;
+
+	trace_xfs_log_force(mp, 0, _THIS_IP_);
+	error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
+	if (error)
+		return;
+
+	do {
+		prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
+		if  (busy_gen != READ_ONCE(pag->pagb_gen))
+			break;
+		schedule();
+	} while (1);
+
+	finish_wait(&pag->pagb_wait, &wait);
+}
+
+void
+xfs_extent_busy_wait_all(
+	struct xfs_mount	*mp)
+{
+	DEFINE_WAIT		(wait);
+	xfs_agnumber_t		agno;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		struct xfs_perag *pag = xfs_perag_get(mp, agno);
+
+		do {
+			prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
+			if (RB_EMPTY_ROOT(&pag->pagb_tree))
+				break;
+			schedule();
+		} while (1);
+		finish_wait(&pag->pagb_wait, &wait);
+
+		xfs_perag_put(pag);
+	}
+}
@@ -596,9 +656,17 @@ xfs_extent_busy_clear(
int
xfs_extent_busy_ag_cmp(
	void			*priv,
-	struct list_head	*a,
-	struct list_head	*b)
+	struct list_head	*l1,
+	struct list_head	*l2)
{
-	return container_of(a, struct xfs_extent_busy, list)->agno -
-		container_of(b, struct xfs_extent_busy, list)->agno;
+	struct xfs_extent_busy	*b1 =
+		container_of(l1, struct xfs_extent_busy, list);
+	struct xfs_extent_busy	*b2 =
+		container_of(l2, struct xfs_extent_busy, list);
+	s32 diff;
+
+	diff = b1->agno - b2->agno;
+	if (!diff)
+		diff = b1->bno - b2->bno;
+	return diff;
}
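To make the new trim/flush contract concrete, here is a sketch of how a low-level allocator might drive it. The function and its control flow are illustrative only; the names xfs_extent_busy_trim() and xfs_extent_busy_flush() are from this patch, everything else is assumed:

/*
 * Illustrative caller: trim a candidate extent against busy ranges; if
 * the remainder is too small and something was busy, force the log and
 * sleep until the AG's busy generation advances, then retry.
 */
STATIC int
example_alloc_with_busy_wait(
	struct xfs_alloc_arg	*args,
	xfs_agblock_t		bno,
	xfs_extlen_t		len)
{
	unsigned		busy_gen;
	bool			busy;

	do {
		xfs_agblock_t	tbno = bno;
		xfs_extlen_t	tlen = len;

		busy = xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
		if (tlen >= args->minlen)
			return 0;	/* usable extent at [tbno, tbno + tlen) */
		if (!busy)
			return -ENOSPC;
		/* Sleeps until at least one busy extent is cleared. */
		xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
	} while (1);
}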

View File

@@ -58,9 +58,16 @@ void
xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
	xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);

+bool
+xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno,
+	xfs_extlen_t *len, unsigned *busy_gen);
+
void
-xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno,
-	xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen);
+xfs_extent_busy_flush(struct xfs_mount *mp, struct xfs_perag *pag,
+	unsigned busy_gen);
+
+void
+xfs_extent_busy_wait_all(struct xfs_mount *mp);

int
xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b);

View File

@@ -527,6 +527,15 @@ xfs_file_dio_aio_write(
	if ((iocb->ki_pos & mp->m_blockmask) ||
	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
		unaligned_io = 1;
+
+		/*
+		 * We can't properly handle unaligned direct I/O to reflink
+		 * files yet, as we can't unshare a partial block.
+		 */
+		if (xfs_is_reflink_inode(ip)) {
+			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
+			return -EREMCHG;
+		}
		iolock = XFS_IOLOCK_EXCL;
	} else {
		iolock = XFS_IOLOCK_SHARED;
@@ -552,14 +561,6 @@ xfs_file_dio_aio_write(
	}

	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
-
-	/* If this is a block-aligned directio CoW, remap immediately. */
-	if (xfs_is_reflink_inode(ip) && !unaligned_io) {
-		ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count);
-		if (ret)
-			goto out;
-	}
-
	ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
out:
	xfs_iunlock(ip, iolock);
@@ -614,8 +615,10 @@ xfs_file_buffered_aio_write(
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	int			enospc = 0;
-	int			iolock = XFS_IOLOCK_EXCL;
+	int			iolock;

+write_retry:
+	iolock = XFS_IOLOCK_EXCL;
	xfs_ilock(ip, iolock);

	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
@@ -625,7 +628,6 @@ xfs_file_buffered_aio_write(
	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);

-write_retry:
	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
	if (likely(ret >= 0))
@@ -641,18 +643,21 @@ write_retry:
	 * running at the same time.
	 */
	if (ret == -EDQUOT && !enospc) {
+		xfs_iunlock(ip, iolock);
		enospc = xfs_inode_free_quota_eofblocks(ip);
		if (enospc)
			goto write_retry;
		enospc = xfs_inode_free_quota_cowblocks(ip);
		if (enospc)
			goto write_retry;
+		iolock = 0;
	} else if (ret == -ENOSPC && !enospc) {
		struct xfs_eofblocks eofb = {0};

		enospc = 1;
		xfs_flush_inodes(ip->i_mount);
-		eofb.eof_scan_owner = ip->i_ino; /* for locking */
+
+		xfs_iunlock(ip, iolock);
		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
		goto write_retry;
@@ -660,7 +665,8 @@ write_retry:
	current->backing_dev_info = NULL;
out:
-	xfs_iunlock(ip, iolock);
+	if (iolock)
+		xfs_iunlock(ip, iolock);
	return ret;
}

@@ -908,9 +914,9 @@ xfs_dir_open(
	 */
	mode = xfs_ilock_data_map_shared(ip);
	if (ip->i_d.di_nextents > 0)
-		xfs_dir3_data_readahead(ip, 0, -1);
+		error = xfs_dir3_data_readahead(ip, 0, -1);
	xfs_iunlock(ip, mode);
-	return 0;
+	return error;
}

STATIC int

View File

@@ -352,12 +352,7 @@ xfs_growfs_data_private(
			goto error0;
		}

-		if (xfs_sb_version_hascrc(&mp->m_sb))
-			xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1,
-						agno, XFS_BTREE_CRC_BLOCKS);
-		else
-			xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1,
-						agno, 0);
+		xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, agno, 0);

		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
		arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
@@ -381,12 +376,7 @@ xfs_growfs_data_private(
			goto error0;
		}

-		if (xfs_sb_version_hascrc(&mp->m_sb))
-			xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1,
-						agno, XFS_BTREE_CRC_BLOCKS);
-		else
-			xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1,
-						agno, 0);
+		xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, agno, 0);

		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
		arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
@@ -413,8 +403,8 @@ xfs_growfs_data_private(
				goto error0;
			}

-			xfs_btree_init_block(mp, bp, XFS_RMAP_CRC_MAGIC, 0, 0,
-						agno, XFS_BTREE_CRC_BLOCKS);
+			xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 0,
						agno, 0);
			block = XFS_BUF_TO_BLOCK(bp);

@@ -488,12 +478,7 @@ xfs_growfs_data_private(
			goto error0;
		}

-		if (xfs_sb_version_hascrc(&mp->m_sb))
-			xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0,
-						agno, XFS_BTREE_CRC_BLOCKS);
-		else
-			xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0,
-						agno, 0);
+		xfs_btree_init_block(mp, bp, XFS_BTNUM_INO , 0, 0, agno, 0);

		error = xfs_bwrite(bp);
		xfs_buf_relse(bp);
@@ -513,13 +498,8 @@ xfs_growfs_data_private(
				goto error0;
			}

-			if (xfs_sb_version_hascrc(&mp->m_sb))
-				xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC,
-						     0, 0, agno,
-						     XFS_BTREE_CRC_BLOCKS);
-			else
-				xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0,
-						     0, agno, 0);
+			xfs_btree_init_block(mp, bp, XFS_BTNUM_FINO,
+						     0, 0, agno, 0);

			error = xfs_bwrite(bp);
			xfs_buf_relse(bp);
@@ -540,9 +520,8 @@ xfs_growfs_data_private(
				goto error0;
			}

-			xfs_btree_init_block(mp, bp, XFS_REFC_CRC_MAGIC,
-					     0, 0, agno,
-					     XFS_BTREE_CRC_BLOCKS);
+			xfs_btree_init_block(mp, bp, XFS_BTNUM_REFC,
+					     0, 0, agno, 0);

			error = xfs_bwrite(bp);
			xfs_buf_relse(bp);

View File

@@ -1322,13 +1322,10 @@ xfs_inode_free_eofblocks(
	int			flags,
	void			*args)
{
-	int ret;
+	int ret = 0;
	struct xfs_eofblocks *eofb = args;
-	bool need_iolock = true;
	int match;

-	ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
-
	if (!xfs_can_free_eofblocks(ip, false)) {
		/* inode could be preallocated or append-only */
		trace_xfs_inode_free_eofblocks_invalid(ip);
@@ -1356,21 +1353,19 @@ xfs_inode_free_eofblocks(
		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
			return 0;
-
-		/*
-		 * A scan owner implies we already hold the iolock. Skip it in
-		 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
-		 * the possibility of EAGAIN being returned.
-		 */
-		if (eofb->eof_scan_owner == ip->i_ino)
-			need_iolock = false;
	}

-	ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock);
-
-	/* don't revisit the inode if we're not waiting */
-	if (ret == -EAGAIN && !(flags & SYNC_WAIT))
-		ret = 0;
+	/*
+	 * If the caller is waiting, return -EAGAIN to keep the background
+	 * scanner moving and revisit the inode in a subsequent pass.
+	 */
+	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+		if (flags & SYNC_WAIT)
+			ret = -EAGAIN;
+		return ret;
+	}
+	ret = xfs_free_eofblocks(ip);
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);

	return ret;
}
@@ -1417,15 +1412,10 @@ __xfs_inode_free_quota_eofblocks(
	struct xfs_eofblocks eofb = {0};
	struct xfs_dquot *dq;

-	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-
	/*
-	 * Set the scan owner to avoid a potential livelock. Otherwise, the scan
-	 * can repeatedly trylock on the inode we're currently processing. We
-	 * run a sync scan to increase effectiveness and use the union filter to
+	 * Run a sync scan to increase effectiveness and use the union filter to
	 * cover all applicable quotas in a single scan.
	 */
-	eofb.eof_scan_owner = ip->i_ino;
	eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;

	if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
@@ -1577,12 +1567,9 @@ xfs_inode_free_cowblocks(
{
	int ret;
	struct xfs_eofblocks *eofb = args;
-	bool need_iolock = true;
	int match;
	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);

-	ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
-
	/*
	 * Just clear the tag if we have an empty cow fork or none at all. It's
	 * possible the inode was fully unshared since it was originally tagged.
@@ -1615,28 +1602,16 @@ xfs_inode_free_cowblocks(
		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
			return 0;
-
-		/*
-		 * A scan owner implies we already hold the iolock. Skip it in
-		 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
-		 * the possibility of EAGAIN being returned.
-		 */
-		if (eofb->eof_scan_owner == ip->i_ino)
-			need_iolock = false;
	}

	/* Free the CoW blocks */
-	if (need_iolock) {
-		xfs_ilock(ip, XFS_IOLOCK_EXCL);
-		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
-	}
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);

	ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);

-	if (need_iolock) {
-		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-	}
+	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);

	return ret;
}

View File

@@ -27,7 +27,6 @@ struct xfs_eofblocks {
	kgid_t		eof_gid;
	prid_t		eof_prid;
	__u64		eof_min_file_size;
-	xfs_ino_t	eof_scan_owner;
};

#define SYNC_WAIT		0x0001	/* wait for i/o to complete */
@@ -102,7 +101,6 @@ xfs_fs_eofblocks_from_user(
	dst->eof_flags = src->eof_flags;
	dst->eof_prid = src->eof_prid;
	dst->eof_min_file_size = src->eof_min_file_size;
-	dst->eof_scan_owner = NULLFSINO;

	dst->eof_uid = INVALID_UID;
	if (src->eof_flags & XFS_EOF_FLAGS_UID) {

View File

@@ -1692,32 +1692,34 @@ xfs_release(
	if (xfs_can_free_eofblocks(ip, false)) {

		/*
-		 * If we can't get the iolock just skip truncating the blocks
-		 * past EOF because we could deadlock with the mmap_sem
-		 * otherwise. We'll get another chance to drop them once the
-		 * last reference to the inode is dropped, so we'll never leak
-		 * blocks permanently.
-		 *
-		 * Further, check if the inode is being opened, written and
-		 * closed frequently and we have delayed allocation blocks
-		 * outstanding (e.g. streaming writes from the NFS server),
-		 * truncating the blocks past EOF will cause fragmentation to
-		 * occur.
+		 * Check if the inode is being opened, written and closed
+		 * frequently and we have delayed allocation blocks outstanding
+		 * (e.g. streaming writes from the NFS server), truncating the
+		 * blocks past EOF will cause fragmentation to occur.
		 *
-		 * In this case don't do the truncation, either, but we have to
-		 * be careful how we detect this case. Blocks beyond EOF show
-		 * up as i_delayed_blks even when the inode is clean, so we
-		 * need to truncate them away first before checking for a dirty
-		 * release. Hence on the first dirty close we will still remove
-		 * the speculative allocation, but after that we will leave it
-		 * in place.
+		 * In this case don't do the truncation, but we have to be
+		 * careful how we detect this case. Blocks beyond EOF show up as
+		 * i_delayed_blks even when the inode is clean, so we need to
+		 * truncate them away first before checking for a dirty release.
+		 * Hence on the first dirty close we will still remove the
+		 * speculative allocation, but after that we will leave it in
+		 * place.
		 */
		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
			return 0;
+		/*
+		 * If we can't get the iolock just skip truncating the blocks
+		 * past EOF because we could deadlock with the mmap_sem
+		 * otherwise. We'll get another chance to drop them once the
+		 * last reference to the inode is dropped, so we'll never leak
+		 * blocks permanently.
+		 */
+		if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+			error = xfs_free_eofblocks(ip);
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+			if (error)
+				return error;
+		}

-		error = xfs_free_eofblocks(mp, ip, true);
-		if (error && error != -EAGAIN)
-			return error;
-
		/* delalloc blocks after truncation means it really is dirty */
		if (ip->i_delayed_blks)
@@ -1904,8 +1906,11 @@ xfs_inactive(
		 * cache. Post-eof blocks must be freed, lest we end up with
		 * broken free space accounting.
		 */
-		if (xfs_can_free_eofblocks(ip, true))
-			xfs_free_eofblocks(mp, ip, false);
+		if (xfs_can_free_eofblocks(ip, true)) {
+			xfs_ilock(ip, XFS_IOLOCK_EXCL);
+			xfs_free_eofblocks(ip);
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		}

		return;
	}

View File

@@ -1524,7 +1524,7 @@ out_drop_write:
}

STATIC int
-xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
+xfs_getbmap_format(void **ap, struct getbmapx *bmv)
{
	struct getbmap __user	*base = (struct getbmap __user *)*ap;

@@ -1567,7 +1567,7 @@ xfs_ioc_getbmap(
}

STATIC int
-xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
+xfs_getbmapx_format(void **ap, struct getbmapx *bmv)
{
	struct getbmapx __user	*base = (struct getbmapx __user *)*ap;

View File

@@ -162,7 +162,7 @@ xfs_iomap_write_direct(
	xfs_fileoff_t	last_fsb;
	xfs_filblks_t	count_fsb, resaligned;
	xfs_fsblock_t	firstfsb;
-	xfs_extlen_t	extsz, temp;
+	xfs_extlen_t	extsz;
	int		nimaps;
	int		quota_flag;
	int		rt;
@@ -203,14 +203,7 @@ xfs_iomap_write_direct(
	}
	count_fsb = last_fsb - offset_fsb;
	ASSERT(count_fsb > 0);
-
-	resaligned = count_fsb;
-	if (unlikely(extsz)) {
-		if ((temp = do_mod(offset_fsb, extsz)))
-			resaligned += temp;
-		if ((temp = do_mod(resaligned, extsz)))
-			resaligned += extsz - temp;
-	}
+	resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, extsz);

	if (unlikely(rt)) {
		resrtextents = qblocks = resaligned;
@@ -685,7 +678,7 @@ xfs_iomap_write_allocate(
	int		nres;

	if (whichfork == XFS_COW_FORK)
-		flags |= XFS_BMAPI_COWFORK;
+		flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;

	/*
	 * Make sure that the dquots are there.
@@ -1002,47 +995,31 @@ xfs_file_iomap_begin(
	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	end_fsb = XFS_B_TO_FSB(mp, offset + length);

-	if (xfs_is_reflink_inode(ip) &&
-	    (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT)) {
-		shared = xfs_reflink_find_cow_mapping(ip, offset, &imap);
-		if (shared) {
-			xfs_iunlock(ip, lockmode);
-			goto alloc_done;
-		}
-		ASSERT(!isnullstartblock(imap.br_startblock));
-	}
-
	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
			       &nimaps, 0);
	if (error)
		goto out_unlock;

-	if ((flags & IOMAP_REPORT) ||
-	    (xfs_is_reflink_inode(ip) &&
-	     (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT))) {
+	if (flags & IOMAP_REPORT) {
		/* Trim the mapping to the nearest shared extent boundary. */
		error = xfs_reflink_trim_around_shared(ip, &imap, &shared,
				&trimmed);
		if (error)
			goto out_unlock;
-
-		/*
-		 * We're here because we're trying to do a directio write to a
-		 * region that isn't aligned to a filesystem block.  If the
-		 * extent is shared, fall back to buffered mode to handle the
-		 * RMW.
-		 */
-		if (!(flags & IOMAP_REPORT) && shared) {
-			trace_xfs_reflink_bounce_dio_write(ip, &imap);
-			error = -EREMCHG;
-			goto out_unlock;
-		}
	}

	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
-		error = xfs_reflink_reserve_cow(ip, &imap, &shared);
-		if (error)
-			goto out_unlock;
+		if (flags & IOMAP_DIRECT) {
+			/* may drop and re-acquire the ilock */
+			error = xfs_reflink_allocate_cow(ip, &imap, &shared,
+					&lockmode);
+			if (error)
+				goto out_unlock;
+		} else {
+			error = xfs_reflink_reserve_cow(ip, &imap, &shared);
+			if (error)
+				goto out_unlock;
+		}

		end_fsb = imap.br_startoff + imap.br_blockcount;
		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
@@ -1071,7 +1048,6 @@ xfs_file_iomap_begin(
		if (error)
			return error;

-alloc_done:
		iomap->flags = IOMAP_F_NEW;
		trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
	} else {
@@ -1102,7 +1078,19 @@ xfs_file_iomap_end_delalloc(
	xfs_fileoff_t		end_fsb;
	int			error = 0;

-	start_fsb = XFS_B_TO_FSB(mp, offset + written);
+	/* behave as if the write failed if drop writes is enabled */
+	if (xfs_mp_drop_writes(mp))
+		written = 0;
+
+	/*
+	 * start_fsb refers to the first unused block after a short write. If
+	 * nothing was written, round offset down to point at the first block in
+	 * the range.
+	 */
+	if (unlikely(!written))
+		start_fsb = XFS_B_TO_FSBT(mp, offset);
+	else
+		start_fsb = XFS_B_TO_FSB(mp, offset + written);
	end_fsb = XFS_B_TO_FSB(mp, offset + length);

	/*
@@ -1114,6 +1102,9 @@ xfs_file_iomap_end_delalloc(
	 * blocks in the range, they are ours.
	 */
	if (start_fsb < end_fsb) {
+		truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
+					 XFS_FSB_TO_B(mp, end_fsb) - 1);
+
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
					       end_fsb - start_fsb);
@@ -1144,7 +1135,7 @@ xfs_file_iomap_end(
	return 0;
}

-struct iomap_ops xfs_iomap_ops = {
+const struct iomap_ops xfs_iomap_ops = {
	.iomap_begin		= xfs_file_iomap_begin,
	.iomap_end		= xfs_file_iomap_end,
};
@@ -1190,6 +1181,6 @@ out_unlock:
	return error;
}

-struct iomap_ops xfs_xattr_iomap_ops = {
+const struct iomap_ops xfs_xattr_iomap_ops = {
	.iomap_begin		= xfs_xattr_iomap_begin,
};

View File

@@ -33,7 +33,27 @@ void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
		struct xfs_bmbt_irec *);
xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);

-extern struct iomap_ops xfs_iomap_ops;
-extern struct iomap_ops xfs_xattr_iomap_ops;
+static inline xfs_filblks_t
+xfs_aligned_fsb_count(
+	xfs_fileoff_t		offset_fsb,
+	xfs_filblks_t		count_fsb,
+	xfs_extlen_t		extsz)
+{
+	if (extsz) {
+		xfs_extlen_t	align;
+
+		align = do_mod(offset_fsb, extsz);
+		if (align)
+			count_fsb += align;
+		align = do_mod(count_fsb, extsz);
+		if (align)
+			count_fsb += extsz - align;
+	}
+
+	return count_fsb;
+}
+
+extern const struct iomap_ops xfs_iomap_ops;
+extern const struct iomap_ops xfs_xattr_iomap_ops;

#endif /* __XFS_IOMAP_H__*/
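A quick worked example of the rounding above, with illustrative numbers:

/* Hypothetical values, extsz = 8: map [5, 15) in file blocks. */
xfs_filblks_t n = xfs_aligned_fsb_count(5, 10, 8);
/*
 * Step 1: 5 % 8 = 5   -> count_fsb = 10 + 5 = 15 (pull the start back to 0)
 * Step 2: 15 % 8 = 7  -> count_fsb = 15 + (8 - 7) = 16 (round the end up)
 * n == 16, i.e. blocks [0, 16) cover [5, 15) with extent-size-aligned ends.
 */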

View File

@@ -124,7 +124,6 @@ struct xlog_ticket;
struct xfs_log_item;
struct xfs_item_ops;
struct xfs_trans;
-struct xfs_log_callback;

xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
		       struct xlog_ticket *ticket,

View File

@@ -30,6 +30,9 @@
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
+#include "xfs_trace.h"
+
+struct workqueue_struct *xfs_discard_wq;

/*
 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
@@ -491,6 +494,75 @@ xlog_cil_free_logvec(
	}
}

+static void
+xlog_discard_endio_work(
+	struct work_struct	*work)
+{
+	struct xfs_cil_ctx	*ctx =
+		container_of(work, struct xfs_cil_ctx, discard_endio_work);
+	struct xfs_mount	*mp = ctx->cil->xc_log->l_mp;
+
+	xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
+	kmem_free(ctx);
+}
+
+/*
+ * Queue up the actual completion to a thread to avoid IRQ-safe locking for
+ * pagb_lock. Note that we need an unbounded workqueue, otherwise we might
+ * get the execution delayed up to 30 seconds for weird reasons.
+ */
+static void
+xlog_discard_endio(
+	struct bio		*bio)
+{
+	struct xfs_cil_ctx	*ctx = bio->bi_private;
+
+	INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work);
+	queue_work(xfs_discard_wq, &ctx->discard_endio_work);
+}
+
+static void
+xlog_discard_busy_extents(
+	struct xfs_mount	*mp,
+	struct xfs_cil_ctx	*ctx)
+{
+	struct list_head	*list = &ctx->busy_extents;
+	struct xfs_extent_busy	*busyp;
+	struct bio		*bio = NULL;
+	struct blk_plug		plug;
+	int			error = 0;
+
+	ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
+
+	blk_start_plug(&plug);
+	list_for_each_entry(busyp, list, list) {
+		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
+					 busyp->length);
+
+		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
+				XFS_FSB_TO_BB(mp, busyp->length),
+				GFP_NOFS, 0, &bio);
+		if (error && error != -EOPNOTSUPP) {
+			xfs_info(mp,
+	 "discard failed for extent [0x%llx,%u], error %d",
+				(unsigned long long)busyp->bno,
+				busyp->length,
+				error);
+			break;
+		}
+	}
+
+	if (bio) {
+		bio->bi_private = ctx;
+		bio->bi_end_io = xlog_discard_endio;
+		submit_bio(bio);
+	} else {
+		xlog_discard_endio_work(&ctx->discard_endio_work);
+	}
+	blk_finish_plug(&plug);
+}
+
/*
 * Mark all items committed and clear busy extents. We free the log vector
 * chains in a separate pass so that we unpin the log items as quickly as
@@ -525,14 +597,10 @@ xlog_cil_committed(
	xlog_cil_free_logvec(ctx->lv_chain);

-	if (!list_empty(&ctx->busy_extents)) {
-		ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
-
-		xfs_discard_extents(mp, &ctx->busy_extents);
-		xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
-	}
-
-	kmem_free(ctx);
+	if (!list_empty(&ctx->busy_extents))
+		xlog_discard_busy_extents(mp, ctx);
+	else
+		kmem_free(ctx);
}

/*

View File

@@ -257,6 +257,7 @@ struct xfs_cil_ctx {
	struct xfs_log_vec	*lv_chain;	/* logvecs being pushed */
	struct xfs_log_callback	log_cb;		/* completion callback hook. */
	struct list_head	committing;	/* ctx committing list */
+	struct work_struct	discard_endio_work;
};

/*

View File

@@ -45,6 +45,7 @@
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_reflink.h"
+#include "xfs_extent_busy.h"

static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -187,7 +188,7 @@ xfs_initialize_perag(
	xfs_agnumber_t	*maxagi)
{
	xfs_agnumber_t	index;
-	xfs_agnumber_t	first_initialised = 0;
+	xfs_agnumber_t	first_initialised = NULLAGNUMBER;
	xfs_perag_t	*pag;
	int		error = -ENOMEM;

@@ -202,22 +203,21 @@ xfs_initialize_perag(
			xfs_perag_put(pag);
			continue;
		}
-		if (!first_initialised)
-			first_initialised = index;
+
		pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
		if (!pag)
-			goto out_unwind;
+			goto out_unwind_new_pags;
		pag->pag_agno = index;
		pag->pag_mount = mp;
		spin_lock_init(&pag->pag_ici_lock);
		mutex_init(&pag->pag_ici_reclaim_lock);
		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
		if (xfs_buf_hash_init(pag))
-			goto out_unwind;
+			goto out_free_pag;
+		init_waitqueue_head(&pag->pagb_wait);

		if (radix_tree_preload(GFP_NOFS))
-			goto out_unwind;
+			goto out_hash_destroy;

		spin_lock(&mp->m_perag_lock);
		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
@@ -225,10 +225,13 @@ xfs_initialize_perag(
			spin_unlock(&mp->m_perag_lock);
			radix_tree_preload_end();
			error = -EEXIST;
-			goto out_unwind;
+			goto out_hash_destroy;
		}
		spin_unlock(&mp->m_perag_lock);
		radix_tree_preload_end();
+		/* first new pag is fully initialized */
+		if (first_initialised == NULLAGNUMBER)
+			first_initialised = index;
	}

	index = xfs_set_inode_alloc(mp, agcount);
@@ -239,11 +242,16 @@ xfs_initialize_perag(
	mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
	return 0;

-out_unwind:
+out_hash_destroy:
	xfs_buf_hash_destroy(pag);
+out_free_pag:
	kmem_free(pag);
-	for (; index > first_initialised; index--) {
+out_unwind_new_pags:
+	/* unwind any prior newly initialized pags */
+	for (index = first_initialised; index < agcount; index++) {
		pag = radix_tree_delete(&mp->m_perag_tree, index);
+		if (!pag)
+			break;
		xfs_buf_hash_destroy(pag);
		kmem_free(pag);
	}
@@ -1072,6 +1080,13 @@ xfs_unmountfs(
	 */
	xfs_log_force(mp, XFS_LOG_SYNC);

+	/*
+	 * Wait for all busy extents to be freed, including completion of
+	 * any discard operation.
+	 */
+	xfs_extent_busy_wait_all(mp);
+	flush_workqueue(xfs_discard_wq);
+
	/*
	 * We now need to tell the world we are unmounting. This will allow
	 * us to detect that the filesystem is going away and we should error

View File

@@ -200,11 +200,12 @@ typedef struct xfs_mount {
	/*
	 * DEBUG mode instrumentation to test and/or trigger delayed allocation
	 * block killing in the event of failed writes. When enabled, all
-	 * buffered writes are forced to fail. All delalloc blocks in the range
-	 * of the write (including pre-existing delalloc blocks!) are tossed as
-	 * part of the write failure error handling sequence.
+	 * buffered writes are silently dropped and handled as if they failed.
+	 * All delalloc blocks in the range of the write (including pre-existing
+	 * delalloc blocks!) are tossed as part of the write failure error
+	 * handling sequence.
	 */
-	bool			m_fail_writes;
+	bool			m_drop_writes;
#endif
} xfs_mount_t;

@@ -325,13 +326,13 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
#ifdef DEBUG
static inline bool
-xfs_mp_fail_writes(struct xfs_mount *mp)
+xfs_mp_drop_writes(struct xfs_mount *mp)
{
-	return mp->m_fail_writes;
+	return mp->m_drop_writes;
}
#else
static inline bool
-xfs_mp_fail_writes(struct xfs_mount *mp)
+xfs_mp_drop_writes(struct xfs_mount *mp)
{
	return 0;
}
@@ -384,6 +385,8 @@ typedef struct xfs_perag {
	xfs_agino_t	pagl_rightrec;
	spinlock_t	pagb_lock;	/* lock for pagb_tree */
	struct rb_root	pagb_tree;	/* ordered tree of busy extents */
+	unsigned int	pagb_gen;	/* generation count for pagb_tree */
+	wait_queue_head_t pagb_wait;	/* woken when pagb_gen changes */

	atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */

View File

@@ -82,11 +82,22 @@
 * mappings are a reservation against the free space in the filesystem;
 * adjacent mappings can also be combined into fewer larger mappings.
 *
+ * As an optimization, the CoW extent size hint (cowextsz) creates
+ * outsized aligned delalloc reservations in the hope of landing out of
+ * order nearby CoW writes in a single extent on disk, thereby reducing
+ * fragmentation and improving future performance.
+ *
+ * D: --RRRRRRSSSRRRRRRRR--- (data fork)
+ * C: ------DDDDDDD--------- (CoW fork)
+ *
 * When dirty pages are being written out (typically in writepage), the
- * delalloc reservations are converted into real mappings by allocating
- * blocks and replacing the delalloc mapping with real ones.  A delalloc
- * mapping can be replaced by several real ones if the free space is
- * fragmented.
+ * delalloc reservations are converted into unwritten mappings by
+ * allocating blocks and replacing the delalloc mapping with real ones.
+ * A delalloc mapping can be replaced by several unwritten ones if the
+ * free space is fragmented.
+ *
+ * D: --RRRRRRSSSRRRRRRRR---
+ * C: ------UUUUUUU---------
 *
 * We want to adapt the delalloc mechanism for copy-on-write, since the
 * write paths are similar.  The first two steps (creating the reservation
@@ -101,13 +112,29 @@
 * Block-aligned directio writes will use the same mechanism as buffered
 * writes.
 *
+ * Just prior to submitting the actual disk write requests, we convert
+ * the extents representing the range of the file actually being written
+ * (as opposed to extra pieces created for the cowextsize hint) to real
+ * extents.  This will become important in the next step:
+ *
+ * D: --RRRRRRSSSRRRRRRRR---
+ * C: ------UUrrUUU---------
+ *
 * CoW remapping must be done after the data block write completes,
 * because we don't want to destroy the old data fork map until we're sure
 * the new block has been written.  Since the new mappings are kept in a
 * separate fork, we can simply iterate these mappings to find the ones
 * that cover the file blocks that we just CoW'd.  For each extent, simply
 * unmap the corresponding range in the data fork, map the new range into
- * the data fork, and remove the extent from the CoW fork.
+ * the data fork, and remove the extent from the CoW fork.  Because of
+ * the presence of the cowextsize hint, however, we must be careful
+ * only to remap the blocks that we've actually written out -- we must
+ * never remap delalloc reservations nor CoW staging blocks that have
+ * yet to be written.  This corresponds exactly to the real extents in
+ * the CoW fork:
+ *
+ * D: --RRRRRRrrSRRRRRRRR---
+ * C: ------UU--UUU---------
 *
 * Since the remapping operation can be applied to an arbitrary file
 * range, we record the need for the remap step as a flag in the ioend
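The "remap only what was written" rule at the end of this comment reduces to a small filter over CoW fork extents. An illustrative helper (the name and scaffolding are hypothetical; only the two checks are implied by the comment):

/*
 * Illustrative remap filter: skip anything that is still a delalloc
 * reservation or an unwritten staging extent; only XFS_EXT_NORM extents
 * have had their data written and may replace the data fork mapping.
 */
static bool
example_cow_extent_is_remappable(
	struct xfs_bmbt_irec	*got)
{
	if (isnullstartblock(got->br_startblock))
		return false;		/* delalloc reservation */
	if (got->br_state != XFS_EXT_NORM)
		return false;		/* unwritten staging extent */
	return true;
}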
@@ -296,103 +323,165 @@ xfs_reflink_reserve_cow(
	return 0;
}

-/* Allocate all CoW reservations covering a range of blocks in a file. */
-static int
-__xfs_reflink_allocate_cow(
-	struct xfs_inode	*ip,
-	xfs_fileoff_t		*offset_fsb,
-	xfs_fileoff_t		end_fsb)
+/* Convert part of an unwritten CoW extent to a real one. */
+STATIC int
+xfs_reflink_convert_cow_extent(
+	struct xfs_inode		*ip,
+	struct xfs_bmbt_irec		*imap,
+	xfs_fileoff_t			offset_fsb,
+	xfs_filblks_t			count_fsb,
+	struct xfs_defer_ops		*dfops)
{
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_bmbt_irec	imap;
-	struct xfs_defer_ops	dfops;
-	struct xfs_trans	*tp;
-	xfs_fsblock_t		first_block;
-	int			nimaps = 1, error;
-	bool			shared;
-
-	xfs_defer_init(&dfops, &first_block);
+	xfs_fsblock_t			first_block;
+	int				nimaps = 1;

-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
-			XFS_TRANS_RESERVE, &tp);
-	if (error)
-		return error;
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-	/* Read extent from the source file. */
-	nimaps = 1;
-	error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb,
-			&imap, &nimaps, 0);
-	if (error)
-		goto out_unlock;
-	ASSERT(nimaps == 1);
-
-	error = xfs_reflink_reserve_cow(ip, &imap, &shared);
-	if (error)
-		goto out_trans_cancel;
-
-	if (!shared) {
-		*offset_fsb = imap.br_startoff + imap.br_blockcount;
-		goto out_trans_cancel;
-	}
-
-	xfs_trans_ijoin(tp, ip, 0);
-	error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount,
-			XFS_BMAPI_COWFORK, &first_block,
-			XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK),
-			&imap, &nimaps, &dfops);
-	if (error)
-		goto out_trans_cancel;
-
-	error = xfs_defer_finish(&tp, &dfops, NULL);
-	if (error)
-		goto out_trans_cancel;
-
-	error = xfs_trans_commit(tp);
-
-	*offset_fsb = imap.br_startoff + imap.br_blockcount;
-out_unlock:
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return error;
-out_trans_cancel:
-	xfs_defer_cancel(&dfops);
-	xfs_trans_cancel(tp);
-	goto out_unlock;
+	if (imap->br_state == XFS_EXT_NORM)
+		return 0;
+
+	xfs_trim_extent(imap, offset_fsb, count_fsb);
+	trace_xfs_reflink_convert_cow(ip, imap);
+	if (imap->br_blockcount == 0)
+		return 0;
+	return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
+			XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, &first_block,
+			0, imap, &nimaps, dfops);
}

-/* Allocate all CoW reservations covering a part of a file. */
+/* Convert all of the unwritten CoW extents in a file's range to real ones. */
int
-xfs_reflink_allocate_cow_range(
+xfs_reflink_convert_cow(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		count)
{
+	struct xfs_bmbt_irec	got;
+	struct xfs_defer_ops	dfops;
	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
-	int			error;
-
-	ASSERT(xfs_is_reflink_inode(ip));
-
-	trace_xfs_reflink_allocate_cow_range(ip, offset, count);
+	xfs_extnum_t		idx;
+	bool			found;
+	int			error = 0;

-	/*
-	 * Make sure that the dquots are there.
-	 */
-	error = xfs_qm_dqattach(ip, 0);
-	if (error)
-		return error;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);

-	while (offset_fsb < end_fsb) {
-		error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb);
-		if (error) {
-			trace_xfs_reflink_allocate_cow_range_error(ip, error,
-					_RET_IP_);
-			break;
-		}
+	/* Convert all the extents to real from unwritten. */
+	for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got);
+	     found && got.br_startoff < end_fsb;
+	     found = xfs_iext_get_extent(ifp, ++idx, &got)) {
+		error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb,
+					end_fsb - offset_fsb, &dfops);
+		if (error)
+			break;
	}

+	/* Finish up. */
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
/* Allocate all CoW reservations covering a range of blocks in a file. */
int
xfs_reflink_allocate_cow(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
bool *shared,
uint *lockmode)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = imap->br_startoff;
xfs_filblks_t count_fsb = imap->br_blockcount;
struct xfs_bmbt_irec got;
struct xfs_defer_ops dfops;
struct xfs_trans *tp = NULL;
xfs_fsblock_t first_block;
int nimaps, error = 0;
bool trimmed;
xfs_filblks_t resaligned;
xfs_extlen_t resblks = 0;
xfs_extnum_t idx;
retry:
ASSERT(xfs_is_reflink_inode(ip));
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
/*
* Even if the extent is not shared we might have a preallocation for
* it in the COW fork. If so use it.
*/
if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &idx, &got) &&
got.br_startoff <= offset_fsb) {
*shared = true;
/* If we have a real allocation in the COW fork we're done. */
if (!isnullstartblock(got.br_startblock)) {
xfs_trim_extent(&got, offset_fsb, count_fsb);
*imap = got;
goto convert;
}
xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
} else {
error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
if (error || !*shared)
goto out;
}
if (!tp) {
resaligned = xfs_aligned_fsb_count(imap->br_startoff,
imap->br_blockcount, xfs_get_cowextsz_hint(ip));
resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
xfs_iunlock(ip, *lockmode);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
*lockmode = XFS_ILOCK_EXCL;
xfs_ilock(ip, *lockmode);
if (error)
return error;
error = xfs_qm_dqattach_locked(ip, 0);
if (error)
goto out;
goto retry;
}
error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
XFS_QMOPT_RES_REGBLKS);
if (error)
goto out;
xfs_trans_ijoin(tp, ip, 0);
xfs_defer_init(&dfops, &first_block);
nimaps = 1;
/* Allocate the entire reservation as unwritten blocks. */
error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block,
resblks, imap, &nimaps, &dfops);
if (error)
goto out_bmap_cancel;
/* Finish up. */
error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
goto out_bmap_cancel;
error = xfs_trans_commit(tp);
if (error)
return error;
convert:
return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb,
&dfops);
out_bmap_cancel:
xfs_defer_cancel(&dfops);
xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
XFS_QMOPT_RES_REGBLKS);
out:
if (tp)
xfs_trans_cancel(tp);
return error; return error;
} }
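
One shape worth noting in the new xfs_reflink_allocate_cow() above: the transaction is allocated only after dropping the inode lock (transaction allocation can sleep), and the extent lookup is then redone from the retry label, because the mapping may have changed while the lock was released. A user-space sketch of that drop-allocate-relock-retry pattern, with invented names and a pthread mutex standing in for the inode lock:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct txn { int reserved_blocks; };

static pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;

static bool lookup_mapping(int *out)
{
        *out = 42;              /* pretend the lookup found a mapping */
        return true;
}

static int allocate_with_retry(void)
{
        struct txn t = { 0 };
        bool have_txn = false;
        int mapping;

        pthread_mutex_lock(&ilock);
retry:
        if (!lookup_mapping(&mapping)) {
                pthread_mutex_unlock(&ilock);
                return -1;
        }
        if (!have_txn) {
                /* Drop the lock: "allocating" the transaction may block. */
                pthread_mutex_unlock(&ilock);
                t.reserved_blocks = 16;
                have_txn = true;
                pthread_mutex_lock(&ilock);
                /* The mapping may have changed while unlocked: redo it. */
                goto retry;
        }
        printf("using mapping %d with %d reserved blocks\n",
               mapping, t.reserved_blocks);
        pthread_mutex_unlock(&ilock);
        return 0;
}

int main(void)
{
        return allocate_with_retry();
}
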
@@ -641,6 +730,16 @@ xfs_reflink_end_cow(

                ASSERT(!isnullstartblock(got.br_startblock));

+               /*
+                * Don't remap unwritten extents; these are
+                * speculatively preallocated CoW extents that have been
+                * allocated but have not yet been involved in a write.
+                */
+               if (got.br_state == XFS_EXT_UNWRITTEN) {
+                       idx--;
+                       goto next_extent;
+               }
+
                /* Unmap the old blocks in the data fork. */
                xfs_defer_init(&dfops, &firstfsb);
                rlen = del.br_blockcount;
@@ -855,13 +954,14 @@ STATIC int
 xfs_reflink_update_dest(
        struct xfs_inode        *dest,
        xfs_off_t               newlen,
-       xfs_extlen_t            cowextsize)
+       xfs_extlen_t            cowextsize,
+       bool                    is_dedupe)
 {
        struct xfs_mount        *mp = dest->i_mount;
        struct xfs_trans        *tp;
        int                     error;

-       if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
+       if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
                return 0;

        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
@@ -882,6 +982,10 @@ xfs_reflink_update_dest(
                dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
        }

+       if (!is_dedupe) {
+               xfs_trans_ichgtime(tp, dest,
+                               XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+       }
        xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);

        error = xfs_trans_commit(tp);
@@ -1195,7 +1299,8 @@ xfs_reflink_remap_range(
            !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
                cowextsize = src->i_d.di_cowextsize;

-       ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize);
+       ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
+                       is_dedupe);

 out_unlock:
        xfs_iunlock(src, XFS_MMAPLOCK_EXCL);

diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h

@@ -28,8 +28,10 @@ extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
 extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
                struct xfs_bmbt_irec *imap, bool *shared);
-extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
-               xfs_off_t offset, xfs_off_t count);
+extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
+               struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
+extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
+               xfs_off_t count);
 extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
                struct xfs_bmbt_irec *imap);
 extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,

diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c

@@ -1093,7 +1093,6 @@ xfs_rtallocate_extent(
        xfs_extlen_t    minlen,         /* minimum length to allocate */
        xfs_extlen_t    maxlen,         /* maximum length to allocate */
        xfs_extlen_t    *len,           /* out: actual length allocated */
-       xfs_alloctype_t type,           /* allocation type XFS_ALLOCTYPE... */
        int             wasdel,         /* was a delayed allocation extent */
        xfs_extlen_t    prod,           /* extent product factor */
        xfs_rtblock_t   *rtblock)       /* out: start block allocated */
@@ -1123,27 +1122,16 @@ xfs_rtallocate_extent(
                }
        }

+retry:
        sumbp = NULL;
-       /*
-        * Allocate by size, or near another block, or exactly at some block.
-        */
-       switch (type) {
-       case XFS_ALLOCTYPE_ANY_AG:
+       if (bno == 0) {
                error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len,
                                &sumbp, &sb, prod, &r);
-               break;
-       case XFS_ALLOCTYPE_NEAR_BNO:
+       } else {
                error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen,
                                len, &sumbp, &sb, prod, &r);
-               break;
-       case XFS_ALLOCTYPE_THIS_BNO:
-               error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen,
-                               len, &sumbp, &sb, prod, &r);
-               break;
-       default:
-               error = -EIO;
-               ASSERT(0);
        }
+
        if (error)
                return error;
@@ -1158,7 +1146,11 @@ xfs_rtallocate_extent(
                        xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen);
                else
                        xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen);
+       } else if (prod > 1) {
+               prod = 1;
+               goto retry;
        }
+
        *rtblock = r;
        return 0;
 }
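
The new retry label above gives xfs_rtallocate_extent() a fallback: if no extent can be found in multiples of the product factor prod, the function retries once with prod = 1 so that any suitably sized free extent qualifies. A toy version of that fallback, under the assumption of an allocator that only succeeds for unaligned requests (all names invented, not XFS code):

#include <stdbool.h>
#include <stdio.h>

/* Toy allocator: pretend only unaligned (prod == 1) requests succeed. */
static bool toy_alloc(unsigned int len, unsigned int prod,
                      unsigned int *start)
{
        (void)len;
        if (prod != 1)
                return false;
        *start = 1000;          /* fake start block */
        return true;
}

static int allocate_extent(unsigned int len, unsigned int prod,
                           unsigned int *start)
{
retry:
        if (toy_alloc(len, prod, start))
                return 0;
        if (prod > 1) {
                prod = 1;       /* drop the alignment requirement and retry */
                goto retry;
        }
        return -1;              /* genuinely out of space */
}

int main(void)
{
        unsigned int start;

        if (allocate_extent(24, 8, &start) == 0)
                printf("allocated at block %u\n", start);
        return 0;
}
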

diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h

@@ -40,7 +40,6 @@ xfs_rtallocate_extent(
        xfs_extlen_t    minlen,         /* minimum length to allocate */
        xfs_extlen_t    maxlen,         /* maximum length to allocate */
        xfs_extlen_t    *len,           /* out: actual length allocated */
-       xfs_alloctype_t type,           /* allocation type XFS_ALLOCTYPE... */
        int             wasdel,         /* was a delayed allocation extent */
        xfs_extlen_t    prod,           /* extent product factor */
        xfs_rtblock_t   *rtblock);      /* out: start block allocated */
@@ -122,7 +121,7 @@ int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
 #else
-# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb)  (ENOSYS)
+# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb)    (ENOSYS)
 # define xfs_rtfree_extent(t,b,l)                       (ENOSYS)
 # define xfs_rtpick_extent(m,t,l,rb)                    (ENOSYS)
 # define xfs_growfs_rt(mp,in)                           (ENOSYS)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c

@@ -1956,12 +1956,20 @@ xfs_init_workqueues(void)
        if (!xfs_alloc_wq)
                return -ENOMEM;

+       xfs_discard_wq = alloc_workqueue("xfsdiscard", WQ_UNBOUND, 0);
+       if (!xfs_discard_wq)
+               goto out_free_alloc_wq;
+
        return 0;
+out_free_alloc_wq:
+       destroy_workqueue(xfs_alloc_wq);
+       return -ENOMEM;
 }

 STATIC void
 xfs_destroy_workqueues(void)
 {
+       destroy_workqueue(xfs_discard_wq);
        destroy_workqueue(xfs_alloc_wq);
 }
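
The error path added above follows the usual kernel unwind idiom: each resource acquired so far has a matching cleanup label, and a later failure jumps to the label that releases everything created up to that point, in reverse order. A small user-space rendition of the same shape, with heap allocations standing in for workqueues (names are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct workqueue { const char *name; };

static struct workqueue *wq_create(const char *name)
{
        struct workqueue *wq = malloc(sizeof(*wq));

        if (wq)
                wq->name = name;
        return wq;
}

static struct workqueue *alloc_wq, *discard_wq;

static int init_workqueues(void)
{
        alloc_wq = wq_create("xfsalloc");
        if (!alloc_wq)
                return -1;

        discard_wq = wq_create("xfsdiscard");
        if (!discard_wq)
                goto out_free_alloc_wq;

        return 0;

out_free_alloc_wq:
        /* Undo the earlier step before reporting failure. */
        free(alloc_wq);
        return -1;
}

int main(void)
{
        if (init_workqueues())
                return 1;
        printf("created %s and %s\n", alloc_wq->name, discard_wq->name);
        free(discard_wq);
        free(alloc_wq);
        return 0;
}
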

diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h

@@ -73,6 +73,8 @@ extern const struct quotactl_ops xfs_quotactl_operations;

 extern void xfs_reinit_percpu_counters(struct xfs_mount *mp);

+extern struct workqueue_struct *xfs_discard_wq;
+
 #define XFS_M(sb)              ((struct xfs_mount *)((sb)->s_fs_info))

 #endif /* __XFS_SUPER_H__ */

diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c

@@ -93,7 +93,7 @@ to_mp(kobject)
 #ifdef DEBUG

 STATIC ssize_t
-fail_writes_store(
+drop_writes_store(
        struct kobject  *kobject,
        const char      *buf,
        size_t          count)
@@ -107,9 +107,9 @@ fail_writes_store(
                return ret;

        if (val == 1)
-               mp->m_fail_writes = true;
+               mp->m_drop_writes = true;
        else if (val == 0)
-               mp->m_fail_writes = false;
+               mp->m_drop_writes = false;
        else
                return -EINVAL;
@@ -117,21 +117,21 @@ fail_writes_store(
 }

 STATIC ssize_t
-fail_writes_show(
+drop_writes_show(
        struct kobject  *kobject,
        char            *buf)
 {
        struct xfs_mount        *mp = to_mp(kobject);

-       return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_writes ? 1 : 0);
+       return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_drop_writes ? 1 : 0);
 }
-XFS_SYSFS_ATTR_RW(fail_writes);
+XFS_SYSFS_ATTR_RW(drop_writes);

 #endif /* DEBUG */

 static struct attribute *xfs_mp_attrs[] = {
 #ifdef DEBUG
-       ATTR_LIST(fail_writes),
+       ATTR_LIST(drop_writes),
 #endif
        NULL,
 };
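
With the rename, the knob is exposed per mount (on DEBUG kernels only) as a sysfs attribute named drop_writes; writing 1 makes XFS accept and then silently drop buffered writes, and 0 restores normal behaviour. A sketch of toggling it from user space -- the device name in the path below is a placeholder and depends on the actual mount:

#include <stdio.h>

int main(void)
{
        /* Placeholder path: the directory component is the data device of
         * the mounted filesystem, and the attribute only exists on DEBUG
         * kernels. */
        const char *path = "/sys/fs/xfs/sdb1/drop_writes";
        FILE *f = fopen(path, "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        fputs("1\n", f);        /* 1 = drop buffered writes, 0 = normal */
        fclose(f);
        return 0;
}
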

diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h

@@ -2245,7 +2245,6 @@ DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range);

 /* deferred ops */
 struct xfs_defer_pending;
-struct xfs_defer_intake;
 struct xfs_defer_ops;

 DECLARE_EVENT_CLASS(xfs_defer_class,
@@ -3089,6 +3088,7 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class,
                __field(xfs_fileoff_t, lblk)
                __field(xfs_extlen_t, len)
                __field(xfs_fsblock_t, pblk)
+               __field(int, state)
        ),
        TP_fast_assign(
                __entry->dev = VFS_I(ip)->i_sb->s_dev;
@@ -3096,13 +3096,15 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class,
                __entry->lblk = irec->br_startoff;
                __entry->len = irec->br_blockcount;
                __entry->pblk = irec->br_startblock;
+               __entry->state = irec->br_state;
        ),
-       TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu",
+       TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu st %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->lblk,
                  __entry->len,
-                 __entry->pblk)
+                 __entry->pblk,
+                 __entry->state)
 );
 #define DEFINE_INODE_IREC_EVENT(name) \
 DEFINE_EVENT(xfs_inode_irec_class, name, \
@@ -3242,11 +3244,11 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);

 DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
-DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range);

-DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write);
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
 DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec);
@@ -3254,7 +3256,6 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);

-DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);

diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h

@@ -32,7 +32,6 @@ struct xfs_mount;
 struct xfs_trans;
 struct xfs_trans_res;
 struct xfs_dquot_acct;
-struct xfs_busy_extent;
 struct xfs_rud_log_item;
 struct xfs_rui_log_item;
 struct xfs_btree_cur;

diff --git a/include/linux/dax.h b/include/linux/dax.h

@@ -37,9 +37,9 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
 }

 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
-               struct iomap_ops *ops);
+               const struct iomap_ops *ops);
 int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-               struct iomap_ops *ops);
+               const struct iomap_ops *ops);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
@@ -72,7 +72,7 @@ static inline unsigned int dax_radix_order(void *entry)
        return 0;
 }
 int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-               pmd_t *pmd, unsigned int flags, struct iomap_ops *ops);
+               pmd_t *pmd, unsigned int flags, const struct iomap_ops *ops);
 #else
 static inline unsigned int dax_radix_order(void *entry)
 {
@@ -80,7 +80,7 @@ static inline unsigned int dax_radix_order(void *entry)
 }
 static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmd, unsigned int flags,
-               struct iomap_ops *ops)
+               const struct iomap_ops *ops)
 {
        return VM_FAULT_FALLBACK;
 }

diff --git a/include/linux/iomap.h b/include/linux/iomap.h

@@ -72,17 +72,17 @@ struct iomap_ops {
 };

 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
-               struct iomap_ops *ops);
+               const struct iomap_ops *ops);
 int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
-               struct iomap_ops *ops);
+               const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
-               bool *did_zero, struct iomap_ops *ops);
+               bool *did_zero, const struct iomap_ops *ops);
 int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
-               struct iomap_ops *ops);
+               const struct iomap_ops *ops);
 int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-               struct iomap_ops *ops);
+               const struct iomap_ops *ops);
 int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-               loff_t start, loff_t len, struct iomap_ops *ops);
+               loff_t start, loff_t len, const struct iomap_ops *ops);

 /*
  * Flags for direct I/O ->end_io:
@@ -92,6 +92,6 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret,
                unsigned flags);
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
-               struct iomap_ops *ops, iomap_dio_end_io_t end_io);
+               const struct iomap_ops *ops, iomap_dio_end_io_t end_io);

 #endif /* LINUX_IOMAP_H */
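
The const-ification running through both headers above reflects a simple pattern: an operations table that is never written after build time can be declared const, letting the compiler place it in read-only memory and letting every consumer take a const pointer. A compact stand-alone illustration (the struct below is a simplified stand-in, not the kernel's iomap_ops):

#include <stdio.h>

struct iomap_ops_sketch {
        int (*begin)(long pos, long len);
        int (*end)(long pos, long len);
};

static int demo_begin(long pos, long len)
{
        printf("begin: pos=%ld len=%ld\n", pos, len);
        return 0;
}

static int demo_end(long pos, long len)
{
        printf("end: pos=%ld len=%ld\n", pos, len);
        return 0;
}

/* const: the table is shared, immutable, and can live in .rodata. */
static const struct iomap_ops_sketch demo_ops = {
        .begin  = demo_begin,
        .end    = demo_end,
};

static int apply(long pos, long len, const struct iomap_ops_sketch *ops)
{
        int ret = ops->begin(pos, len);

        if (ret)
                return ret;
        return ops->end(pos, len);
}

int main(void)
{
        return apply(0, 4096, &demo_ops);
}
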