From 3ae4c9deb30a8d5ee305b461625dcb298c9804a9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 12:01:50 +1000 Subject: [PATCH 01/36] xfs: use range primitives for xfs page cache operations While XFS passes ranges to operate on from the core code, the functions being called ignore the either the entire range or the end of the range. This is historical because when the function were written linux didn't have the necessary range operations. Update the functions to use the correct operations. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_fs_subr.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index 1f279b012f94..ed88ed16811c 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -32,10 +32,9 @@ xfs_tosspages( xfs_off_t last, int fiopt) { - struct address_space *mapping = VFS_I(ip)->i_mapping; - - if (mapping->nrpages) - truncate_inode_pages(mapping, first); + /* can't toss partial tail pages, so mask them out */ + last &= ~(PAGE_SIZE - 1); + truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1); } int @@ -50,12 +49,11 @@ xfs_flushinval_pages( trace_xfs_pagecache_inval(ip, first, last); - if (mapping->nrpages) { - xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = filemap_write_and_wait(mapping); - if (!ret) - truncate_inode_pages(mapping, first); - } + xfs_iflags_clear(ip, XFS_ITRUNCATED); + ret = filemap_write_and_wait_range(mapping, first, + last == -1 ? LLONG_MAX : last); + if (!ret) + truncate_inode_pages_range(mapping, first, last); return -ret; } @@ -71,10 +69,9 @@ xfs_flush_pages( int ret = 0; int ret2; - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = -filemap_fdatawrite(mapping); - } + xfs_iflags_clear(ip, XFS_ITRUNCATED); + ret = -filemap_fdatawrite_range(mapping, first, + last == -1 ? LLONG_MAX : last); if (flags & XBF_ASYNC) return ret; ret2 = xfs_wait_on_pages(ip, first, last); @@ -91,7 +88,9 @@ xfs_wait_on_pages( { struct address_space *mapping = VFS_I(ip)->i_mapping; - if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) - return -filemap_fdatawait(mapping); + if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { + return -filemap_fdatawait_range(mapping, first, + last == -1 ? ip->i_size - 1 : last); + } return 0; } From 447223520520b17d3b6d0631aa4838fbaf8eddb4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 12:02:11 +1000 Subject: [PATCH 02/36] xfs: Introduce XFS_IOC_ZERO_RANGE XFS_IOC_ZERO_RANGE is the equivalent of an atomic XFS_IOC_UNRESVSP/ XFS_IOC_RESVSP call pair. It enabled ranges of written data to be turned into zeroes without requiring IO or having to free and reallocate the extents in the range given as would occur if we had to punch and then preallocate them separately. This enables applications to zero parts of files very quickly without changing the layout of the files in any way. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_ioctl.c | 3 ++- fs/xfs/linux-2.6/xfs_ioctl32.c | 1 + fs/xfs/xfs_bmap.c | 12 +++++++++--- fs/xfs/xfs_bmap.h | 9 ++++++--- fs/xfs/xfs_fs.h | 1 + fs/xfs/xfs_vnodeops.c | 10 ++++++++-- 6 files changed, 27 insertions(+), 9 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 3b9e626f7cd1..03aa908a9cb9 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -1301,7 +1301,8 @@ xfs_file_ioctl( case XFS_IOC_ALLOCSP64: case XFS_IOC_FREESP64: case XFS_IOC_RESVSP64: - case XFS_IOC_UNRESVSP64: { + case XFS_IOC_UNRESVSP64: + case XFS_IOC_ZERO_RANGE: { xfs_flock64_t bf; if (copy_from_user(&bf, arg, sizeof(bf))) diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 6c83f7f62dc9..464bcc76e980 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -574,6 +574,7 @@ xfs_file_compat_ioctl( case XFS_IOC_FSGEOMETRY_V1: case XFS_IOC_FSGROWFSDATA: case XFS_IOC_FSGROWFSRT: + case XFS_IOC_ZERO_RANGE: return xfs_file_ioctl(filp, cmd, p); #else case XFS_IOC_ALLOCSP_32: diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index f90dadd5a968..5e33b7862d41 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4744,8 +4744,12 @@ xfs_bmapi( * Check if writing previously allocated but * unwritten extents. */ - if (wr && mval->br_state == XFS_EXT_UNWRITTEN && - ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) { + if (wr && + ((mval->br_state == XFS_EXT_UNWRITTEN && + ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) || + (mval->br_state == XFS_EXT_NORM && + ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) == + (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) { /* * Modify (by adding) the state flag, if writing. */ @@ -4757,7 +4761,9 @@ xfs_bmapi( *firstblock; cur->bc_private.b.flist = flist; } - mval->br_state = XFS_EXT_NORM; + mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) + ? XFS_EXT_NORM + : XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent(ip, lastx, &cur, mval, firstblock, flist, &tmp_logflags, whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index b13569a6179b..71ec9b6ecdfc 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -74,9 +74,12 @@ typedef struct xfs_bmap_free #define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ /* combine contig. space */ #define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ -#define XFS_BMAPI_CONVERT 0x200 /* unwritten extent conversion - */ - /* need write cache flushing and no */ - /* additional allocation alignments */ +/* + * unwritten extent conversion - this needs write cache flushing and no additional + * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts + * from written to unwritten, otherwise convert from unwritten to written. + */ +#define XFS_BMAPI_CONVERT 0x200 #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_WRITE, "WRITE" }, \ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 87c2e9d02288..10f6093671b0 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -448,6 +448,7 @@ typedef struct xfs_handle { /* XFS_IOC_SETBIOSIZE ---- deprecated 46 */ /* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ #define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) +#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) /* * ioctl commands that replace IRIX syssgi()'s diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 4c7c7bfb2b2f..dc6e4fb8bbc9 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2272,7 +2272,7 @@ xfs_alloc_file_space( count = len; imapp = &imaps[0]; nimaps = 1; - bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0); + bmapi_flag = XFS_BMAPI_WRITE | alloc_type; startoffset_fsb = XFS_B_TO_FSBT(mp, offset); allocatesize_fsb = XFS_B_TO_FSB(mp, count); @@ -2711,6 +2711,7 @@ xfs_change_file_space( xfs_off_t llen; xfs_trans_t *tp; struct iattr iattr; + int prealloc_type; if (!S_ISREG(ip->i_d.di_mode)) return XFS_ERROR(EINVAL); @@ -2753,12 +2754,17 @@ xfs_change_file_space( * size to be changed. */ setprealloc = clrprealloc = 0; + prealloc_type = XFS_BMAPI_PREALLOC; switch (cmd) { + case XFS_IOC_ZERO_RANGE: + prealloc_type |= XFS_BMAPI_CONVERT; + xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0); + /* FALLTHRU */ case XFS_IOC_RESVSP: case XFS_IOC_RESVSP64: error = xfs_alloc_file_space(ip, startoffset, bf->l_len, - 1, attr_flags); + prealloc_type, attr_flags); if (error) return error; setprealloc = 1; From 52fda114249578311776b25da9f73a9c34f4fd8c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Sep 2010 01:44:22 +0000 Subject: [PATCH 03/36] xfs: simplify xfs_qm_dqusage_adjust There is no need to have the users and group/project quota locked at the same time. Get rid of xfs_qm_dqget_noattach and just do a xfs_qm_dqget inside xfs_qm_quotacheck_dqadjust for the quota we are operating on right now. The new version of xfs_qm_quotacheck_dqadjust holds the inode lock over it's operations, which is not a problem as it simply increments counters and there is no concern about log contention during mount time. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/quota/xfs_qm.c | 205 +++++++++++++----------------------------- 1 file changed, 62 insertions(+), 143 deletions(-) diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 9a92407109a1..121f632213a7 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -1199,87 +1199,6 @@ xfs_qm_list_destroy( mutex_destroy(&(list->qh_lock)); } - -/* - * Stripped down version of dqattach. This doesn't attach, or even look at the - * dquots attached to the inode. The rationale is that there won't be any - * attached at the time this is called from quotacheck. - */ -STATIC int -xfs_qm_dqget_noattach( - xfs_inode_t *ip, - xfs_dquot_t **O_udqpp, - xfs_dquot_t **O_gdqpp) -{ - int error; - xfs_mount_t *mp; - xfs_dquot_t *udqp, *gdqp; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - mp = ip->i_mount; - udqp = NULL; - gdqp = NULL; - - if (XFS_IS_UQUOTA_ON(mp)) { - ASSERT(ip->i_udquot == NULL); - /* - * We want the dquot allocated if it doesn't exist. - */ - if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER, - XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, - &udqp))) { - /* - * Shouldn't be able to turn off quotas here. - */ - ASSERT(error != ESRCH); - ASSERT(error != ENOENT); - return error; - } - ASSERT(udqp); - } - - if (XFS_IS_OQUOTA_ON(mp)) { - ASSERT(ip->i_gdquot == NULL); - if (udqp) - xfs_dqunlock(udqp); - error = XFS_IS_GQUOTA_ON(mp) ? - xfs_qm_dqget(mp, ip, - ip->i_d.di_gid, XFS_DQ_GROUP, - XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN, - &gdqp) : - xfs_qm_dqget(mp, ip, - ip->i_d.di_projid, XFS_DQ_PROJ, - XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN, - &gdqp); - if (error) { - if (udqp) - xfs_qm_dqrele(udqp); - ASSERT(error != ESRCH); - ASSERT(error != ENOENT); - return error; - } - ASSERT(gdqp); - - /* Reacquire the locks in the right order */ - if (udqp) { - if (! xfs_qm_dqlock_nowait(udqp)) { - xfs_dqunlock(gdqp); - xfs_dqlock(udqp); - xfs_dqlock(gdqp); - } - } - } - - *O_udqpp = udqp; - *O_gdqpp = gdqp; - -#ifdef QUOTADEBUG - if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp)); - if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp)); -#endif - return 0; -} - /* * Create an inode and return with a reference already taken, but unlocked * This is how we create quota inodes @@ -1546,18 +1465,34 @@ xfs_qm_dqiterate( /* * Called by dqusage_adjust in doing a quotacheck. - * Given the inode, and a dquot (either USR or GRP, doesn't matter), - * this updates its incore copy as well as the buffer copy. This is - * so that once the quotacheck is done, we can just log all the buffers, - * as opposed to logging numerous updates to individual dquots. + * + * Given the inode, and a dquot id this updates both the incore dqout as well + * as the buffer copy. This is so that once the quotacheck is done, we can + * just log all the buffers, as opposed to logging numerous updates to + * individual dquots. */ -STATIC void +STATIC int xfs_qm_quotacheck_dqadjust( - xfs_dquot_t *dqp, + struct xfs_inode *ip, + xfs_dqid_t id, + uint type, xfs_qcnt_t nblks, xfs_qcnt_t rtblks) { - ASSERT(XFS_DQ_IS_LOCKED(dqp)); + struct xfs_mount *mp = ip->i_mount; + struct xfs_dquot *dqp; + int error; + + error = xfs_qm_dqget(mp, ip, id, type, + XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp); + if (error) { + /* + * Shouldn't be able to turn off quotas here. + */ + ASSERT(error != ESRCH); + ASSERT(error != ENOENT); + return error; + } trace_xfs_dqadjust(dqp); @@ -1582,11 +1517,13 @@ xfs_qm_quotacheck_dqadjust( * There are no timers for the default values set in the root dquot. */ if (dqp->q_core.d_id) { - xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); - xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); + xfs_qm_adjust_dqlimits(mp, &dqp->q_core); + xfs_qm_adjust_dqtimers(mp, &dqp->q_core); } dqp->dq_flags |= XFS_DQ_DIRTY; + xfs_qm_dqput(dqp); + return 0; } STATIC int @@ -1629,8 +1566,7 @@ xfs_qm_dqusage_adjust( int *res) /* result code value */ { xfs_inode_t *ip; - xfs_dquot_t *udqp, *gdqp; - xfs_qcnt_t nblks, rtblks; + xfs_qcnt_t nblks, rtblks = 0; int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); @@ -1650,51 +1586,24 @@ xfs_qm_dqusage_adjust( * the case in all other instances. It's OK that we do this because * quotacheck is done only at mount time. */ - if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip))) { + error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip); + if (error) { *res = BULKSTAT_RV_NOTHING; return error; } - /* - * Obtain the locked dquots. In case of an error (eg. allocation - * fails for ENOSPC), we return the negative of the error number - * to bulkstat, so that it can get propagated to quotacheck() and - * making us disable quotas for the file system. - */ - if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - IRELE(ip); - *res = BULKSTAT_RV_GIVEUP; - return error; - } + ASSERT(ip->i_delayed_blks == 0); - rtblks = 0; - if (! XFS_IS_REALTIME_INODE(ip)) { - nblks = (xfs_qcnt_t)ip->i_d.di_nblocks; - } else { + if (XFS_IS_REALTIME_INODE(ip)) { /* * Walk thru the extent list and count the realtime blocks. */ - if ((error = xfs_qm_get_rtblks(ip, &rtblks))) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - IRELE(ip); - if (udqp) - xfs_qm_dqput(udqp); - if (gdqp) - xfs_qm_dqput(gdqp); - *res = BULKSTAT_RV_GIVEUP; - return error; - } - nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; + error = xfs_qm_get_rtblks(ip, &rtblks); + if (error) + goto error0; } - ASSERT(ip->i_delayed_blks == 0); - /* - * We can't release the inode while holding its dquot locks. - * The inode can go into inactive and might try to acquire the dquotlocks. - * So, just unlock here and do a vn_rele at the end. - */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); + nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; /* * Add the (disk blocks and inode) resources occupied by this @@ -1709,26 +1618,36 @@ xfs_qm_dqusage_adjust( * and quotaoffs don't race. (Quotachecks happen at mount time only). */ if (XFS_IS_UQUOTA_ON(mp)) { - ASSERT(udqp); - xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks); - xfs_qm_dqput(udqp); + error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid, + XFS_DQ_USER, nblks, rtblks); + if (error) + goto error0; } - if (XFS_IS_OQUOTA_ON(mp)) { - ASSERT(gdqp); - xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks); - xfs_qm_dqput(gdqp); - } - /* - * Now release the inode. This will send it to 'inactive', and - * possibly even free blocks. - */ - IRELE(ip); - /* - * Goto next inode. - */ + if (XFS_IS_GQUOTA_ON(mp)) { + error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid, + XFS_DQ_GROUP, nblks, rtblks); + if (error) + goto error0; + } + + if (XFS_IS_PQUOTA_ON(mp)) { + error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_projid, + XFS_DQ_PROJ, nblks, rtblks); + if (error) + goto error0; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); *res = BULKSTAT_RV_DIDONE; return 0; + +error0: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + *res = BULKSTAT_RV_GIVEUP; + return error; } /* From acecf1b5d8a846bf818bf74df454330f0b444b0a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Sep 2010 01:44:45 +0000 Subject: [PATCH 04/36] xfs: stop using xfs_qm_dqtobp in xfs_qm_dqflush In xfs_qm_dqflush we know that q_blkno must be initialized already from a previous xfs_qm_dqread. So instead of calling xfs_qm_dqtobp we can simply read the quota buffer directly. This also saves us from a duplicate xfs_qm_dqcheck call check and allows xfs_qm_dqtobp to be simplified now that it is always called for a newly initialized inode. In addition to that properly unwind all locks in xfs_qm_dqflush when xfs_qm_dqcheck fails. This mirrors a similar cleanup in the inode lookup done earlier. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/quota/xfs_dquot.c | 166 ++++++++++++++++++--------------------- 1 file changed, 77 insertions(+), 89 deletions(-) diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index e1a2f6800e01..faf8e1a83a12 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -463,88 +463,69 @@ xfs_qm_dqtobp( uint flags) { xfs_bmbt_irec_t map; - int nmaps, error; + int nmaps = 1, error; xfs_buf_t *bp; - xfs_inode_t *quotip; - xfs_mount_t *mp; + xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); + xfs_mount_t *mp = dqp->q_mount; xfs_disk_dquot_t *ddq; - xfs_dqid_t id; - boolean_t newdquot; + xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); xfs_trans_t *tp = (tpp ? *tpp : NULL); - mp = dqp->q_mount; - id = be32_to_cpu(dqp->q_core.d_id); - nmaps = 1; - newdquot = B_FALSE; + dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; - /* - * If we don't know where the dquot lives, find out. - */ - if (dqp->q_blkno == (xfs_daddr_t) 0) { - /* We use the id as an index */ - dqp->q_fileoffset = (xfs_fileoff_t)id / - mp->m_quotainfo->qi_dqperchunk; - nmaps = 1; - quotip = XFS_DQ_TO_QIP(dqp); - xfs_ilock(quotip, XFS_ILOCK_SHARED); + xfs_ilock(quotip, XFS_ILOCK_SHARED); + if (XFS_IS_THIS_QUOTA_OFF(dqp)) { /* - * Return if this type of quotas is turned off while we didn't - * have an inode lock + * Return if this type of quotas is turned off while we + * didn't have the quota inode lock. */ - if (XFS_IS_THIS_QUOTA_OFF(dqp)) { - xfs_iunlock(quotip, XFS_ILOCK_SHARED); - return (ESRCH); - } - /* - * Find the block map; no allocations yet - */ - error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, - XFS_DQUOT_CLUSTER_SIZE_FSB, - XFS_BMAPI_METADATA, - NULL, 0, &map, &nmaps, NULL); - xfs_iunlock(quotip, XFS_ILOCK_SHARED); - if (error) - return (error); - ASSERT(nmaps == 1); - ASSERT(map.br_blockcount == 1); - - /* - * offset of dquot in the (fixed sized) dquot chunk. - */ - dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * - sizeof(xfs_dqblk_t); - if (map.br_startblock == HOLESTARTBLOCK) { - /* - * We don't allocate unless we're asked to - */ - if (!(flags & XFS_QMOPT_DQALLOC)) - return (ENOENT); - - ASSERT(tp); - if ((error = xfs_qm_dqalloc(tpp, mp, dqp, quotip, - dqp->q_fileoffset, &bp))) - return (error); - tp = *tpp; - newdquot = B_TRUE; - } else { - /* - * store the blkno etc so that we don't have to do the - * mapping all the time - */ - dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); - } + return ESRCH; } - ASSERT(dqp->q_blkno != DELAYSTARTBLOCK); - ASSERT(dqp->q_blkno != HOLESTARTBLOCK); /* - * Read in the buffer, unless we've just done the allocation - * (in which case we already have the buf). + * Find the block map; no allocations yet */ - if (!newdquot) { + error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, + NULL, 0, &map, &nmaps, NULL); + + xfs_iunlock(quotip, XFS_ILOCK_SHARED); + if (error) + return error; + + ASSERT(nmaps == 1); + ASSERT(map.br_blockcount == 1); + + /* + * Offset of dquot in the (fixed sized) dquot chunk. + */ + dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * + sizeof(xfs_dqblk_t); + + ASSERT(map.br_startblock != DELAYSTARTBLOCK); + if (map.br_startblock == HOLESTARTBLOCK) { + /* + * We don't allocate unless we're asked to + */ + if (!(flags & XFS_QMOPT_DQALLOC)) + return ENOENT; + + ASSERT(tp); + error = xfs_qm_dqalloc(tpp, mp, dqp, quotip, + dqp->q_fileoffset, &bp); + if (error) + return error; + tp = *tpp; + } else { trace_xfs_dqtobp_read(dqp); + /* + * store the blkno etc so that we don't have to do the + * mapping all the time + */ + dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, @@ -552,13 +533,14 @@ xfs_qm_dqtobp( if (error || !bp) return XFS_ERROR(error); } + ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); /* * calculate the location of the dquot inside the buffer. */ - ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset); + ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); /* * A simple sanity check in case we got a corrupted dquot... @@ -1176,18 +1158,18 @@ xfs_qm_dqflush( xfs_dquot_t *dqp, uint flags) { - xfs_mount_t *mp; - xfs_buf_t *bp; - xfs_disk_dquot_t *ddqp; + struct xfs_mount *mp = dqp->q_mount; + struct xfs_buf *bp; + struct xfs_disk_dquot *ddqp; int error; ASSERT(XFS_DQ_IS_LOCKED(dqp)); ASSERT(!completion_done(&dqp->q_flush)); + trace_xfs_dqflush(dqp); /* - * If not dirty, or it's pinned and we are not supposed to - * block, nada. + * If not dirty, or it's pinned and we are not supposed to block, nada. */ if (!XFS_DQ_IS_DIRTY(dqp) || (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) { @@ -1201,40 +1183,46 @@ xfs_qm_dqflush( * down forcibly. If that's the case we must not write this dquot * to disk, because the log record didn't make it to disk! */ - if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) { - dqp->dq_flags &= ~(XFS_DQ_DIRTY); + if (XFS_FORCED_SHUTDOWN(mp)) { + dqp->dq_flags &= ~XFS_DQ_DIRTY; xfs_dqfunlock(dqp); return XFS_ERROR(EIO); } /* * Get the buffer containing the on-disk dquot - * We don't need a transaction envelope because we know that the - * the ondisk-dquot has already been allocated for. */ - if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) { ASSERT(error != ENOENT); - /* - * Quotas could have gotten turned off (ESRCH) - */ xfs_dqfunlock(dqp); - return (error); + return error; } - if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), - 0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) { - xfs_force_shutdown(dqp->q_mount, SHUTDOWN_CORRUPT_INCORE); + /* + * Calculate the location of the dquot inside the buffer. + */ + ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); + + /* + * A simple sanity check in case we got a corrupted dquot.. + */ + if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 0, + XFS_QMOPT_DOWARN, "dqflush (incore copy)")) { + xfs_buf_relse(bp); + xfs_dqfunlock(dqp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return XFS_ERROR(EIO); } /* This is the only portion of data that needs to persist */ - memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t)); + memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t)); /* * Clear the dirty field and remember the flush lsn for later use. */ - dqp->dq_flags &= ~(XFS_DQ_DIRTY); - mp = dqp->q_mount; + dqp->dq_flags &= ~XFS_DQ_DIRTY; xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, &dqp->q_logitem.qli_item.li_lsn); From c0e59e1ac0a106bbab93404024bb6e7927ad9d6d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Sep 2010 23:34:07 +0000 Subject: [PATCH 05/36] xfs: remove the ->kill_root btree operation The implementation os ->kill_root only differ by either simply zeroing out the now unused buffer in the btree cursor in the inode allocation btree or using xfs_btree_setbuf in the allocation btree. Initially both of them used xfs_btree_setbuf, but the use in the ialloc btree was removed early on because it interacted badly with xfs_trans_binval. In addition to zeroing out the buffer in the cursor xfs_btree_setbuf updates the bc_ra array in the btree cursor, and calls xfs_trans_brelse on the buffer previous occupying the slot. The bc_ra update should be done for the alloc btree updated too, although the lack of it does not cause serious problems. The xfs_trans_brelse call on the other hand is effectively a no-op in the end - it keeps decrementing the bli_recur refcount until it hits zero, and then just skips out because the buffer will always be dirty at this point. So removing it for the allocation btree is just fine. So unify the code and move it to xfs_btree.c. While we're at it also replace the call to xfs_btree_setbuf with a NULL bp argument in xfs_btree_del_cursor with a direct call to xfs_trans_brelse given that the cursor is beeing freed just after this and the state updates are superflous. After this xfs_btree_setbuf is only used with a non-NULL bp argument and can thus be simplified. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_alloc_btree.c | 33 ------------------------- fs/xfs/xfs_btree.c | 52 ++++++++++++++++++++++++++++++++------- fs/xfs/xfs_btree.h | 14 +---------- fs/xfs/xfs_ialloc_btree.c | 33 ------------------------- 4 files changed, 44 insertions(+), 88 deletions(-) diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 97f7328967fd..3916925e2584 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -280,38 +280,6 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } -STATIC int -xfs_allocbt_kill_root( - struct xfs_btree_cur *cur, - struct xfs_buf *bp, - int level, - union xfs_btree_ptr *newroot) -{ - int error; - - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_STATS_INC(cur, killroot); - - /* - * Update the root pointer, decreasing the level by 1 and then - * free the old root. - */ - xfs_allocbt_set_root(cur, newroot, -1); - error = xfs_allocbt_free_block(cur, bp); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); - return error; - } - - XFS_BTREE_STATS_INC(cur, free); - - xfs_btree_setbuf(cur, level, NULL); - cur->bc_nlevels--; - - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); - return 0; -} - #ifdef DEBUG STATIC int xfs_allocbt_keys_inorder( @@ -423,7 +391,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .dup_cursor = xfs_allocbt_dup_cursor, .set_root = xfs_allocbt_set_root, - .kill_root = xfs_allocbt_kill_root, .alloc_block = xfs_allocbt_alloc_block, .free_block = xfs_allocbt_free_block, .update_lastrec = xfs_allocbt_update_lastrec, diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 829af92f0fba..735dc2e671b1 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -217,7 +217,7 @@ xfs_btree_del_cursor( */ for (i = 0; i < cur->bc_nlevels; i++) { if (cur->bc_bufs[i]) - xfs_btree_setbuf(cur, i, NULL); + xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]); else if (!error) break; } @@ -763,22 +763,19 @@ xfs_btree_readahead( * Set the buffer for level "lev" in the cursor to bp, releasing * any previous buffer. */ -void +STATIC void xfs_btree_setbuf( xfs_btree_cur_t *cur, /* btree cursor */ int lev, /* level in btree */ xfs_buf_t *bp) /* new buffer to set */ { struct xfs_btree_block *b; /* btree block */ - xfs_buf_t *obp; /* old buffer pointer */ - obp = cur->bc_bufs[lev]; - if (obp) - xfs_trans_brelse(cur->bc_tp, obp); + if (cur->bc_bufs[lev]) + xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]); cur->bc_bufs[lev] = bp; cur->bc_ra[lev] = 0; - if (!bp) - return; + b = XFS_BUF_TO_BLOCK(bp); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO) @@ -3011,6 +3008,43 @@ out0: return 0; } +/* + * Kill the current root node, and replace it with it's only child node. + */ +STATIC int +xfs_btree_kill_root( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int level, + union xfs_btree_ptr *newroot) +{ + int error; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, killroot); + + /* + * Update the root pointer, decreasing the level by 1 and then + * free the old root. + */ + cur->bc_ops->set_root(cur, newroot, -1); + + error = cur->bc_ops->free_block(cur, bp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + + XFS_BTREE_STATS_INC(cur, free); + + cur->bc_bufs[level] = NULL; + cur->bc_ra[level] = 0; + cur->bc_nlevels--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + STATIC int xfs_btree_dec_cursor( struct xfs_btree_cur *cur, @@ -3195,7 +3229,7 @@ xfs_btree_delrec( * Make it the new root of the btree. */ pp = xfs_btree_ptr_addr(cur, 1, block); - error = cur->bc_ops->kill_root(cur, bp, level, pp); + error = xfs_btree_kill_root(cur, bp, level, pp); if (error) goto error0; } else if (level > 0) { diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 7fa07062bdda..82fafc66bd1f 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -152,9 +152,7 @@ struct xfs_btree_ops { /* update btree root pointer */ void (*set_root)(struct xfs_btree_cur *cur, - union xfs_btree_ptr *nptr, int level_change); - int (*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp, - int level, union xfs_btree_ptr *newroot); + union xfs_btree_ptr *nptr, int level_change); /* block allocation / freeing */ int (*alloc_block)(struct xfs_btree_cur *cur, @@ -399,16 +397,6 @@ xfs_btree_reada_bufs( xfs_agblock_t agbno, /* allocation group block number */ xfs_extlen_t count); /* count of filesystem blocks */ -/* - * Set the buffer for level "lev" in the cursor to bp, releasing - * any previous buffer. - */ -void -xfs_btree_setbuf( - xfs_btree_cur_t *cur, /* btree cursor */ - int lev, /* level in btree */ - struct xfs_buf *bp); /* new buffer to set */ - /* * Common btree core entry points. diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index d352862cefa0..16921f55c542 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -183,38 +183,6 @@ xfs_inobt_key_diff( cur->bc_rec.i.ir_startino; } -STATIC int -xfs_inobt_kill_root( - struct xfs_btree_cur *cur, - struct xfs_buf *bp, - int level, - union xfs_btree_ptr *newroot) -{ - int error; - - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_STATS_INC(cur, killroot); - - /* - * Update the root pointer, decreasing the level by 1 and then - * free the old root. - */ - xfs_inobt_set_root(cur, newroot, -1); - error = xfs_inobt_free_block(cur, bp); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); - return error; - } - - XFS_BTREE_STATS_INC(cur, free); - - cur->bc_bufs[level] = NULL; - cur->bc_nlevels--; - - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); - return 0; -} - #ifdef DEBUG STATIC int xfs_inobt_keys_inorder( @@ -309,7 +277,6 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .dup_cursor = xfs_inobt_dup_cursor, .set_root = xfs_inobt_set_root, - .kill_root = xfs_inobt_kill_root, .alloc_block = xfs_inobt_alloc_block, .free_block = xfs_inobt_free_block, .get_minrecs = xfs_inobt_get_minrecs, From 9c169915ad374cd9efb1556943b2074ec07e1749 Mon Sep 17 00:00:00 2001 From: Poyo VL Date: Thu, 2 Sep 2010 07:41:55 +0000 Subject: [PATCH 06/36] xfs: eliminate some newly-reported gcc warnings Ionut Gabriel Popescu submitted a simple change to eliminate some "may be used uninitialized" warnings when building XFS. The reported condition seems to be something that GCC did not used to recognize or report. The warnings were produced by: gcc version 4.5.0 20100604 [gcc-4_5-branch revision 160292] (SUSE Linux) Signed-off-by: Ionut Gabriel Popescu Signed-off-by: Alex Elder --- fs/xfs/xfs_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index af168faccc7a..112abc439ca5 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -675,7 +675,7 @@ xfs_alloc_ag_vextent_near( xfs_agblock_t gtbnoa; /* aligned ... */ xfs_extlen_t gtdiff; /* difference to right side entry */ xfs_extlen_t gtlen; /* length of right side entry */ - xfs_extlen_t gtlena; /* aligned ... */ + xfs_extlen_t gtlena = 0; /* aligned ... */ xfs_agblock_t gtnew; /* useful start bno of right side */ int error; /* error code */ int i; /* result code, temporary */ @@ -684,7 +684,7 @@ xfs_alloc_ag_vextent_near( xfs_agblock_t ltbnoa; /* aligned ... */ xfs_extlen_t ltdiff; /* difference to left side entry */ xfs_extlen_t ltlen; /* length of left side entry */ - xfs_extlen_t ltlena; /* aligned ... */ + xfs_extlen_t ltlena = 0; /* aligned ... */ xfs_agblock_t ltnew; /* useful start bno of left side */ xfs_extlen_t rlen; /* length of returned extent */ #if defined(DEBUG) && defined(__KERNEL__) From d1583a3833290ab9f8b13a064acbb5e508c59f60 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 24 Sep 2010 18:14:13 +1000 Subject: [PATCH 07/36] xfs: reduce the number of CIL lock round trips during commit When commiting a transaction, we do a lock CIL state lock round trip on every single log vector we insert into the CIL. This is resulting in the lock being as hot as the inode and dcache locks on 8-way create workloads. Rework the insertion loops to bring the number of lock round trips to one per transaction for log vectors, and one more do the busy extents. Also change the allocation of the log vector buffer not to zero it as we copy over the entire allocated buffer anyway. This patch also includes a structural cleanup to the CIL item insertion provided by Christoph Hellwig. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/xfs_log_cil.c | 232 +++++++++++++++++++++++-------------------- 1 file changed, 127 insertions(+), 105 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 7e206fc1fa36..23d6ceb5e97b 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -145,102 +145,6 @@ xlog_cil_init_post_recovery( log->l_curr_block); } -/* - * Insert the log item into the CIL and calculate the difference in space - * consumed by the item. Add the space to the checkpoint ticket and calculate - * if the change requires additional log metadata. If it does, take that space - * as well. Remove the amount of space we addded to the checkpoint ticket from - * the current transaction ticket so that the accounting works out correctly. - * - * If this is the first time the item is being placed into the CIL in this - * context, pin it so it can't be written to disk until the CIL is flushed to - * the iclog and the iclog written to disk. - */ -static void -xlog_cil_insert( - struct log *log, - struct xlog_ticket *ticket, - struct xfs_log_item *item, - struct xfs_log_vec *lv) -{ - struct xfs_cil *cil = log->l_cilp; - struct xfs_log_vec *old = lv->lv_item->li_lv; - struct xfs_cil_ctx *ctx = cil->xc_ctx; - int len; - int diff_iovecs; - int iclog_space; - - if (old) { - /* existing lv on log item, space used is a delta */ - ASSERT(!list_empty(&item->li_cil)); - ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); - - len = lv->lv_buf_len - old->lv_buf_len; - diff_iovecs = lv->lv_niovecs - old->lv_niovecs; - kmem_free(old->lv_buf); - kmem_free(old); - } else { - /* new lv, must pin the log item */ - ASSERT(!lv->lv_item->li_lv); - ASSERT(list_empty(&item->li_cil)); - - len = lv->lv_buf_len; - diff_iovecs = lv->lv_niovecs; - IOP_PIN(lv->lv_item); - - } - len += diff_iovecs * sizeof(xlog_op_header_t); - - /* attach new log vector to log item */ - lv->lv_item->li_lv = lv; - - spin_lock(&cil->xc_cil_lock); - list_move_tail(&item->li_cil, &cil->xc_cil); - ctx->nvecs += diff_iovecs; - - /* - * If this is the first time the item is being committed to the CIL, - * store the sequence number on the log item so we can tell - * in future commits whether this is the first checkpoint the item is - * being committed into. - */ - if (!item->li_seq) - item->li_seq = ctx->sequence; - - /* - * Now transfer enough transaction reservation to the context ticket - * for the checkpoint. The context ticket is special - the unit - * reservation has to grow as well as the current reservation as we - * steal from tickets so we can correctly determine the space used - * during the transaction commit. - */ - if (ctx->ticket->t_curr_res == 0) { - /* first commit in checkpoint, steal the header reservation */ - ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len); - ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; - ticket->t_curr_res -= ctx->ticket->t_unit_res; - } - - /* do we need space for more log record headers? */ - iclog_space = log->l_iclog_size - log->l_iclog_hsize; - if (len > 0 && (ctx->space_used / iclog_space != - (ctx->space_used + len) / iclog_space)) { - int hdrs; - - hdrs = (len + iclog_space - 1) / iclog_space; - /* need to take into account split region headers, too */ - hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); - ctx->ticket->t_unit_res += hdrs; - ctx->ticket->t_curr_res += hdrs; - ticket->t_curr_res -= hdrs; - ASSERT(ticket->t_curr_res >= len); - } - ticket->t_curr_res -= len; - ctx->space_used += len; - - spin_unlock(&cil->xc_cil_lock); -} - /* * Format log item into a flat buffers * @@ -286,7 +190,7 @@ xlog_cil_format_items( len += lv->lv_iovecp[index].i_len; lv->lv_buf_len = len; - lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); + lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); ptr = lv->lv_buf; for (index = 0; index < lv->lv_niovecs; index++) { @@ -300,21 +204,136 @@ xlog_cil_format_items( } } +/* + * Prepare the log item for insertion into the CIL. Calculate the difference in + * log space and vectors it will consume, and if it is a new item pin it as + * well. + */ +STATIC void +xfs_cil_prepare_item( + struct log *log, + struct xfs_log_vec *lv, + int *len, + int *diff_iovecs) +{ + struct xfs_log_vec *old = lv->lv_item->li_lv; + + if (old) { + /* existing lv on log item, space used is a delta */ + ASSERT(!list_empty(&lv->lv_item->li_cil)); + ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); + + *len += lv->lv_buf_len - old->lv_buf_len; + *diff_iovecs += lv->lv_niovecs - old->lv_niovecs; + kmem_free(old->lv_buf); + kmem_free(old); + } else { + /* new lv, must pin the log item */ + ASSERT(!lv->lv_item->li_lv); + ASSERT(list_empty(&lv->lv_item->li_cil)); + + *len += lv->lv_buf_len; + *diff_iovecs += lv->lv_niovecs; + IOP_PIN(lv->lv_item); + + } + + /* attach new log vector to log item */ + lv->lv_item->li_lv = lv; + + /* + * If this is the first time the item is being committed to the + * CIL, store the sequence number on the log item so we can + * tell in future commits whether this is the first checkpoint + * the item is being committed into. + */ + if (!lv->lv_item->li_seq) + lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence; +} + +/* + * Insert the log items into the CIL and calculate the difference in space + * consumed by the item. Add the space to the checkpoint ticket and calculate + * if the change requires additional log metadata. If it does, take that space + * as well. Remove the amount of space we addded to the checkpoint ticket from + * the current transaction ticket so that the accounting works out correctly. + */ static void xlog_cil_insert_items( struct log *log, struct xfs_log_vec *log_vector, - struct xlog_ticket *ticket, - xfs_lsn_t *start_lsn) + struct xlog_ticket *ticket) { - struct xfs_log_vec *lv; - - if (start_lsn) - *start_lsn = log->l_cilp->xc_ctx->sequence; + struct xfs_cil *cil = log->l_cilp; + struct xfs_cil_ctx *ctx = cil->xc_ctx; + struct xfs_log_vec *lv; + int len = 0; + int diff_iovecs = 0; + int iclog_space; ASSERT(log_vector); + + /* + * Do all the accounting aggregation and switching of log vectors + * around in a separate loop to the insertion of items into the CIL. + * Then we can do a separate loop to update the CIL within a single + * lock/unlock pair. This reduces the number of round trips on the CIL + * lock from O(nr_logvectors) to O(1) and greatly reduces the overall + * hold time for the transaction commit. + * + * If this is the first time the item is being placed into the CIL in + * this context, pin it so it can't be written to disk until the CIL is + * flushed to the iclog and the iclog written to disk. + * + * We can do this safely because the context can't checkpoint until we + * are done so it doesn't matter exactly how we update the CIL. + */ for (lv = log_vector; lv; lv = lv->lv_next) - xlog_cil_insert(log, ticket, lv->lv_item, lv); + xfs_cil_prepare_item(log, lv, &len, &diff_iovecs); + + /* account for space used by new iovec headers */ + len += diff_iovecs * sizeof(xlog_op_header_t); + + spin_lock(&cil->xc_cil_lock); + + /* move the items to the tail of the CIL */ + for (lv = log_vector; lv; lv = lv->lv_next) + list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil); + + ctx->nvecs += diff_iovecs; + + /* + * Now transfer enough transaction reservation to the context ticket + * for the checkpoint. The context ticket is special - the unit + * reservation has to grow as well as the current reservation as we + * steal from tickets so we can correctly determine the space used + * during the transaction commit. + */ + if (ctx->ticket->t_curr_res == 0) { + /* first commit in checkpoint, steal the header reservation */ + ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len); + ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; + ticket->t_curr_res -= ctx->ticket->t_unit_res; + } + + /* do we need space for more log record headers? */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + if (len > 0 && (ctx->space_used / iclog_space != + (ctx->space_used + len) / iclog_space)) { + int hdrs; + + hdrs = (len + iclog_space - 1) / iclog_space; + /* need to take into account split region headers, too */ + hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); + ctx->ticket->t_unit_res += hdrs; + ctx->ticket->t_curr_res += hdrs; + ticket->t_curr_res -= hdrs; + ASSERT(ticket->t_curr_res >= len); + } + ticket->t_curr_res -= len; + ctx->space_used += len; + + spin_unlock(&cil->xc_cil_lock); } static void @@ -638,7 +657,10 @@ xfs_log_commit_cil( /* lock out background commit */ down_read(&log->l_cilp->xc_ctx_lock); - xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn); + if (commit_lsn) + *commit_lsn = log->l_cilp->xc_ctx->sequence; + + xlog_cil_insert_items(log, log_vector, tp->t_ticket); /* check we didn't blow the reservation */ if (tp->t_ticket->t_curr_res < 0) From bd32d25a7cf7242512e77e70bab63df4402ab91c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 22 Sep 2010 10:47:20 +1000 Subject: [PATCH 08/36] xfs: remove debug assert for per-ag reference counting When we start taking references per cached buffer to the the perag it is cached on, it will blow the current debug maximum reference count assert out of the water. The assert has never caught a bug, and we have tracing to track changes if there ever is a problem, so just remove it. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/xfs_mount.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index aeb9d72ebf6e..00c7a876807d 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -210,8 +210,6 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) pag = radix_tree_lookup(&mp->m_perag_tree, agno); if (pag) { ASSERT(atomic_read(&pag->pag_ref) >= 0); - /* catch leaks in the positive direction during testing */ - ASSERT(atomic_read(&pag->pag_ref) < 1000); ref = atomic_inc_return(&pag->pag_ref); } spin_unlock(&mp->m_perag_lock); From e176579e70118ed7cfdb60f963628fe0ca771f3d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 22 Sep 2010 10:47:20 +1000 Subject: [PATCH 09/36] xfs: lockless per-ag lookups When we start taking a reference to the per-ag for every cached buffer in the system, kernel lockstat profiling on an 8-way create workload shows the mp->m_perag_lock has higher acquisition rates than the inode lock and has significantly more contention. That is, it becomes the highest contended lock in the system. The perag lookup is trivial to convert to lock-less RCU lookups because perag structures never go away. Hence the only thing we need to protect against is tree structure changes during a grow. This can be done simply by replacing the locking in xfs_perag_get() with RCU read locking. This removes the mp->m_perag_lock completely from this path. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/linux-2.6/xfs_sync.c | 6 +++--- fs/xfs/xfs_ag.h | 3 +++ fs/xfs/xfs_mount.c | 25 +++++++++++++++++-------- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 81976ffed7d6..3a1d229b4784 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -150,17 +150,17 @@ xfs_inode_ag_iter_next_pag( int found; int ref; - spin_lock(&mp->m_perag_lock); + rcu_read_lock(); found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, (void **)&pag, *first, 1, tag); if (found <= 0) { - spin_unlock(&mp->m_perag_lock); + rcu_read_unlock(); return NULL; } *first = pag->pag_agno + 1; /* open coded pag reference increment */ ref = atomic_inc_return(&pag->pag_ref); - spin_unlock(&mp->m_perag_lock); + rcu_read_unlock(); trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_); } else { pag = xfs_perag_get(mp, *first); diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 4917d4eed4ed..51c42c202bf1 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -230,6 +230,9 @@ typedef struct xfs_perag { rwlock_t pag_ici_lock; /* incore inode lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ int pag_ici_reclaimable; /* reclaimable inodes */ + + /* for rcu-safe freeing */ + struct rcu_head rcu_head; #endif int pagb_count; /* pagb slots in use */ } xfs_perag_t; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 00c7a876807d..14fc6e9e1816 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -199,6 +199,8 @@ xfs_uuid_unmount( /* * Reference counting access wrappers to the perag structures. + * Because we never free per-ag structures, the only thing we + * have to protect against changes is the tree structure itself. */ struct xfs_perag * xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) @@ -206,13 +208,13 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) struct xfs_perag *pag; int ref = 0; - spin_lock(&mp->m_perag_lock); + rcu_read_lock(); pag = radix_tree_lookup(&mp->m_perag_tree, agno); if (pag) { ASSERT(atomic_read(&pag->pag_ref) >= 0); ref = atomic_inc_return(&pag->pag_ref); } - spin_unlock(&mp->m_perag_lock); + rcu_read_unlock(); trace_xfs_perag_get(mp, agno, ref, _RET_IP_); return pag; } @@ -227,10 +229,18 @@ xfs_perag_put(struct xfs_perag *pag) trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); } +STATIC void +__xfs_free_perag( + struct rcu_head *head) +{ + struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); + + ASSERT(atomic_read(&pag->pag_ref) == 0); + kmem_free(pag); +} + /* - * Free up the resources associated with a mount structure. Assume that - * the structure was initially zeroed, so we can tell which fields got - * initialized. + * Free up the per-ag resources associated with the mount structure. */ STATIC void xfs_free_perag( @@ -242,10 +252,9 @@ xfs_free_perag( for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { spin_lock(&mp->m_perag_lock); pag = radix_tree_delete(&mp->m_perag_tree, agno); - ASSERT(pag); - ASSERT(atomic_read(&pag->pag_ref) == 0); spin_unlock(&mp->m_perag_lock); - kmem_free(pag); + ASSERT(pag); + call_rcu(&pag->rcu_head, __xfs_free_perag); } } From dcd79a1423f64ee0184629874805c3ac40f3a2c5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 28 Sep 2010 12:27:25 +1000 Subject: [PATCH 10/36] xfs: don't use vfs writeback for pure metadata modifications Under heavy multi-way parallel create workloads, the VFS struggles to write back all the inodes that have been changed in age order. The bdi flusher thread becomes CPU bound, spending 85% of it's time in the VFS code, mostly traversing the superblock dirty inode list to separate dirty inodes old enough to flush. We already keep an index of all metadata changes in age order - in the AIL - and continued log pressure will do age ordered writeback without any extra overhead at all. If there is no pressure on the log, the xfssyncd will periodically write back metadata in ascending disk address offset order so will be very efficient. Hence we can stop marking VFS inodes dirty during transaction commit or when changing timestamps during transactions. This will keep the inodes in the superblock dirty list to those containing data or unlogged metadata changes. However, the timstamp changes are slightly more complex than this - there are a couple of places that do unlogged updates of the timestamps, and the VFS need to be informed of these. Hence add a new function xfs_trans_ichgtime() for transactional changes, and leave xfs_ichgtime() for the non-transactional changes. Signed-off-by: Dave Chinner Reviewed-by: Alex Elder Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_ioctl.c | 2 +- fs/xfs/linux-2.6/xfs_iops.c | 35 ---------------------------------- fs/xfs/linux-2.6/xfs_super.c | 7 +------ fs/xfs/quota/xfs_qm_syscalls.c | 2 +- fs/xfs/xfs_attr.c | 31 +++++++++++------------------- fs/xfs/xfs_inode.h | 1 - fs/xfs/xfs_inode_item.c | 9 --------- fs/xfs/xfs_rename.c | 12 ++++++++---- fs/xfs/xfs_trans.h | 1 + fs/xfs/xfs_trans_inode.c | 30 +++++++++++++++++++++++++++++ fs/xfs/xfs_utils.c | 4 ++-- fs/xfs/xfs_vnodeops.c | 17 ++++++++++------- 12 files changed, 65 insertions(+), 86 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 03aa908a9cb9..10206be7a077 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -1088,8 +1088,8 @@ xfs_ioctl_setattr( xfs_diflags_to_linux(ip); } + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_ichgtime(ip, XFS_ICHGTIME_CHG); XFS_STATS_INC(xs_ig_attrchg); diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index b1fc2a6bfe83..a788f016d1fa 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -94,41 +94,6 @@ xfs_mark_inode_dirty( mark_inode_dirty(inode); } -/* - * Change the requested timestamp in the given inode. - * We don't lock across timestamp updates, and we don't log them but - * we do record the fact that there is dirty information in core. - */ -void -xfs_ichgtime( - xfs_inode_t *ip, - int flags) -{ - struct inode *inode = VFS_I(ip); - timespec_t tv; - int sync_it = 0; - - tv = current_fs_time(inode->i_sb); - - if ((flags & XFS_ICHGTIME_MOD) && - !timespec_equal(&inode->i_mtime, &tv)) { - inode->i_mtime = tv; - sync_it = 1; - } - if ((flags & XFS_ICHGTIME_CHG) && - !timespec_equal(&inode->i_ctime, &tv)) { - inode->i_ctime = tv; - sync_it = 1; - } - - /* - * Update complete - now make sure everyone knows that the inode - * is dirty. - */ - if (sync_it) - xfs_mark_inode_dirty_sync(ip); -} - /* * Hook in SELinux. This is not quite correct yet, what we really need * here (as we do for default ACLs) is a mechanism by which creation of diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index a4e07974955b..83154c0a3175 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -972,12 +972,7 @@ xfs_fs_inode_init_once( /* * Dirty the XFS inode when mark_inode_dirty_sync() is called so that - * we catch unlogged VFS level updates to the inode. Care must be taken - * here - the transaction code calls mark_inode_dirty_sync() to mark the - * VFS inode dirty in a transaction and clears the i_update_core field; - * it must clear the field after calling mark_inode_dirty_sync() to - * correctly indicate that the dirty state has been propagated into the - * inode log item. + * we catch unlogged VFS level updates to the inode. * * We need the barrier() to maintain correct ordering between unlogged * updates and the transaction commit code that clears the i_update_core diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 45e5849df238..7a71336f7922 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -276,7 +276,7 @@ xfs_qm_scall_trunc_qfile( goto out_unlock; } - xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); out_unlock: diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index c2568242a901..905d390c1e5c 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -355,16 +355,15 @@ xfs_attr_set_int( if (mp->m_flags & XFS_MOUNT_WSYNC) { xfs_trans_set_sync(args.trans); } + + if (!error && (flags & ATTR_KERNOTIME) == 0) { + xfs_trans_ichgtime(args.trans, dp, + XFS_ICHGTIME_CHG); + } err2 = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); - /* - * Hit the inode change time. - */ - if (!error && (flags & ATTR_KERNOTIME) == 0) { - xfs_ichgtime(dp, XFS_ICHGTIME_CHG); - } return(error == 0 ? err2 : error); } @@ -420,6 +419,9 @@ xfs_attr_set_int( xfs_trans_set_sync(args.trans); } + if ((flags & ATTR_KERNOTIME) == 0) + xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + /* * Commit the last in the sequence of transactions. */ @@ -427,13 +429,6 @@ xfs_attr_set_int( error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); - /* - * Hit the inode change time. - */ - if (!error && (flags & ATTR_KERNOTIME) == 0) { - xfs_ichgtime(dp, XFS_ICHGTIME_CHG); - } - return(error); out: @@ -567,6 +562,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) xfs_trans_set_sync(args.trans); } + if ((flags & ATTR_KERNOTIME) == 0) + xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + /* * Commit the last in the sequence of transactions. */ @@ -574,13 +572,6 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); - /* - * Hit the inode change time. - */ - if (!error && (flags & ATTR_KERNOTIME) == 0) { - xfs_ichgtime(dp, XFS_ICHGTIME_CHG); - } - return(error); out: diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 0898c5417d12..54781fcd322a 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -471,7 +471,6 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); void xfs_iunpin_wait(xfs_inode_t *); int xfs_iflush(xfs_inode_t *, uint); -void xfs_ichgtime(xfs_inode_t *, int); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index fe00777e2796..c7ac020705df 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -222,15 +222,6 @@ xfs_inode_item_format( vecp++; nvecs = 1; - /* - * Make sure the linux inode is dirty. We do this before - * clearing i_update_core as the VFS will call back into - * XFS here and set i_update_core, so we need to dirty the - * inode first so that the ordering of i_update_core and - * unlogged modifications still works as described below. - */ - xfs_mark_inode_dirty_sync(ip); - /* * Clear i_update_core if the timestamps (or any other * non-transactional modification) need flushing/logging diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 8fca957200df..9028733f7ed8 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -211,7 +211,9 @@ xfs_rename( goto error_return; if (error) goto abort_return; - xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); if (new_parent && src_is_directory) { error = xfs_bumplink(tp, target_dp); @@ -249,7 +251,9 @@ xfs_rename( &first_block, &free_list, spaceres); if (error) goto abort_return; - xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); /* * Decrement the link count on the target since the target @@ -292,7 +296,7 @@ xfs_rename( * inode isn't really being changed, but old unix file systems did * it and some incremental backup programs won't work without it. */ - xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG); + xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); /* * Adjust the link count on src_dp. This is necessary when @@ -315,7 +319,7 @@ xfs_rename( if (error) goto abort_return; - xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); if (new_parent) xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index c13c0f97b494..e2cbe4da7d8e 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -473,6 +473,7 @@ void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *, xfs_ino_t , uint, uint, struct xfs_inode **); +void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index cdc53a1050c5..ccb34532768b 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -117,6 +117,36 @@ xfs_trans_ijoin_ref( ip->i_itemp->ili_lock_flags = lock_flags; } +/* + * Transactional inode timestamp update. Requires the inode to be locked and + * joined to the transaction supplied. Relies on the transaction subsystem to + * track dirty state and update/writeback the inode accordingly. + */ +void +xfs_trans_ichgtime( + struct xfs_trans *tp, + struct xfs_inode *ip, + int flags) +{ + struct inode *inode = VFS_I(ip); + timespec_t tv; + + ASSERT(tp); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(ip->i_transp == tp); + + tv = current_fs_time(inode->i_sb); + + if ((flags & XFS_ICHGTIME_MOD) && + !timespec_equal(&inode->i_mtime, &tv)) { + inode->i_mtime = tv; + } + if ((flags & XFS_ICHGTIME_CHG) && + !timespec_equal(&inode->i_ctime, &tv)) { + inode->i_ctime = tv; + } +} + /* * This is called to mark the fields indicated in fieldmask as needing * to be logged when the transaction is committed. The inode must diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index b7d5769d2df0..4c2ba6f6adc8 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -235,7 +235,7 @@ xfs_droplink( { int error; - xfs_ichgtime(ip, XFS_ICHGTIME_CHG); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); ASSERT (ip->i_d.di_nlink > 0); ip->i_d.di_nlink--; @@ -299,7 +299,7 @@ xfs_bumplink( { if (ip->i_d.di_nlink >= XFS_MAXLINK) return XFS_ERROR(EMLINK); - xfs_ichgtime(ip, XFS_ICHGTIME_CHG); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); ASSERT(ip->i_d.di_nlink > 0); ip->i_d.di_nlink++; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index dc6e4fb8bbc9..a230cd4d0774 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -184,8 +184,11 @@ xfs_setattr( ip->i_size == 0 && ip->i_d.di_nextents == 0) { xfs_iunlock(ip, XFS_ILOCK_EXCL); lock_flags &= ~XFS_ILOCK_EXCL; - if (mask & ATTR_CTIME) - xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + if (mask & ATTR_CTIME) { + inode->i_mtime = inode->i_ctime = + current_fs_time(inode->i_sb); + xfs_mark_inode_dirty_sync(ip); + } code = 0; goto error_return; } @@ -1391,7 +1394,7 @@ xfs_create( ASSERT(error != ENOSPC); goto out_trans_abort; } - xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); if (is_dir) { @@ -1742,7 +1745,7 @@ xfs_remove( ASSERT(error != ENOENT); goto out_bmap_cancel; } - xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); if (is_dir) { /* @@ -1895,7 +1898,7 @@ xfs_link( &first_block, &free_list, resblks); if (error) goto abort_return; - xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); error = xfs_bumplink(tp, sip); @@ -2129,7 +2132,7 @@ xfs_symlink( &first_block, &free_list, resblks); if (error) goto error1; - xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); /* @@ -2833,7 +2836,7 @@ xfs_change_file_space( if (ip->i_d.di_mode & S_IXGRP) ip->i_d.di_mode &= ~S_ISGID; - xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); } if (setprealloc) ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; From 686865f76e35b28ba7aa6afa19209426f0da6201 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 24 Sep 2010 20:07:47 +1000 Subject: [PATCH 11/36] xfs: rename xfs_buf_get_nodaddr to be more appropriate xfs_buf_get_nodaddr() is really used to allocate a buffer that is uncached. While it is not directly assigned a disk address, the fact that they are not cached is a more important distinction. With the upcoming uncached buffer read primitive, we should be consistent with this disctinction. While there, make page allocation in xfs_buf_get_nodaddr() safe against memory reclaim re-entrancy into the filesystem by allowing a flags parameter to be passed. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 9 +++++---- fs/xfs/linux-2.6/xfs_buf.h | 2 +- fs/xfs/linux-2.6/xfs_trace.h | 2 +- fs/xfs/xfs_log.c | 3 ++- fs/xfs/xfs_log_recover.c | 3 ++- fs/xfs/xfs_vnodeops.c | 6 +++--- 6 files changed, 14 insertions(+), 11 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 286e36e21dae..eca945b0f88f 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -707,9 +707,10 @@ xfs_buf_associate_memory( } xfs_buf_t * -xfs_buf_get_noaddr( +xfs_buf_get_uncached( + struct xfs_buftarg *target, size_t len, - xfs_buftarg_t *target) + int flags) { unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; int error, i; @@ -725,7 +726,7 @@ xfs_buf_get_noaddr( goto fail_free_buf; for (i = 0; i < page_count; i++) { - bp->b_pages[i] = alloc_page(GFP_KERNEL); + bp->b_pages[i] = alloc_page(xb_to_gfp(flags)); if (!bp->b_pages[i]) goto fail_free_mem; } @@ -740,7 +741,7 @@ xfs_buf_get_noaddr( xfs_buf_unlock(bp); - trace_xfs_buf_get_noaddr(bp, _RET_IP_); + trace_xfs_buf_get_uncached(bp, _RET_IP_); return bp; fail_free_mem: diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 2a05614f0b92..fb30447091d8 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -213,7 +213,7 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, xfs_buf_flags_t); extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); -extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); +extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); extern void xfs_buf_hold(xfs_buf_t *); extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t, diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index be5dffd282a1..2a1d4fbd9ed8 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -331,7 +331,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done); DEFINE_BUF_EVENT(xfs_buf_delwri_queue); DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); DEFINE_BUF_EVENT(xfs_buf_delwri_split); -DEFINE_BUF_EVENT(xfs_buf_get_noaddr); +DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_bdstrat_shut); DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_item_iodone); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 33f718f92a48..c8a309424307 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1131,7 +1131,8 @@ xlog_alloc_log(xfs_mount_t *mp, iclog->ic_prev = prev_iclog; prev_iclog = iclog; - bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp); + bp = xfs_buf_get_uncached(mp->m_logdev_targp, + log->l_iclog_size, 0); if (!bp) goto out_free_iclog; if (!XFS_BUF_CPSEMA(bp)) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 6f3f5fa37acf..3d887542b037 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -107,7 +107,8 @@ xlog_get_bp( nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); + return xfs_buf_get_uncached(log->l_mp->m_logdev_targp, + BBTOB(nbblks), 0); } STATIC void diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index a230cd4d0774..b7bdc43308e4 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2434,9 +2434,9 @@ xfs_zero_remaining_bytes( if (endoff > ip->i_size) endoff = ip->i_size; - bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, - XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp); + bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp, + mp->m_sb.sb_blocksize, XBF_DONT_BLOCK); if (!bp) return XFS_ERROR(ENOMEM); From 5adc94c247c3779782c7b0b8b5e28cf50596eb37 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 24 Sep 2010 21:58:31 +1000 Subject: [PATCH 12/36] xfs: introduced uncached buffer read primitve To avoid the need to use cached buffers for single-shot or buffers cached at the filesystem level, introduce a new buffer read primitive that bypasses the cache an reads directly from disk. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 34 ++++++++++++++++++++++++++++++++++ fs/xfs/linux-2.6/xfs_buf.h | 3 +++ 2 files changed, 37 insertions(+) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index eca945b0f88f..22c7bff77ad2 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -638,6 +638,40 @@ xfs_buf_readahead( xfs_buf_read(target, ioff, isize, flags); } +/* + * Read an uncached buffer from disk. Allocates and returns a locked + * buffer containing the disk contents or nothing. + */ +struct xfs_buf * +xfs_buf_read_uncached( + struct xfs_mount *mp, + struct xfs_buftarg *target, + xfs_daddr_t daddr, + size_t length, + int flags) +{ + xfs_buf_t *bp; + int error; + + bp = xfs_buf_get_uncached(target, length, flags); + if (!bp) + return NULL; + + /* set up the buffer for a read IO */ + xfs_buf_lock(bp); + XFS_BUF_SET_ADDR(bp, daddr); + XFS_BUF_READ(bp); + XFS_BUF_BUSY(bp); + + xfsbdstrat(mp, bp); + error = xfs_iowait(bp); + if (error || bp->b_error) { + xfs_buf_relse(bp); + return NULL; + } + return bp; +} + xfs_buf_t * xfs_buf_get_empty( size_t len, diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index fb30447091d8..57eedc750ee6 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -218,6 +218,9 @@ extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); extern void xfs_buf_hold(xfs_buf_t *); extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t, xfs_buf_flags_t); +struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp, + struct xfs_buftarg *target, + xfs_daddr_t daddr, size_t length, int flags); /* Releasing Buffers */ extern void xfs_buf_free(xfs_buf_t *); From ebad861b5702c3e2332a3e906978f47144d22f70 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 22 Sep 2010 10:47:20 +1000 Subject: [PATCH 13/36] xfs: store xfs_mount in the buftarg instead of in the xfs_buf Each buffer contains both a buftarg pointer and a mount pointer. If we add a mount pointer into the buftarg, we can avoid needing the b_mount field in every buffer and grab it from the buftarg when needed instead. This shrinks the xfs_buf by 8 bytes. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 9 ++++----- fs/xfs/linux-2.6/xfs_buf.h | 5 +++-- fs/xfs/linux-2.6/xfs_super.c | 8 +++++--- fs/xfs/xfs_buf_item.c | 3 +-- fs/xfs/xfs_log_recover.c | 16 +++++++--------- 5 files changed, 20 insertions(+), 21 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 22c7bff77ad2..d6928970097f 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -894,7 +894,7 @@ xfs_buf_lock( trace_xfs_buf_lock(bp, _RET_IP_); if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) - xfs_log_force(bp->b_mount, 0); + xfs_log_force(bp->b_target->bt_mount, 0); if (atomic_read(&bp->b_io_remaining)) blk_run_address_space(bp->b_target->bt_mapping); down(&bp->b_sema); @@ -1017,7 +1017,6 @@ xfs_bwrite( { int error; - bp->b_mount = mp; bp->b_flags |= XBF_WRITE; bp->b_flags &= ~(XBF_ASYNC | XBF_READ); @@ -1038,8 +1037,6 @@ xfs_bdwrite( { trace_xfs_buf_bdwrite(bp, _RET_IP_); - bp->b_mount = mp; - bp->b_flags &= ~XBF_READ; bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); @@ -1128,7 +1125,7 @@ int xfs_bdstrat_cb( struct xfs_buf *bp) { - if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { trace_xfs_bdstrat_shut(bp, _RET_IP_); /* * Metadata write that didn't get logged but @@ -1644,6 +1641,7 @@ out_error: xfs_buftarg_t * xfs_alloc_buftarg( + struct xfs_mount *mp, struct block_device *bdev, int external, const char *fsname) @@ -1652,6 +1650,7 @@ xfs_alloc_buftarg( btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); + btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; if (xfs_setsize_buftarg_early(btp, bdev)) diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 57eedc750ee6..def2cea2b4d6 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -132,6 +132,7 @@ typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; struct address_space *bt_mapping; + struct xfs_mount *bt_mount; unsigned int bt_bsize; unsigned int bt_sshift; size_t bt_smask; @@ -189,7 +190,6 @@ typedef struct xfs_buf { struct completion b_iowait; /* queue for I/O waiters */ void *b_fspriv; void *b_fspriv2; - struct xfs_mount *b_mount; unsigned short b_error; /* error code on I/O */ unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset in first page */ @@ -377,7 +377,8 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) /* * Handling of buftargs. */ -extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *); +extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, + struct block_device *, int, const char *); extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 83154c0a3175..4759be4daa26 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -758,18 +758,20 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname); if (!mp->m_ddev_targp) goto out_close_rtdev; if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname); + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1, + mp->m_fsname); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } if (logdev && logdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname); + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1, + mp->m_fsname); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 1b09d7a280df..ee7557611b6e 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -692,8 +692,7 @@ xfs_buf_item_init( * the first. If we do already have one, there is * nothing to do here so return. */ - if (bp->b_mount != mp) - bp->b_mount = mp; + ASSERT(bp->b_target->bt_mount == mp); if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); if (lip->li_type == XFS_LI_BUF) { diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 3d887542b037..351d71117f16 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -322,10 +322,11 @@ xlog_recover_iodone( * this during recovery. One strike! */ xfs_ioerror_alert("xlog_recover_iodone", - bp->b_mount, bp, XFS_BUF_ADDR(bp)); - xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); + bp->b_target->bt_mount, bp, + XFS_BUF_ADDR(bp)); + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); } - bp->b_mount = NULL; XFS_BUF_CLR_IODONE_FUNC(bp); xfs_biodone(bp); } @@ -2276,8 +2277,7 @@ xlog_recover_do_buffer_trans( XFS_BUF_STALE(bp); error = xfs_bwrite(mp, bp); } else { - ASSERT(bp->b_mount == NULL || bp->b_mount == mp); - bp->b_mount = mp; + ASSERT(bp->b_target->bt_mount == mp); XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); xfs_bdwrite(mp, bp); } @@ -2541,8 +2541,7 @@ xlog_recover_do_inode_trans( } write_inode_buffer: - ASSERT(bp->b_mount == NULL || bp->b_mount == mp); - bp->b_mount = mp; + ASSERT(bp->b_target->bt_mount == mp); XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); xfs_bdwrite(mp, bp); error: @@ -2679,8 +2678,7 @@ xlog_recover_do_dquot_trans( memcpy(ddq, recddq, item->ri_buf[1].i_len); ASSERT(dq_f->qlf_size == 2); - ASSERT(bp->b_mount == NULL || bp->b_mount == mp); - bp->b_mount = mp; + ASSERT(bp->b_target->bt_mount == mp); XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); xfs_bdwrite(mp, bp); From 26af655233dd486659235f3049959d2f7dafc5a1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 22 Sep 2010 10:47:20 +1000 Subject: [PATCH 14/36] xfs: kill XBF_FS_MANAGED buffers Filesystem level managed buffers are buffers that have their lifecycle controlled by the filesystem layer, not the buffer cache. We currently cache these buffers, which makes cleanup and cache walking somewhat troublesome. Convert the fs managed buffers to uncached buffers obtained by via xfs_buf_get_uncached(), and remove the XBF_FS_MANAGED special cases from the buffer cache. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 20 +++----------- fs/xfs/linux-2.6/xfs_buf.h | 4 --- fs/xfs/xfs_mount.c | 55 +++++++++++--------------------------- 3 files changed, 20 insertions(+), 59 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index d6928970097f..975d6589394a 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -826,8 +826,6 @@ xfs_buf_rele( atomic_inc(&bp->b_hold); spin_unlock(&hash->bh_lock); (*(bp->b_relse)) (bp); - } else if (bp->b_flags & XBF_FS_MANAGED) { - spin_unlock(&hash->bh_lock); } else { ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); list_del_init(&bp->b_hash_list); @@ -1433,26 +1431,16 @@ void xfs_wait_buftarg( xfs_buftarg_t *btp) { - xfs_buf_t *bp, *n; xfs_bufhash_t *hash; uint i; for (i = 0; i < (1 << btp->bt_hashshift); i++) { hash = &btp->bt_hash[i]; -again: spin_lock(&hash->bh_lock); - list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { - ASSERT(btp == bp->b_target); - if (!(bp->b_flags & XBF_FS_MANAGED)) { - spin_unlock(&hash->bh_lock); - /* - * Catch superblock reference count leaks - * immediately - */ - BUG_ON(bp->b_bn == 0); - delay(100); - goto again; - } + while (!list_empty(&hash->bh_list)) { + spin_unlock(&hash->bh_lock); + delay(100); + spin_lock(&hash->bh_lock); } spin_unlock(&hash->bh_lock); } diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index def2cea2b4d6..1f109cee136c 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -51,7 +51,6 @@ typedef enum { #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ #define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ #define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ -#define XBF_FS_MANAGED (1 << 8) /* filesystem controls freeing memory */ #define XBF_ORDERED (1 << 11)/* use ordered writes */ #define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */ #define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */ @@ -104,7 +103,6 @@ typedef unsigned int xfs_buf_flags_t; { XBF_DONE, "DONE" }, \ { XBF_DELWRI, "DELWRI" }, \ { XBF_STALE, "STALE" }, \ - { XBF_FS_MANAGED, "FS_MANAGED" }, \ { XBF_ORDERED, "ORDERED" }, \ { XBF_READ_AHEAD, "READ_AHEAD" }, \ { XBF_LOCK, "LOCK" }, /* should never be set */\ @@ -279,8 +277,6 @@ extern void xfs_buf_terminate(void); XFS_BUF_DONE(bp); \ } while (0) -#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED) - #define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) #define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 14fc6e9e1816..fbca293326e5 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -646,7 +646,6 @@ int xfs_readsb(xfs_mount_t *mp, int flags) { unsigned int sector_size; - unsigned int extra_flags; xfs_buf_t *bp; int error; @@ -659,28 +658,24 @@ xfs_readsb(xfs_mount_t *mp, int flags) * access to the superblock. */ sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); - extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED; - bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), - extra_flags); - if (!bp || XFS_BUF_ISERROR(bp)) { - xfs_fs_mount_cmn_err(flags, "SB read failed"); - error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; - goto fail; +reread: + bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, + XFS_SB_DADDR, sector_size, 0); + if (!bp) { + xfs_fs_mount_cmn_err(flags, "SB buffer read failed"); + return EIO; } - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); /* * Initialize the mount structure from the superblock. * But first do some basic consistency checking. */ xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); - error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); if (error) { xfs_fs_mount_cmn_err(flags, "SB validate failed"); - goto fail; + goto release_buf; } /* @@ -691,7 +686,7 @@ xfs_readsb(xfs_mount_t *mp, int flags) "device supports only %u byte sectors (not %u)", sector_size, mp->m_sb.sb_sectsize); error = ENOSYS; - goto fail; + goto release_buf; } /* @@ -699,33 +694,20 @@ xfs_readsb(xfs_mount_t *mp, int flags) * re-read the superblock so the buffer is correctly sized. */ if (sector_size < mp->m_sb.sb_sectsize) { - XFS_BUF_UNMANAGE(bp); xfs_buf_relse(bp); sector_size = mp->m_sb.sb_sectsize; - bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), extra_flags); - if (!bp || XFS_BUF_ISERROR(bp)) { - xfs_fs_mount_cmn_err(flags, "SB re-read failed"); - error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; - goto fail; - } - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + goto reread; } /* Initialize per-cpu counters */ xfs_icsb_reinit_counters(mp); mp->m_sb_bp = bp; - xfs_buf_relse(bp); - ASSERT(XFS_BUF_VALUSEMA(bp) > 0); + xfs_buf_unlock(bp); return 0; - fail: - if (bp) { - XFS_BUF_UNMANAGE(bp); - xfs_buf_relse(bp); - } +release_buf: + xfs_buf_relse(bp); return error; } @@ -2005,18 +1987,13 @@ xfs_getsb( */ void xfs_freesb( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_buf_t *bp; + struct xfs_buf *bp = mp->m_sb_bp; - /* - * Use xfs_getsb() so that the buffer will be locked - * when we call xfs_buf_relse(). - */ - bp = xfs_getsb(mp, 0); - XFS_BUF_UNMANAGE(bp); - xfs_buf_relse(bp); + xfs_buf_lock(bp); mp->m_sb_bp = NULL; + xfs_buf_relse(bp); } /* From 1922c949c59f93beb560d59874bcc6d5c00115ac Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 22 Sep 2010 10:47:20 +1000 Subject: [PATCH 15/36] xfs: use unhashed buffers for size checks When we are checking we can access the last block of each device, we do not need to use cached buffers as they will be tossed away immediately. Use uncached buffers for size checks so that all IO prior to full in-memory structure initialisation does not use the buffer cache. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/xfs_fsops.c | 11 +++++------ fs/xfs/xfs_mount.c | 39 ++++++++++++++++----------------------- fs/xfs/xfs_rtalloc.c | 29 +++++++++++++---------------- 3 files changed, 34 insertions(+), 45 deletions(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 43b1d5699335..6a1edb1348f6 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -144,12 +144,11 @@ xfs_growfs_data_private( if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) return error; dpct = pct - mp->m_sb.sb_imax_pct; - error = xfs_read_buf(mp, mp->m_ddev_targp, - XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, &bp); - if (error) - return error; - ASSERT(bp); + bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, + XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), + BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); + if (!bp) + return EIO; xfs_buf_relse(bp); new = nb; /* use new as a temporary here */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index fbca293326e5..912101d280bf 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -980,42 +980,35 @@ xfs_check_sizes(xfs_mount_t *mp) { xfs_buf_t *bp; xfs_daddr_t d; - int error; d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { - cmn_err(CE_WARN, "XFS: size check 1 failed"); + cmn_err(CE_WARN, "XFS: filesystem size mismatch detected"); return XFS_ERROR(EFBIG); } - error = xfs_read_buf(mp, mp->m_ddev_targp, - d - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, &bp); - if (!error) { - xfs_buf_relse(bp); - } else { - cmn_err(CE_WARN, "XFS: size check 2 failed"); - if (error == ENOSPC) - error = XFS_ERROR(EFBIG); - return error; + bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, + d - XFS_FSS_TO_BB(mp, 1), + BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); + if (!bp) { + cmn_err(CE_WARN, "XFS: last sector read failed"); + return EIO; } + xfs_buf_relse(bp); if (mp->m_logdev_targp != mp->m_ddev_targp) { d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { - cmn_err(CE_WARN, "XFS: size check 3 failed"); + cmn_err(CE_WARN, "XFS: log size mismatch detected"); return XFS_ERROR(EFBIG); } - error = xfs_read_buf(mp, mp->m_logdev_targp, - d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp); - if (!error) { - xfs_buf_relse(bp); - } else { - cmn_err(CE_WARN, "XFS: size check 3 failed"); - if (error == ENOSPC) - error = XFS_ERROR(EFBIG); - return error; + bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp, + d - XFS_FSB_TO_BB(mp, 1), + XFS_FSB_TO_B(mp, 1), 0); + if (!bp) { + cmn_err(CE_WARN, "XFS: log device read failed"); + return EIO; } + xfs_buf_relse(bp); } return 0; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 891260fea11e..12a191385310 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -39,6 +39,7 @@ #include "xfs_trans_space.h" #include "xfs_utils.h" #include "xfs_trace.h" +#include "xfs_buf.h" /* @@ -1883,13 +1884,13 @@ xfs_growfs_rt( /* * Read in the last block of the device, make sure it exists. */ - error = xfs_read_buf(mp, mp->m_rtdev_targp, - XFS_FSB_TO_BB(mp, nrblocks - 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp); - if (error) - return error; - ASSERT(bp); + bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, + XFS_FSB_TO_BB(mp, nrblocks - 1), + XFS_FSB_TO_B(mp, 1), 0); + if (!bp) + return EIO; xfs_buf_relse(bp); + /* * Calculate new parameters. These are the final values to be reached. */ @@ -2215,7 +2216,6 @@ xfs_rtmount_init( { xfs_buf_t *bp; /* buffer for last block of subvolume */ xfs_daddr_t d; /* address of last block of subvolume */ - int error; /* error return value */ xfs_sb_t *sbp; /* filesystem superblock copy in mount */ sbp = &mp->m_sb; @@ -2242,15 +2242,12 @@ xfs_rtmount_init( (unsigned long long) mp->m_sb.sb_rblocks); return XFS_ERROR(EFBIG); } - error = xfs_read_buf(mp, mp->m_rtdev_targp, - d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp); - if (error) { - cmn_err(CE_WARN, - "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); - if (error == ENOSPC) - return XFS_ERROR(EFBIG); - return error; + bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, + d - XFS_FSB_TO_BB(mp, 1), + XFS_FSB_TO_B(mp, 1), 0); + if (!bp) { + cmn_err(CE_WARN, "XFS: realtime device size check failed"); + return EIO; } xfs_buf_relse(bp); return 0; From 69d6cc76cff3573ceefda178b75e20878866fdc3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 22 Sep 2010 10:47:20 +1000 Subject: [PATCH 16/36] xfs: remove buftarg hash for external devices For RT and external log devices, we never use hashed buffers on them now. Remove the buftarg hash tables that are set up for them. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 975d6589394a..251bcdc6352e 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1458,7 +1458,11 @@ xfs_alloc_bufhash( { unsigned int i; - btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */ + if (external) { + btp->bt_hash = NULL; + return; + } + btp->bt_hashshift = 12; /* 4096 buckets */ btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t)); for (i = 0; i < (1 << btp->bt_hashshift); i++) { From 65d0f20533c503b50bd5e7e86434512af7761eea Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 24 Sep 2010 18:40:15 +1000 Subject: [PATCH 17/36] xfs: split inode AG walking into separate code for reclaim The reclaim walk requires different locking and has a slightly different walk algorithm, so separate it out so that it can be optimised separately. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/linux-2.6/xfs_sync.c | 202 +++++++++++++++------------------ fs/xfs/linux-2.6/xfs_sync.h | 2 +- fs/xfs/linux-2.6/xfs_trace.h | 2 +- fs/xfs/quota/xfs_qm_syscalls.c | 3 +- fs/xfs/xfs_mount.c | 26 +++++ fs/xfs/xfs_mount.h | 2 + 6 files changed, 122 insertions(+), 115 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 3a1d229b4784..b5cdf0ef39ec 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -40,78 +40,46 @@ #include -STATIC xfs_inode_t * -xfs_inode_ag_lookup( - struct xfs_mount *mp, - struct xfs_perag *pag, - uint32_t *first_index, - int tag) -{ - int nr_found; - struct xfs_inode *ip; - - /* - * use a gang lookup to find the next inode in the tree - * as the tree is sparse and a gang lookup walks to find - * the number of objects requested. - */ - if (tag == XFS_ICI_NO_TAG) { - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, - (void **)&ip, *first_index, 1); - } else { - nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, - (void **)&ip, *first_index, 1, tag); - } - if (!nr_found) - return NULL; - - /* - * Update the index for the next lookup. Catch overflows - * into the next AG range which can occur if we have inodes - * in the last block of the AG and we are currently - * pointing to the last inode. - */ - *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); - if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - return NULL; - return ip; -} - STATIC int xfs_inode_ag_walk( struct xfs_mount *mp, struct xfs_perag *pag, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags, - int tag, - int exclusive, - int *nr_to_scan) + int flags) { uint32_t first_index; int last_error = 0; int skipped; + int done; restart: + done = 0; skipped = 0; first_index = 0; do { int error = 0; + int nr_found; xfs_inode_t *ip; - if (exclusive) - write_lock(&pag->pag_ici_lock); - else - read_lock(&pag->pag_ici_lock); - ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); - if (!ip) { - if (exclusive) - write_unlock(&pag->pag_ici_lock); - else - read_unlock(&pag->pag_ici_lock); + read_lock(&pag->pag_ici_lock); + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)&ip, first_index, 1); + if (!nr_found) { + read_unlock(&pag->pag_ici_lock); break; } + /* + * Update the index for the next lookup. Catch overflows + * into the next AG range which can occur if we have inodes + * in the last block of the AG and we are currently + * pointing to the last inode. + */ + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + /* execute releases pag->pag_ici_lock */ error = execute(ip, pag, flags); if (error == EAGAIN) { @@ -125,7 +93,7 @@ restart: if (error == EFSCORRUPTED) break; - } while ((*nr_to_scan)--); + } while (!done); if (skipped) { delay(1); @@ -134,73 +102,29 @@ restart: return last_error; } -/* - * Select the next per-ag structure to iterate during the walk. The reclaim - * walk is optimised only to walk AGs with reclaimable inodes in them. - */ -static struct xfs_perag * -xfs_inode_ag_iter_next_pag( - struct xfs_mount *mp, - xfs_agnumber_t *first, - int tag) -{ - struct xfs_perag *pag = NULL; - - if (tag == XFS_ICI_RECLAIM_TAG) { - int found; - int ref; - - rcu_read_lock(); - found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, - (void **)&pag, *first, 1, tag); - if (found <= 0) { - rcu_read_unlock(); - return NULL; - } - *first = pag->pag_agno + 1; - /* open coded pag reference increment */ - ref = atomic_inc_return(&pag->pag_ref); - rcu_read_unlock(); - trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_); - } else { - pag = xfs_perag_get(mp, *first); - (*first)++; - } - return pag; -} - int xfs_inode_ag_iterator( struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags, - int tag, - int exclusive, - int *nr_to_scan) + int flags) { struct xfs_perag *pag; int error = 0; int last_error = 0; xfs_agnumber_t ag; - int nr; - nr = nr_to_scan ? *nr_to_scan : INT_MAX; ag = 0; - while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) { - error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, - exclusive, &nr); + while ((pag = xfs_perag_get(mp, ag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_ag_walk(mp, pag, execute, flags); xfs_perag_put(pag); if (error) { last_error = error; if (error == EFSCORRUPTED) break; } - if (nr <= 0) - break; } - if (nr_to_scan) - *nr_to_scan = nr; return XFS_ERROR(last_error); } @@ -318,8 +242,7 @@ xfs_sync_data( ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); - error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, - XFS_ICI_NO_TAG, 0, NULL); + error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags); if (error) return XFS_ERROR(error); @@ -337,8 +260,7 @@ xfs_sync_attr( { ASSERT((flags & ~SYNC_WAIT) == 0); - return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, - XFS_ICI_NO_TAG, 0, NULL); + return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags); } STATIC int @@ -868,13 +790,72 @@ reclaim: } +/* + * Walk the AGs and reclaim the inodes in them. Even if the filesystem is + * corrupted, we still want to try to reclaim all the inodes. If we don't, + * then a shut down during filesystem unmount reclaim walk leak all the + * unreclaimed inodes. + */ +int +xfs_reclaim_inodes_ag( + struct xfs_mount *mp, + int flags, + int *nr_to_scan) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + + ag = 0; + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + unsigned long first_index = 0; + int done = 0; + + ag = pag->pag_agno + 1; + + do { + struct xfs_inode *ip; + int nr_found; + + write_lock(&pag->pag_ici_lock); + nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, + (void **)&ip, first_index, 1, + XFS_ICI_RECLAIM_TAG); + if (!nr_found) { + write_unlock(&pag->pag_ici_lock); + break; + } + + /* + * Update the index for the next lookup. Catch overflows + * into the next AG range which can occur if we have inodes + * in the last block of the AG and we are currently + * pointing to the last inode. + */ + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + + error = xfs_reclaim_inode(ip, pag, flags); + if (error && last_error != EFSCORRUPTED) + last_error = error; + + } while (!done && (*nr_to_scan)--); + + xfs_perag_put(pag); + } + return XFS_ERROR(last_error); +} + int xfs_reclaim_inodes( xfs_mount_t *mp, int mode) { - return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, - XFS_ICI_RECLAIM_TAG, 1, NULL); + int nr_to_scan = INT_MAX; + + return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); } /* @@ -896,17 +877,16 @@ xfs_reclaim_inode_shrink( if (!(gfp_mask & __GFP_FS)) return -1; - xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, - XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); - /* if we don't exhaust the scan, don't bother coming back */ + xfs_reclaim_inodes_ag(mp, 0, &nr_to_scan); + /* terminate if we don't exhaust the scan */ if (nr_to_scan > 0) return -1; } reclaimable = 0; ag = 0; - while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, - XFS_ICI_RECLAIM_TAG))) { + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + ag = pag->pag_agno + 1; reclaimable += pag->pag_ici_reclaimable; xfs_perag_put(pag); } diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index fe78726196f8..e8a352896d20 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -50,7 +50,7 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags, int tag, int write_lock, int *nr_to_scan); + int flags); void xfs_inode_shrinker_register(struct xfs_mount *mp); void xfs_inode_shrinker_unregister(struct xfs_mount *mp); diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index 2a1d4fbd9ed8..286dc201c5b9 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -124,7 +124,7 @@ DEFINE_EVENT(xfs_perag_class, name, \ unsigned long caller_ip), \ TP_ARGS(mp, agno, refcount, caller_ip)) DEFINE_PERAG_REF_EVENT(xfs_perag_get); -DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 7a71336f7922..ac11fbef37fc 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -918,8 +918,7 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, - XFS_ICI_NO_TAG, 0, NULL); + xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags); } /*------------------------------------------------------------------------*/ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 912101d280bf..d66e87c7c3a6 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -219,6 +219,32 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) return pag; } +/* + * search from @first to find the next perag with the given tag set. + */ +struct xfs_perag * +xfs_perag_get_tag( + struct xfs_mount *mp, + xfs_agnumber_t first, + int tag) +{ + struct xfs_perag *pag; + int found; + int ref; + + rcu_read_lock(); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, first, 1, tag); + if (found <= 0) { + rcu_read_unlock(); + return NULL; + } + ref = atomic_inc_return(&pag->pag_ref); + rcu_read_unlock(); + trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); + return pag; +} + void xfs_perag_put(struct xfs_perag *pag) { diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 622da2179a57..7ab240930ba5 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -327,6 +327,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) * perag get/put wrappers for ref counting */ struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); +struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, + int tag); void xfs_perag_put(struct xfs_perag *pag); /* From e13de955ca67b0bd1cec9a2f9352a3053065bf7f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 28 Sep 2010 12:28:06 +1000 Subject: [PATCH 18/36] xfs: split out inode walk inode grabbing When doing read side inode cache walks, the code to validate and grab an inode is common to all callers. Split it out of the execute callbacks in preparation for batching lookups. Similarly, split out the inode reference dropping from the execute callbacks into the main lookup look to be symmetric with the grab. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/linux-2.6/xfs_sync.c | 79 +++++++++++++++------------------- fs/xfs/quota/xfs_qm_syscalls.c | 9 ---- 2 files changed, 34 insertions(+), 54 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index b5cdf0ef39ec..37fc2c0a4d25 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -39,6 +39,33 @@ #include #include +STATIC int +xfs_inode_ag_walk_grab( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + + /* nothing to sync during shutdown */ + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return EFSCORRUPTED; + + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + return ENOENT; + + /* If we can't grab the inode, it must on it's way to reclaim. */ + if (!igrab(inode)) + return ENOENT; + + if (is_bad_inode(inode)) { + IRELE(ip); + return ENOENT; + } + + /* inode is valid */ + return 0; +} + STATIC int xfs_inode_ag_walk( @@ -80,8 +107,14 @@ restart: if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) done = 1; - /* execute releases pag->pag_ici_lock */ + if (xfs_inode_ag_walk_grab(ip)) { + read_unlock(&pag->pag_ici_lock); + continue; + } + read_unlock(&pag->pag_ici_lock); + error = execute(ip, pag, flags); + IRELE(ip); if (error == EAGAIN) { skipped++; continue; @@ -128,40 +161,6 @@ xfs_inode_ag_iterator( return XFS_ERROR(last_error); } -/* must be called with pag_ici_lock held and releases it */ -int -xfs_sync_inode_valid( - struct xfs_inode *ip, - struct xfs_perag *pag) -{ - struct inode *inode = VFS_I(ip); - int error = EFSCORRUPTED; - - /* nothing to sync during shutdown */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - goto out_unlock; - - /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - error = ENOENT; - if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) - goto out_unlock; - - /* If we can't grab the inode, it must on it's way to reclaim. */ - if (!igrab(inode)) - goto out_unlock; - - if (is_bad_inode(inode)) { - IRELE(ip); - goto out_unlock; - } - - /* inode is valid */ - error = 0; -out_unlock: - read_unlock(&pag->pag_ici_lock); - return error; -} - STATIC int xfs_sync_inode_data( struct xfs_inode *ip, @@ -172,10 +171,6 @@ xfs_sync_inode_data( struct address_space *mapping = inode->i_mapping; int error = 0; - error = xfs_sync_inode_valid(ip, pag); - if (error) - return error; - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) goto out_wait; @@ -192,7 +187,6 @@ xfs_sync_inode_data( out_wait: if (flags & SYNC_WAIT) xfs_ioend_wait(ip); - IRELE(ip); return error; } @@ -204,10 +198,6 @@ xfs_sync_inode_attr( { int error = 0; - error = xfs_sync_inode_valid(ip, pag); - if (error) - return error; - xfs_ilock(ip, XFS_ILOCK_SHARED); if (xfs_inode_clean(ip)) goto out_unlock; @@ -226,7 +216,6 @@ xfs_sync_inode_attr( out_unlock: xfs_iunlock(ip, XFS_ILOCK_SHARED); - IRELE(ip); return error; } diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index ac11fbef37fc..57847434fa51 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -875,21 +875,14 @@ xfs_dqrele_inode( struct xfs_perag *pag, int flags) { - int error; - /* skip quota inodes */ if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || ip == ip->i_mount->m_quotainfo->qi_gquotaip) { ASSERT(ip->i_udquot == NULL); ASSERT(ip->i_gdquot == NULL); - read_unlock(&pag->pag_ici_lock); return 0; } - error = xfs_sync_inode_valid(ip, pag); - if (error) - return error; - xfs_ilock(ip, XFS_ILOCK_EXCL); if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { xfs_qm_dqrele(ip->i_udquot); @@ -900,8 +893,6 @@ xfs_dqrele_inode( ip->i_gdquot = NULL; } xfs_iunlock(ip, XFS_ILOCK_EXCL); - - IRELE(ip); return 0; } From 78ae5256768b91f25ce7a4eb9f56d563e302cc10 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 28 Sep 2010 12:28:19 +1000 Subject: [PATCH 19/36] xfs: implement batched inode lookups for AG walking With the reclaim code separated from the generic walking code, it is simple to implement batched lookups for the generic walk code. Separate out the inode validation from the execute operations and modify the tree lookups to get a batch of inodes at a time. Reclaim operations will be optimised separately. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/linux-2.6/xfs_sync.c | 66 ++++++++++++++++++++++++------------- fs/xfs/linux-2.6/xfs_sync.h | 2 +- 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 37fc2c0a4d25..0ed3d0ae3c28 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -39,6 +39,14 @@ #include #include +/* + * The inode lookup is done in batches to keep the amount of lock traffic and + * radix tree lookups to a minimum. The batch size is a trade off between + * lookup reduction and stack usage. This is in the reclaim path, so we can't + * be too greedy. + */ +#define XFS_LOOKUP_BATCH 32 + STATIC int xfs_inode_ag_walk_grab( struct xfs_inode *ip) @@ -66,7 +74,6 @@ xfs_inode_ag_walk_grab( return 0; } - STATIC int xfs_inode_ag_walk( struct xfs_mount *mp, @@ -79,54 +86,69 @@ xfs_inode_ag_walk( int last_error = 0; int skipped; int done; + int nr_found; restart: done = 0; skipped = 0; first_index = 0; + nr_found = 0; do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; int error = 0; - int nr_found; - xfs_inode_t *ip; + int i; read_lock(&pag->pag_ici_lock); nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, - (void **)&ip, first_index, 1); + (void **)batch, first_index, + XFS_LOOKUP_BATCH); if (!nr_found) { read_unlock(&pag->pag_ici_lock); break; } /* - * Update the index for the next lookup. Catch overflows - * into the next AG range which can occur if we have inodes - * in the last block of the AG and we are currently - * pointing to the last inode. + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. */ - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); - if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - done = 1; + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; - if (xfs_inode_ag_walk_grab(ip)) { - read_unlock(&pag->pag_ici_lock); - continue; + if (done || xfs_inode_ag_walk_grab(ip)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch overflows + * into the next AG range which can occur if we have inodes + * in the last block of the AG and we are currently + * pointing to the last inode. + */ + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; } + + /* unlock now we've grabbed the inodes. */ read_unlock(&pag->pag_ici_lock); - error = execute(ip, pag, flags); - IRELE(ip); - if (error == EAGAIN) { - skipped++; - continue; + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = execute(batch[i], pag, flags); + IRELE(batch[i]); + if (error == EAGAIN) { + skipped++; + continue; + } + if (error && last_error != EFSCORRUPTED) + last_error = error; } - if (error) - last_error = error; /* bail out if the filesystem is corrupted. */ if (error == EFSCORRUPTED) break; - } while (!done); + } while (nr_found && !done); if (skipped) { delay(1); diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index e8a352896d20..32ba6628290c 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -47,7 +47,7 @@ void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, struct xfs_inode *ip); -int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); +int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), int flags); From e3a20c0b02e1704ab115dfa9d012caf0fbc45ed0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 24 Sep 2010 19:51:50 +1000 Subject: [PATCH 20/36] xfs: batch inode reclaim lookup Batch and optimise the per-ag inode lookup for reclaim to minimise scanning overhead. This involves gang lookups on the radix trees to get multiple inodes during each tree walk, and tighter validation of what inodes can be reclaimed without blocking befor we take any locks. This is based on ideas suggested in a proof-of-concept patch posted by Nick Piggin. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/linux-2.6/xfs_sync.c | 110 +++++++++++++++++++++++++----------- 1 file changed, 77 insertions(+), 33 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 0ed3d0ae3c28..754bc591a247 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -630,6 +630,43 @@ __xfs_inode_clear_reclaim_tag( __xfs_inode_clear_reclaim(pag, ip); } +/* + * Grab the inode for reclaim exclusively. + * Return 0 if we grabbed it, non-zero otherwise. + */ +STATIC int +xfs_reclaim_inode_grab( + struct xfs_inode *ip, + int flags) +{ + + /* + * do some unlocked checks first to avoid unnecceary lock traffic. + * The first is a flush lock check, the second is a already in reclaim + * check. Only do these checks if we are not going to block on locks. + */ + if ((flags & SYNC_TRYLOCK) && + (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { + return 1; + } + + /* + * The radix tree lock here protects a thread in xfs_iget from racing + * with us starting reclaim on the inode. Once we have the + * XFS_IRECLAIM flag set it will not touch us. + */ + spin_lock(&ip->i_flags_lock); + ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); + if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* ignore as it is already under reclaim */ + spin_unlock(&ip->i_flags_lock); + return 1; + } + __xfs_iflags_set(ip, XFS_IRECLAIM); + spin_unlock(&ip->i_flags_lock); + return 0; +} + /* * Inodes in different states need to be treated differently, and the return * value of xfs_iflush is not sufficient to get this right. The following table @@ -688,23 +725,6 @@ xfs_reclaim_inode( { int error = 0; - /* - * The radix tree lock here protects a thread in xfs_iget from racing - * with us starting reclaim on the inode. Once we have the - * XFS_IRECLAIM flag set it will not touch us. - */ - spin_lock(&ip->i_flags_lock); - ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); - if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { - /* ignore as it is already under reclaim */ - spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); - return 0; - } - __xfs_iflags_set(ip, XFS_IRECLAIM); - spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); - xfs_ilock(ip, XFS_ILOCK_EXCL); if (!xfs_iflock_nowait(ip)) { if (!(sync_mode & SYNC_WAIT)) @@ -822,16 +842,19 @@ xfs_reclaim_inodes_ag( while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { unsigned long first_index = 0; int done = 0; + int nr_found = 0; ag = pag->pag_agno + 1; do { - struct xfs_inode *ip; - int nr_found; + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int i; write_lock(&pag->pag_ici_lock); - nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, - (void **)&ip, first_index, 1, + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH, XFS_ICI_RECLAIM_TAG); if (!nr_found) { write_unlock(&pag->pag_ici_lock); @@ -839,20 +862,41 @@ xfs_reclaim_inodes_ag( } /* - * Update the index for the next lookup. Catch overflows - * into the next AG range which can occur if we have inodes - * in the last block of the AG and we are currently - * pointing to the last inode. + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. */ - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); - if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - done = 1; + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; - error = xfs_reclaim_inode(ip, pag, flags); - if (error && last_error != EFSCORRUPTED) - last_error = error; + if (done || xfs_reclaim_inode_grab(ip, flags)) + batch[i] = NULL; - } while (!done && (*nr_to_scan)--); + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can + * occur if we have inodes in the last block of + * the AG and we are currently pointing to the + * last inode. + */ + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + } + + /* unlock now we've grabbed the inodes. */ + write_unlock(&pag->pag_ici_lock); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = xfs_reclaim_inode(batch[i], pag, flags); + if (error && last_error != EFSCORRUPTED) + last_error = error; + } + + *nr_to_scan -= XFS_LOOKUP_BATCH; + + } while (nr_found && !done && *nr_to_scan > 0); xfs_perag_put(pag); } @@ -888,7 +932,7 @@ xfs_reclaim_inode_shrink( if (!(gfp_mask & __GFP_FS)) return -1; - xfs_reclaim_inodes_ag(mp, 0, &nr_to_scan); + xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan); /* terminate if we don't exhaust the scan */ if (nr_to_scan > 0) return -1; From 69b491c214d7fd4d4df972ae5377be99ca3753db Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 27 Sep 2010 11:09:51 +1000 Subject: [PATCH 21/36] xfs: serialise inode reclaim within an AG Memory reclaim via shrinkers has a terrible habit of having N+M concurrent shrinker executions (N = num CPUs, M = num kswapds) all trying to shrink the same cache. When the cache they are all working on is protected by a single spinlock, massive contention an slowdowns occur. Wrap the per-ag inode caches with a reclaim mutex to serialise reclaim access to the AG. This will block concurrent reclaim in each AG but still allow reclaim to scan multiple AGs concurrently. Allow shrinkers to move on to the next AG if it can't get the lock, and if we can't get any AG, then start blocking on locks. To prevent reclaimers from continually scanning the same inodes in each AG, add a cursor that tracks where the last reclaim got up to and start from that point on the next reclaim. This should avoid only ever scanning a small number of inodes at the satart of each AG and not making progress. If we have a non-shrinker based reclaim pass, ignore the cursor and reset it to zero once we are done. Signed-off-by: Dave Chinner Reviewed-by: Alex Elder --- fs/xfs/linux-2.6/xfs_sync.c | 30 ++++++++++++++++++++++++++++++ fs/xfs/xfs_ag.h | 2 ++ fs/xfs/xfs_mount.c | 1 + 3 files changed, 33 insertions(+) diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 754bc591a247..37d33254981d 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -837,8 +837,12 @@ xfs_reclaim_inodes_ag( int error = 0; int last_error = 0; xfs_agnumber_t ag; + int trylock = flags & SYNC_TRYLOCK; + int skipped; +restart: ag = 0; + skipped = 0; while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { unsigned long first_index = 0; int done = 0; @@ -846,6 +850,15 @@ xfs_reclaim_inodes_ag( ag = pag->pag_agno + 1; + if (trylock) { + if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { + skipped++; + continue; + } + first_index = pag->pag_ici_reclaim_cursor; + } else + mutex_lock(&pag->pag_ici_reclaim_lock); + do { struct xfs_inode *batch[XFS_LOOKUP_BATCH]; int i; @@ -898,8 +911,25 @@ xfs_reclaim_inodes_ag( } while (nr_found && !done && *nr_to_scan > 0); + if (trylock && !done) + pag->pag_ici_reclaim_cursor = first_index; + else + pag->pag_ici_reclaim_cursor = 0; + mutex_unlock(&pag->pag_ici_reclaim_lock); xfs_perag_put(pag); } + + /* + * if we skipped any AG, and we still have scan count remaining, do + * another pass this time using blocking reclaim semantics (i.e + * waiting on the reclaim locks and ignoring the reclaim cursors). This + * ensure that when we get more reclaimers than AGs we block rather + * than spin trying to execute reclaim. + */ + if (trylock && skipped && *nr_to_scan > 0) { + trylock = 0; + goto restart; + } return XFS_ERROR(last_error); } diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 51c42c202bf1..baeec83d01f9 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -230,6 +230,8 @@ typedef struct xfs_perag { rwlock_t pag_ici_lock; /* incore inode lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ int pag_ici_reclaimable; /* reclaimable inodes */ + struct mutex pag_ici_reclaim_lock; /* serialisation point */ + unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ /* for rcu-safe freeing */ struct rcu_head rcu_head; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d66e87c7c3a6..59859c343e04 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -477,6 +477,7 @@ xfs_initialize_perag( pag->pag_agno = index; pag->pag_mount = mp; rwlock_init(&pag->pag_ici_lock); + mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); if (radix_tree_preload(GFP_NOFS)) From 74f75a0cb7033918eb0fa4a50df25091ac75c16e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 24 Sep 2010 19:59:04 +1000 Subject: [PATCH 22/36] xfs: convert buffer cache hash to rbtree The buffer cache hash is showing typical hash scalability problems. In large scale testing the number of cached items growing far larger than the hash can efficiently handle. Hence we need to move to a self-scaling cache indexing mechanism. I have selected rbtrees for indexing becuse they can have O(log n) search scalability, and insert and remove cost is not excessive, even on large trees. Hence we should be able to cache large numbers of buffers without incurring the excessive cache miss search penalties that the hash is imposing on us. To ensure we still have parallel access to the cache, we need multiple trees. Rather than hashing the buffers by disk address to select a tree, it seems more sensible to separate trees by typical access patterns. Most operations use buffers from within a single AG at a time, so rather than searching lots of different lists, separate the buffer indexes out into per-AG rbtrees. This means that searches during metadata operation have a much higher chance of hitting cache resident nodes, and that updates of the tree are less likely to disturb trees being accessed on other CPUs doing independent operations. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 132 ++++++++++++++++++------------------- fs/xfs/linux-2.6/xfs_buf.h | 8 +-- fs/xfs/xfs_ag.h | 4 ++ fs/xfs/xfs_mount.c | 2 + 4 files changed, 72 insertions(+), 74 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 251bcdc6352e..749d7d39d657 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -188,7 +188,7 @@ _xfs_buf_initialize( atomic_set(&bp->b_hold, 1); init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_list); - INIT_LIST_HEAD(&bp->b_hash_list); + RB_CLEAR_NODE(&bp->b_rbnode); init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ XB_SET_OWNER(bp); bp->b_target = target; @@ -262,8 +262,6 @@ xfs_buf_free( { trace_xfs_buf_free(bp, _RET_IP_); - ASSERT(list_empty(&bp->b_hash_list)); - if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { uint i; @@ -422,8 +420,10 @@ _xfs_buf_find( { xfs_off_t range_base; size_t range_length; - xfs_bufhash_t *hash; - xfs_buf_t *bp, *n; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent; + xfs_buf_t *bp; range_base = (ioff << BBSHIFT); range_length = (isize << BBSHIFT); @@ -432,14 +432,37 @@ _xfs_buf_find( ASSERT(!(range_length < (1 << btp->bt_sshift))); ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); - hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; + /* get tree root */ + pag = xfs_perag_get(btp->bt_mount, + xfs_daddr_to_agno(btp->bt_mount, ioff)); - spin_lock(&hash->bh_lock); + /* walk tree */ + spin_lock(&pag->pag_buf_lock); + rbp = &pag->pag_buf_tree.rb_node; + parent = NULL; + bp = NULL; + while (*rbp) { + parent = *rbp; + bp = rb_entry(parent, struct xfs_buf, b_rbnode); - list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { - ASSERT(btp == bp->b_target); - if (bp->b_file_offset == range_base && - bp->b_buffer_length == range_length) { + if (range_base < bp->b_file_offset) + rbp = &(*rbp)->rb_left; + else if (range_base > bp->b_file_offset) + rbp = &(*rbp)->rb_right; + else { + /* + * found a block offset match. If the range doesn't + * match, the only way this is allowed is if the buffer + * in the cache is stale and the transaction that made + * it stale has not yet committed. i.e. we are + * reallocating a busy extent. Skip this buffer and + * continue searching to the right for an exact match. + */ + if (bp->b_buffer_length != range_length) { + ASSERT(bp->b_flags & XBF_STALE); + rbp = &(*rbp)->rb_right; + continue; + } atomic_inc(&bp->b_hold); goto found; } @@ -449,17 +472,21 @@ _xfs_buf_find( if (new_bp) { _xfs_buf_initialize(new_bp, btp, range_base, range_length, flags); - new_bp->b_hash = hash; - list_add(&new_bp->b_hash_list, &hash->bh_list); + rb_link_node(&new_bp->b_rbnode, parent, rbp); + rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); + /* the buffer keeps the perag reference until it is freed */ + new_bp->b_pag = pag; + spin_unlock(&pag->pag_buf_lock); } else { XFS_STATS_INC(xb_miss_locked); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); } - - spin_unlock(&hash->bh_lock); return new_bp; found: - spin_unlock(&hash->bh_lock); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); /* Attempt to get the semaphore without sleeping, * if this does not work then we need to drop the @@ -809,27 +836,30 @@ void xfs_buf_rele( xfs_buf_t *bp) { - xfs_bufhash_t *hash = bp->b_hash; + struct xfs_perag *pag = bp->b_pag; trace_xfs_buf_rele(bp, _RET_IP_); - if (unlikely(!hash)) { + if (!pag) { ASSERT(!bp->b_relse); + ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); if (atomic_dec_and_test(&bp->b_hold)) xfs_buf_free(bp); return; } + ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); ASSERT(atomic_read(&bp->b_hold) > 0); - if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { + if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { if (bp->b_relse) { atomic_inc(&bp->b_hold); - spin_unlock(&hash->bh_lock); - (*(bp->b_relse)) (bp); + spin_unlock(&pag->pag_buf_lock); + bp->b_relse(bp); } else { ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); - list_del_init(&bp->b_hash_list); - spin_unlock(&hash->bh_lock); + rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); xfs_buf_free(bp); } } @@ -1429,56 +1459,24 @@ xfs_buf_iomove( */ void xfs_wait_buftarg( - xfs_buftarg_t *btp) + struct xfs_buftarg *btp) { - xfs_bufhash_t *hash; - uint i; + struct xfs_perag *pag; + uint i; - for (i = 0; i < (1 << btp->bt_hashshift); i++) { - hash = &btp->bt_hash[i]; - spin_lock(&hash->bh_lock); - while (!list_empty(&hash->bh_list)) { - spin_unlock(&hash->bh_lock); + for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) { + pag = xfs_perag_get(btp->bt_mount, i); + spin_lock(&pag->pag_buf_lock); + while (rb_first(&pag->pag_buf_tree)) { + spin_unlock(&pag->pag_buf_lock); delay(100); - spin_lock(&hash->bh_lock); + spin_lock(&pag->pag_buf_lock); } - spin_unlock(&hash->bh_lock); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); } } -/* - * Allocate buffer hash table for a given target. - * For devices containing metadata (i.e. not the log/realtime devices) - * we need to allocate a much larger hash table. - */ -STATIC void -xfs_alloc_bufhash( - xfs_buftarg_t *btp, - int external) -{ - unsigned int i; - - if (external) { - btp->bt_hash = NULL; - return; - } - btp->bt_hashshift = 12; /* 4096 buckets */ - btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * - sizeof(xfs_bufhash_t)); - for (i = 0; i < (1 << btp->bt_hashshift); i++) { - spin_lock_init(&btp->bt_hash[i].bh_lock); - INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); - } -} - -STATIC void -xfs_free_bufhash( - xfs_buftarg_t *btp) -{ - kmem_free_large(btp->bt_hash); - btp->bt_hash = NULL; -} - /* * buftarg list for delwrite queue processing */ @@ -1511,7 +1509,6 @@ xfs_free_buftarg( xfs_flush_buftarg(btp, 1); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); - xfs_free_bufhash(btp); iput(btp->bt_mapping->host); /* Unregister the buftarg first so that we don't get a @@ -1651,7 +1648,6 @@ xfs_alloc_buftarg( goto error; if (xfs_alloc_delwrite_queue(btp, fsname)) goto error; - xfs_alloc_bufhash(btp, external); return btp; error: diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 1f109cee136c..c79882ecb513 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -135,10 +135,6 @@ typedef struct xfs_buftarg { unsigned int bt_sshift; size_t bt_smask; - /* per device buffer hash table */ - uint bt_hashshift; - xfs_bufhash_t *bt_hash; - /* per device delwri queue */ struct task_struct *bt_task; struct list_head bt_list; @@ -172,8 +168,8 @@ typedef struct xfs_buf { wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; xfs_buf_flags_t b_flags; /* status flags */ - struct list_head b_hash_list; /* hash table list */ - xfs_bufhash_t *b_hash; /* hash table list start */ + struct rb_node b_rbnode; /* rbtree node */ + struct xfs_perag *b_pag; /* contains rbtree root */ xfs_buftarg_t *b_target; /* buffer target (device) */ atomic_t b_hold; /* reference count */ xfs_daddr_t b_bn; /* block number for I/O */ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index baeec83d01f9..63c7a1a6c022 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -233,6 +233,10 @@ typedef struct xfs_perag { struct mutex pag_ici_reclaim_lock; /* serialisation point */ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ + /* buffer cache index */ + spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ + struct rb_root pag_buf_tree; /* ordered tree of active buffers */ + /* for rcu-safe freeing */ struct rcu_head rcu_head; #endif diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 59859c343e04..cfa2fb4e7f97 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -479,6 +479,8 @@ xfs_initialize_perag( rwlock_init(&pag->pag_ici_lock); mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + spin_lock_init(&pag->pag_buf_lock); + pag->pag_buf_tree = RB_ROOT; if (radix_tree_preload(GFP_NOFS)) goto out_unwind; From 50f59e8eed85ec4c79bc2454ed50c7886f6c5ebf Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 24 Sep 2010 19:59:15 +1000 Subject: [PATCH 23/36] xfs: pack xfs_buf structure more tightly pahole reports the struct xfs_buf has quite a few holes in it, so packing the structure better will reduce the size of it by 16 bytes. Also, move all the fields used in cache lookups into the first cacheline. Before on x86_64: /* size: 320, cachelines: 5 */ /* sum members: 298, holes: 6, sum holes: 22 */ After on x86_64: /* size: 304, cachelines: 5 */ /* padding: 6 */ /* last cacheline: 48 bytes */ Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.h | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index c79882ecb513..161333785f69 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -162,33 +162,41 @@ typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *); #define XB_PAGES 2 typedef struct xfs_buf { - struct semaphore b_sema; /* semaphore for lockables */ - unsigned long b_queuetime; /* time buffer was queued */ - atomic_t b_pin_count; /* pin count */ - wait_queue_head_t b_waiters; /* unpin waiters */ - struct list_head b_list; - xfs_buf_flags_t b_flags; /* status flags */ + /* + * first cacheline holds all the fields needed for an uncontended cache + * hit to be fully processed. The semaphore straddles the cacheline + * boundary, but the counter and lock sits on the first cacheline, + * which is the only bit that is touched if we hit the semaphore + * fast-path on locking. + */ struct rb_node b_rbnode; /* rbtree node */ - struct xfs_perag *b_pag; /* contains rbtree root */ - xfs_buftarg_t *b_target; /* buffer target (device) */ - atomic_t b_hold; /* reference count */ - xfs_daddr_t b_bn; /* block number for I/O */ xfs_off_t b_file_offset; /* offset in file */ size_t b_buffer_length;/* size of buffer in bytes */ + atomic_t b_hold; /* reference count */ + xfs_buf_flags_t b_flags; /* status flags */ + struct semaphore b_sema; /* semaphore for lockables */ + + wait_queue_head_t b_waiters; /* unpin waiters */ + struct list_head b_list; + struct xfs_perag *b_pag; /* contains rbtree root */ + xfs_buftarg_t *b_target; /* buffer target (device) */ + xfs_daddr_t b_bn; /* block number for I/O */ size_t b_count_desired;/* desired transfer size */ void *b_addr; /* virtual address of buffer */ struct work_struct b_iodone_work; - atomic_t b_io_remaining; /* #outstanding I/O requests */ xfs_buf_iodone_t b_iodone; /* I/O completion function */ xfs_buf_relse_t b_relse; /* releasing function */ struct completion b_iowait; /* queue for I/O waiters */ void *b_fspriv; void *b_fspriv2; - unsigned short b_error; /* error code on I/O */ - unsigned int b_page_count; /* size of page array */ - unsigned int b_offset; /* page offset in first page */ struct page **b_pages; /* array of page pointers */ struct page *b_page_array[XB_PAGES]; /* inline pages */ + unsigned long b_queuetime; /* time buffer was queued */ + atomic_t b_pin_count; /* pin count */ + atomic_t b_io_remaining; /* #outstanding I/O requests */ + unsigned int b_page_count; /* size of page array */ + unsigned int b_offset; /* page offset in first page */ + unsigned short b_error; /* error code on I/O */ #ifdef XFS_BUF_LOCK_TRACKING int b_last_holder; #endif From 61ba35dea0593fbc8d062cab3e4c4c3da5ce7104 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Sep 2010 02:25:54 +0000 Subject: [PATCH 24/36] xfs: remove XFS_MOUNT_NO_PERCPU_SB Fail the mount if we can't allocate memory for the per-CPU counters. This is consistent with how we handle everything else in the mount path and makes the superblock counter modification a lot simpler. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_super.c | 6 ++++-- fs/xfs/xfs_mount.c | 40 ++++++++++++++---------------------- fs/xfs/xfs_mount.h | 2 -- 3 files changed, 19 insertions(+), 29 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 4759be4daa26..a316c7316702 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1518,8 +1518,9 @@ xfs_fs_fill_super( if (error) goto out_free_fsname; - if (xfs_icsb_init_counters(mp)) - mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; + error = xfs_icsb_init_counters(mp); + if (error) + goto out_close_devices; error = xfs_readsb(mp, flags); if (error) @@ -1580,6 +1581,7 @@ xfs_fs_fill_super( xfs_freesb(mp); out_destroy_counters: xfs_icsb_destroy_counters(mp); + out_close_devices: xfs_close_devices(mp); out_free_fsname: xfs_free_fsname(mp); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index cfa2fb4e7f97..55de83585e00 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1856,12 +1856,8 @@ xfs_mod_incore_sb( case XFS_SBS_ICOUNT: case XFS_SBS_IFREE: case XFS_SBS_FDBLOCKS: - if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) { - status = xfs_icsb_modify_counters(mp, field, - delta, rsvd); - break; - } - /* FALLTHROUGH */ + status = xfs_icsb_modify_counters(mp, field, delta, rsvd); + break; #endif default: spin_lock(&mp->m_sb_lock); @@ -1910,15 +1906,12 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) case XFS_SBS_ICOUNT: case XFS_SBS_IFREE: case XFS_SBS_FDBLOCKS: - if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) { - spin_unlock(&mp->m_sb_lock); - status = xfs_icsb_modify_counters(mp, - msbp->msb_field, - msbp->msb_delta, rsvd); - spin_lock(&mp->m_sb_lock); - break; - } - /* FALLTHROUGH */ + spin_unlock(&mp->m_sb_lock); + status = xfs_icsb_modify_counters(mp, + msbp->msb_field, + msbp->msb_delta, rsvd); + spin_lock(&mp->m_sb_lock); + break; #endif default: status = xfs_mod_incore_sb_unlocked(mp, @@ -1948,16 +1941,13 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) case XFS_SBS_ICOUNT: case XFS_SBS_IFREE: case XFS_SBS_FDBLOCKS: - if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) { - spin_unlock(&mp->m_sb_lock); - status = xfs_icsb_modify_counters(mp, - msbp->msb_field, - -(msbp->msb_delta), - rsvd); - spin_lock(&mp->m_sb_lock); - break; - } - /* FALLTHROUGH */ + spin_unlock(&mp->m_sb_lock); + status = xfs_icsb_modify_counters(mp, + msbp->msb_field, + -(msbp->msb_delta), + rsvd); + spin_lock(&mp->m_sb_lock); + break; #endif default: status = xfs_mod_incore_sb_unlocked(mp, diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7ab240930ba5..a9d366e7656a 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -232,8 +232,6 @@ typedef struct xfs_mount { #define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ #define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred * I/O size in stat() */ -#define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock - counters */ #define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams allocator */ #define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ From 96540c78583a417113df4d027e6b68a595ab9a09 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Sep 2010 02:25:55 +0000 Subject: [PATCH 25/36] xfs: do not use xfs_mod_incore_sb for per-cpu counters Export xfs_icsb_modify_counters and always use it for modifying the per-cpu counters. Remove support for per-cpu counters from xfs_mod_incore_sb to simplify it. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_bmap.c | 32 ++++++++++++++++++-------------- fs/xfs/xfs_fsops.c | 3 ++- fs/xfs/xfs_mount.c | 32 +++++++++----------------------- fs/xfs/xfs_mount.h | 4 ++++ fs/xfs/xfs_trans.c | 4 ++-- 5 files changed, 35 insertions(+), 40 deletions(-) diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 5e33b7862d41..8abd12e32e13 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -614,7 +614,7 @@ xfs_bmap_add_extent( nblks += cur->bc_private.b.allocated; ASSERT(nblks <= da_old); if (nblks < da_old) - xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, + xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, (int64_t)(da_old - nblks), rsvd); } /* @@ -1079,7 +1079,8 @@ xfs_bmap_add_extent_delay_real( diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); if (diff > 0 && - xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) { + xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, + -((int64_t)diff), rsvd)) { /* * Ick gross gag me with a spoon. */ @@ -1089,16 +1090,18 @@ xfs_bmap_add_extent_delay_real( temp--; diff--; if (!diff || - !xfs_mod_incore_sb(ip->i_mount, - XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) + !xfs_icsb_modify_counters(ip->i_mount, + XFS_SBS_FDBLOCKS, + -((int64_t)diff), rsvd)) break; } if (temp2) { temp2--; diff--; if (!diff || - !xfs_mod_incore_sb(ip->i_mount, - XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) + !xfs_icsb_modify_counters(ip->i_mount, + XFS_SBS_FDBLOCKS, + -((int64_t)diff), rsvd)) break; } } @@ -1766,7 +1769,7 @@ xfs_bmap_add_extent_hole_delay( } if (oldlen != newlen) { ASSERT(oldlen > newlen); - xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, + xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, (int64_t)(oldlen - newlen), rsvd); /* * Nothing to do for disk quota accounting here. @@ -3111,9 +3114,10 @@ xfs_bmap_del_extent( * Nothing to do for disk quota accounting here. */ ASSERT(da_old >= da_new); - if (da_old > da_new) - xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new), - rsvd); + if (da_old > da_new) { + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + (int64_t)(da_old - da_new), rsvd); + } done: *logflagsp = flags; return error; @@ -4526,13 +4530,13 @@ xfs_bmapi( -((int64_t)extsz), (flags & XFS_BMAPI_RSVBLOCKS)); } else { - error = xfs_mod_incore_sb(mp, + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -((int64_t)alen), (flags & XFS_BMAPI_RSVBLOCKS)); } if (!error) { - error = xfs_mod_incore_sb(mp, + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -((int64_t)indlen), (flags & XFS_BMAPI_RSVBLOCKS)); @@ -4542,7 +4546,7 @@ xfs_bmapi( (int64_t)extsz, (flags & XFS_BMAPI_RSVBLOCKS)); else if (error) - xfs_mod_incore_sb(mp, + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, (int64_t)alen, (flags & XFS_BMAPI_RSVBLOCKS)); @@ -5206,7 +5210,7 @@ xfs_bunmapi( ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_RTBLKS); } else { - xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, (int64_t)del.br_blockcount, rsvd); (void)xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del.br_blockcount), 0, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 6a1edb1348f6..a7c116e814af 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -596,7 +596,8 @@ out: * the extra reserve blocks from the reserve..... */ int error; - error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, fdblks_delta, 0); + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + fdblks_delta, 0); if (error == ENOSPC) goto retry; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 55de83585e00..20c67ff57536 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -52,16 +52,11 @@ STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, int); STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, int); -STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t, - int64_t, int); STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); - #else #define xfs_icsb_balance_counter(mp, a, b) do { } while (0) #define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) -#define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0) - #endif static const struct { @@ -1843,28 +1838,19 @@ xfs_mod_incore_sb_unlocked( */ int xfs_mod_incore_sb( - xfs_mount_t *mp, - xfs_sb_field_t field, - int64_t delta, - int rsvd) + struct xfs_mount *mp, + xfs_sb_field_t field, + int64_t delta, + int rsvd) { - int status; + int status; - /* check for per-cpu counters */ - switch (field) { #ifdef HAVE_PERCPU_SB - case XFS_SBS_ICOUNT: - case XFS_SBS_IFREE: - case XFS_SBS_FDBLOCKS: - status = xfs_icsb_modify_counters(mp, field, delta, rsvd); - break; + ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS); #endif - default: - spin_lock(&mp->m_sb_lock); - status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); - spin_unlock(&mp->m_sb_lock); - break; - } + spin_lock(&mp->m_sb_lock); + status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); + spin_unlock(&mp->m_sb_lock); return status; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a9d366e7656a..6509c0743957 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -91,6 +91,8 @@ extern void xfs_icsb_reinit_counters(struct xfs_mount *); extern void xfs_icsb_destroy_counters(struct xfs_mount *); extern void xfs_icsb_sync_counters(struct xfs_mount *, int); extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); +extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t, + int64_t, int); #else #define xfs_icsb_init_counters(mp) (0) @@ -98,6 +100,8 @@ extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); #define xfs_icsb_reinit_counters(mp) do { } while (0) #define xfs_icsb_sync_counters(mp, flags) do { } while (0) #define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) +#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \ + xfs_mod_incore_sb(mp, field, delta, rsvd) #endif typedef struct xfs_mount { diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 1c47edaea0d2..73cf8f45725e 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -696,7 +696,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (blocks > 0) { - error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, + error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, -((int64_t)blocks), rsvd); if (error != 0) { current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); @@ -767,7 +767,7 @@ undo_log: undo_blocks: if (blocks > 0) { - (void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, + xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, (int64_t)blocks, rsvd); tp->t_blk_res = 0; } From 1b0407125f9a5be63e861eb27c8af9e32f20619c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 30 Sep 2010 02:25:56 +0000 Subject: [PATCH 26/36] xfs: do not use xfs_mod_incore_sb_batch for per-cpu counters Update the per-cpu counters manually in xfs_trans_unreserve_and_mod_sb and remove support for per-cpu counters from xfs_mod_incore_sb_batch to simplify it. And added benefit is that we don't have to take m_sb_lock for transactions that only modify per-cpu counters. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_mount.c | 118 ++++++++++++++------------------------------- fs/xfs/xfs_trans.c | 78 +++++++++++++++++++----------- 2 files changed, 87 insertions(+), 109 deletions(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 20c67ff57536..00538b7e694a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1856,98 +1856,54 @@ xfs_mod_incore_sb( } /* - * xfs_mod_incore_sb_batch() is used to change more than one field - * in the in-core superblock structure at a time. This modification - * is protected by a lock internal to this module. The fields and - * changes to those fields are specified in the array of xfs_mod_sb - * structures passed in. + * Change more than one field in the in-core superblock structure at a time. * - * Either all of the specified deltas will be applied or none of - * them will. If any modified field dips below 0, then all modifications - * will be backed out and EINVAL will be returned. + * The fields and changes to those fields are specified in the array of + * xfs_mod_sb structures passed in. Either all of the specified deltas + * will be applied or none of them will. If any modified field dips below 0, + * then all modifications will be backed out and EINVAL will be returned. + * + * Note that this function may not be used for the superblock values that + * are tracked with the in-memory per-cpu counters - a direct call to + * xfs_icsb_modify_counters is required for these. */ int -xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) +xfs_mod_incore_sb_batch( + struct xfs_mount *mp, + xfs_mod_sb_t *msb, + uint nmsb, + int rsvd) { - int status=0; - xfs_mod_sb_t *msbp; + xfs_mod_sb_t *msbp = &msb[0]; + int error = 0; /* - * Loop through the array of mod structures and apply each - * individually. If any fail, then back out all those - * which have already been applied. Do all of this within - * the scope of the m_sb_lock so that all of the changes will - * be atomic. + * Loop through the array of mod structures and apply each individually. + * If any fail, then back out all those which have already been applied. + * Do all of this within the scope of the m_sb_lock so that all of the + * changes will be atomic. */ spin_lock(&mp->m_sb_lock); - msbp = &msb[0]; for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) { - /* - * Apply the delta at index n. If it fails, break - * from the loop so we'll fall into the undo loop - * below. - */ - switch (msbp->msb_field) { -#ifdef HAVE_PERCPU_SB - case XFS_SBS_ICOUNT: - case XFS_SBS_IFREE: - case XFS_SBS_FDBLOCKS: - spin_unlock(&mp->m_sb_lock); - status = xfs_icsb_modify_counters(mp, - msbp->msb_field, - msbp->msb_delta, rsvd); - spin_lock(&mp->m_sb_lock); - break; -#endif - default: - status = xfs_mod_incore_sb_unlocked(mp, - msbp->msb_field, - msbp->msb_delta, rsvd); - break; - } + ASSERT(msbp->msb_field < XFS_SBS_ICOUNT || + msbp->msb_field > XFS_SBS_FDBLOCKS); - if (status != 0) { - break; - } - } - - /* - * If we didn't complete the loop above, then back out - * any changes made to the superblock. If you add code - * between the loop above and here, make sure that you - * preserve the value of status. Loop back until - * we step below the beginning of the array. Make sure - * we don't touch anything back there. - */ - if (status != 0) { - msbp--; - while (msbp >= msb) { - switch (msbp->msb_field) { -#ifdef HAVE_PERCPU_SB - case XFS_SBS_ICOUNT: - case XFS_SBS_IFREE: - case XFS_SBS_FDBLOCKS: - spin_unlock(&mp->m_sb_lock); - status = xfs_icsb_modify_counters(mp, - msbp->msb_field, - -(msbp->msb_delta), - rsvd); - spin_lock(&mp->m_sb_lock); - break; -#endif - default: - status = xfs_mod_incore_sb_unlocked(mp, - msbp->msb_field, - -(msbp->msb_delta), - rsvd); - break; - } - ASSERT(status == 0); - msbp--; - } + error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, + msbp->msb_delta, rsvd); + if (error) + goto unwind; } spin_unlock(&mp->m_sb_lock); - return status; + return 0; + +unwind: + while (--msbp >= msb) { + error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, + -msbp->msb_delta, rsvd); + ASSERT(error == 0); + } + spin_unlock(&mp->m_sb_lock); + return error; } /* @@ -2478,7 +2434,7 @@ xfs_icsb_balance_counter( spin_unlock(&mp->m_sb_lock); } -STATIC int +int xfs_icsb_modify_counters( xfs_mount_t *mp, xfs_sb_field_t field, diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 73cf8f45725e..5fab0e6bf86e 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1009,7 +1009,7 @@ void xfs_trans_unreserve_and_mod_sb( xfs_trans_t *tp) { - xfs_mod_sb_t msb[14]; /* If you add cases, add entries */ + xfs_mod_sb_t msb[9]; /* If you add cases, add entries */ xfs_mod_sb_t *msbp; xfs_mount_t *mp = tp->t_mountp; /* REFERENCED */ @@ -1017,55 +1017,61 @@ xfs_trans_unreserve_and_mod_sb( int rsvd; int64_t blkdelta = 0; int64_t rtxdelta = 0; + int64_t idelta = 0; + int64_t ifreedelta = 0; msbp = msb; rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; - /* calculate free blocks delta */ + /* calculate deltas */ if (tp->t_blk_res > 0) blkdelta = tp->t_blk_res; - if ((tp->t_fdblocks_delta != 0) && (xfs_sb_version_haslazysbcount(&mp->m_sb) || (tp->t_flags & XFS_TRANS_SB_DIRTY))) blkdelta += tp->t_fdblocks_delta; - if (blkdelta != 0) { - msbp->msb_field = XFS_SBS_FDBLOCKS; - msbp->msb_delta = blkdelta; - msbp++; - } - - /* calculate free realtime extents delta */ if (tp->t_rtx_res > 0) rtxdelta = tp->t_rtx_res; - if ((tp->t_frextents_delta != 0) && (tp->t_flags & XFS_TRANS_SB_DIRTY)) rtxdelta += tp->t_frextents_delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb) || + (tp->t_flags & XFS_TRANS_SB_DIRTY)) { + idelta = tp->t_icount_delta; + ifreedelta = tp->t_ifree_delta; + } + + /* apply the per-cpu counters */ + if (blkdelta) { + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + blkdelta, rsvd); + if (error) + goto out; + } + + if (idelta) { + error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, + idelta, rsvd); + if (error) + goto out_undo_fdblocks; + } + + if (ifreedelta) { + error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, + ifreedelta, rsvd); + if (error) + goto out_undo_icount; + } + + /* apply remaining deltas */ if (rtxdelta != 0) { msbp->msb_field = XFS_SBS_FREXTENTS; msbp->msb_delta = rtxdelta; msbp++; } - /* apply remaining deltas */ - - if (xfs_sb_version_haslazysbcount(&mp->m_sb) || - (tp->t_flags & XFS_TRANS_SB_DIRTY)) { - if (tp->t_icount_delta != 0) { - msbp->msb_field = XFS_SBS_ICOUNT; - msbp->msb_delta = tp->t_icount_delta; - msbp++; - } - if (tp->t_ifree_delta != 0) { - msbp->msb_field = XFS_SBS_IFREE; - msbp->msb_delta = tp->t_ifree_delta; - msbp++; - } - } - if (tp->t_flags & XFS_TRANS_SB_DIRTY) { if (tp->t_dblocks_delta != 0) { msbp->msb_field = XFS_SBS_DBLOCKS; @@ -1115,8 +1121,24 @@ xfs_trans_unreserve_and_mod_sb( if (msbp > msb) { error = xfs_mod_incore_sb_batch(tp->t_mountp, msb, (uint)(msbp - msb), rsvd); - ASSERT(error == 0); + if (error) + goto out_undo_ifreecount; } + + return; + +out_undo_ifreecount: + if (ifreedelta) + xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd); +out_undo_icount: + if (idelta) + xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd); +out_undo_fdblocks: + if (blkdelta) + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); +out: + ASSERT(error = 0); + return; } /* From d276734d937a649ff43fd197d0df7a747bd55b7e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 18:31:23 +0000 Subject: [PATCH 27/36] xfs: fix bogus m_maxagi check in xfs_iget These days inode64 should only control which AGs we allocate new inodes from, while we still try to support reading all existing inodes. To make this actually work the check ontop of xfs_iget needs to be relaxed to allow inodes in all allocation groups instead of just those that we allow allocating inodes from. Note that we can't simply remove the check - it prevents us from accessing invalid data when fed invalid inode numbers from NFS or bulkstat. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_iget.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index b1ecc6f97ade..0cdd26932d8e 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -365,8 +365,8 @@ xfs_iget( xfs_perag_t *pag; xfs_agino_t agino; - /* the radix tree exists only in inode capable AGs */ - if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi) + /* reject inode numbers outside existing AGs */ + if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) return EINVAL; /* get the perag structure and ensure that it's inode capable */ From dfe188d4283752086d48380cde40d9801c318667 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 18:41:12 +0000 Subject: [PATCH 28/36] xfs: remove unused t_callback field in struct xfs_trans Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_trans.c | 4 ---- fs/xfs/xfs_trans.h | 2 -- 2 files changed, 6 deletions(-) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 5fab0e6bf86e..50bd4acefc00 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1416,10 +1416,6 @@ xfs_trans_committed( { struct xfs_log_item_desc *lidp, *next; - /* Call the transaction's completion callback if there is one. */ - if (tp->t_callback != NULL) - tp->t_callback(tp, tp->t_callarg); - list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); xfs_trans_free_item_desc(lidp); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index e2cbe4da7d8e..246286b77a86 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -399,8 +399,6 @@ typedef struct xfs_trans { * transaction. */ struct xfs_mount *t_mountp; /* ptr to fs mount struct */ struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ - xfs_trans_callback_t t_callback; /* transaction callback */ - void *t_callarg; /* callback arg */ unsigned int t_flags; /* misc flags */ int64_t t_icount_delta; /* superblock icount change */ int64_t t_ifree_delta; /* superblock ifree change */ From 4957a449a1bce2f5095f57f84114dc038a8f08d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 18:41:13 +0000 Subject: [PATCH 29/36] xfs: fix the xfs_trans_committed Use the correct prototype for xfs_trans_committed instead of casting it. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_trans.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 50bd4acefc00..f6d956b7711e 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1411,9 +1411,10 @@ xfs_trans_item_committed( */ STATIC void xfs_trans_committed( - struct xfs_trans *tp, + void *arg, int abortflag) { + struct xfs_trans *tp = arg; struct xfs_log_item_desc *lidp, *next; list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { @@ -1543,7 +1544,7 @@ xfs_trans_commit_iclog( * running in simulation mode (the log is explicitly turned * off). */ - tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed; + tp->t_logcb.cb_func = xfs_trans_committed; tp->t_logcb.cb_arg = tp; /* From 1ae4fe6dba24ebabcd12cd0fa45cc5955394cbd8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 18:41:14 +0000 Subject: [PATCH 30/36] xfs: remove xfs_refcache.h This header has been completely unused for a couple of years. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_refcache.h | 52 ------------------------------------------- 1 file changed, 52 deletions(-) delete mode 100644 fs/xfs/xfs_refcache.h diff --git a/fs/xfs/xfs_refcache.h b/fs/xfs/xfs_refcache.h deleted file mode 100644 index 2dec79edb510..000000000000 --- a/fs/xfs/xfs_refcache.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_REFCACHE_H__ -#define __XFS_REFCACHE_H__ - -#ifdef HAVE_REFCACHE -/* - * Maximum size (in inodes) for the NFS reference cache - */ -#define XFS_REFCACHE_SIZE_MAX 512 - -struct xfs_inode; -struct xfs_mount; - -extern void xfs_refcache_insert(struct xfs_inode *); -extern void xfs_refcache_purge_ip(struct xfs_inode *); -extern void xfs_refcache_purge_mp(struct xfs_mount *); -extern void xfs_refcache_purge_some(struct xfs_mount *); -extern void xfs_refcache_resize(int); -extern void xfs_refcache_destroy(void); - -extern void xfs_refcache_iunlock(struct xfs_inode *, uint); - -#else - -#define xfs_refcache_insert(ip) do { } while (0) -#define xfs_refcache_purge_ip(ip) do { } while (0) -#define xfs_refcache_purge_mp(mp) do { } while (0) -#define xfs_refcache_purge_some(mp) do { } while (0) -#define xfs_refcache_resize(size) do { } while (0) -#define xfs_refcache_destroy() do { } while (0) - -#define xfs_refcache_iunlock(ip, flags) xfs_iunlock(ip, flags) - -#endif - -#endif /* __XFS_REFCACHE_H__ */ From 668332e5fec809bb100da619fda80e033b12b4a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 18:41:15 +0000 Subject: [PATCH 31/36] xfs: remove xfs_version.h It used to have a place when it contained an automatically generated CVS version, but these days it's entirely superflous. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_super.c | 1 - fs/xfs/linux-2.6/xfs_super.h | 1 + fs/xfs/linux-2.6/xfs_version.h | 29 ----------------------------- 3 files changed, 1 insertion(+), 30 deletions(-) delete mode 100644 fs/xfs/linux-2.6/xfs_version.h diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index a316c7316702..48d5f8206549 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -44,7 +44,6 @@ #include "xfs_buf_item.h" #include "xfs_utils.h" #include "xfs_vnodeops.h" -#include "xfs_version.h" #include "xfs_log_priv.h" #include "xfs_trans_priv.h" #include "xfs_filestream.h" diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 1ef4a4d2d997..50a3266c999e 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -62,6 +62,7 @@ extern void xfs_qm_exit(void); # define XFS_DBG_STRING "no debug" #endif +#define XFS_VERSION_STRING "SGI XFS" #define XFS_BUILD_OPTIONS XFS_ACL_STRING \ XFS_SECURITY_STRING \ XFS_REALTIME_STRING \ diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/linux-2.6/xfs_version.h deleted file mode 100644 index f8d279d7563a..000000000000 --- a/fs/xfs/linux-2.6/xfs_version.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_VERSION_H__ -#define __XFS_VERSION_H__ - -/* - * Dummy file that can contain a timestamp to put into the - * XFS init string, to help users keep track of what they're - * running - */ - -#define XFS_VERSION_STRING "SGI XFS" - -#endif /* __XFS_VERSION_H__ */ From 78a4b0961ff241c6f23b16329db0d67e97cb86a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 18:41:16 +0000 Subject: [PATCH 32/36] xfs: remove xfs_globals.h This header only provides one extern that isn't actually declared anywhere, and shadowed by a macro. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_globals.h | 23 ----------------------- fs/xfs/linux-2.6/xfs_linux.h | 1 - 2 files changed, 24 deletions(-) delete mode 100644 fs/xfs/linux-2.6/xfs_globals.h diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h deleted file mode 100644 index 69f71caf061c..000000000000 --- a/fs/xfs/linux-2.6/xfs_globals.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_GLOBALS_H__ -#define __XFS_GLOBALS_H__ - -extern uint64_t xfs_panic_mask; /* set to cause more panics */ - -#endif /* __XFS_GLOBALS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 2fa0bd9ebc7f..1a2afcd87ef1 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -86,7 +86,6 @@ #include #include #include -#include #include /* From 6c77b0ea1bdf85dfd48c20ceb10fd215a95c66e2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 18:41:17 +0000 Subject: [PATCH 33/36] xfs: remove xfs_cred.h We're not actually passing around credentials inside XFS for a while now, so remove all xfs_cred.h with it's cred_t typedef and all instances of it. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_cred.h | 28 ---------------------------- fs/xfs/linux-2.6/xfs_globals.c | 1 - fs/xfs/linux-2.6/xfs_iops.c | 4 ++-- fs/xfs/linux-2.6/xfs_linux.h | 2 +- fs/xfs/quota/xfs_qm.c | 8 +++----- fs/xfs/xfs_inode.c | 1 - fs/xfs/xfs_inode.h | 5 ++--- fs/xfs/xfs_mount.h | 1 - fs/xfs/xfs_utils.c | 5 ++--- fs/xfs/xfs_utils.h | 3 +-- fs/xfs/xfs_vnodeops.c | 12 +++++------- fs/xfs/xfs_vnodeops.h | 6 ++---- 12 files changed, 18 insertions(+), 58 deletions(-) delete mode 100644 fs/xfs/linux-2.6/xfs_cred.h diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h deleted file mode 100644 index 55bddf3b6091..000000000000 --- a/fs/xfs/linux-2.6/xfs_cred.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_CRED_H__ -#define __XFS_CRED_H__ - -#include - -/* - * Credentials - */ -typedef const struct cred cred_t; - -#endif /* __XFS_CRED_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c index 2ae8b1ccb02e..76e81cff70b9 100644 --- a/fs/xfs/linux-2.6/xfs_globals.c +++ b/fs/xfs/linux-2.6/xfs_globals.c @@ -16,7 +16,6 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_cred.h" #include "xfs_sysctl.h" /* diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index a788f016d1fa..ec858e09d546 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -189,7 +189,7 @@ xfs_vn_mknod( } xfs_dentry_to_name(&name, dentry); - error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); if (unlikely(error)) goto out_free_acl; @@ -362,7 +362,7 @@ xfs_vn_symlink( (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); xfs_dentry_to_name(&name, dentry); - error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); + error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); if (unlikely(error)) goto out; diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 1a2afcd87ef1..76ebc582dba0 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -71,6 +71,7 @@ #include #include #include +#include #include #include @@ -79,7 +80,6 @@ #include #include -#include #include #include #include diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 121f632213a7..a3f8f95d33ea 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -55,8 +55,6 @@ uint ndquot; kmem_zone_t *qm_dqzone; kmem_zone_t *qm_dqtrxzone; -static cred_t xfs_zerocr; - STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); @@ -1224,8 +1222,8 @@ xfs_qm_qino_alloc( return error; } - if ((error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, - &xfs_zerocr, 0, 1, ip, &committed))) { + error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); + if (error) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); return error; @@ -2143,7 +2141,7 @@ xfs_qm_write_sb_changes( /* - * Given an inode, a uid and gid (from cred_t) make sure that we have + * Given an inode, a uid, gid and prid make sure that we have * allocated relevant dquot(s) on disk, and that we won't exceed inode * quotas by creating this file. * This also attaches dquot(s) to the given inode after locking it, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 34798f391c49..4fc72c9e3729 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -982,7 +982,6 @@ xfs_ialloc( mode_t mode, xfs_nlink_t nlink, xfs_dev_t rdev, - cred_t *cr, xfs_prid_t prid, int okalloc, xfs_buf_t **ialloc_context, diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 54781fcd322a..ac82327ec9db 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -212,7 +212,6 @@ typedef struct xfs_icdinode { #ifdef __KERNEL__ struct bhv_desc; -struct cred; struct xfs_buf; struct xfs_bmap_free; struct xfs_bmbt_irec; @@ -456,8 +455,8 @@ void xfs_inode_free(struct xfs_inode *ip); * xfs_inode.c prototypes. */ int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, - xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t, - int, struct xfs_buf **, boolean_t *, xfs_inode_t **); + xfs_nlink_t, xfs_dev_t, xfs_prid_t, int, + struct xfs_buf **, boolean_t *, xfs_inode_t **); uint xfs_ip2xflags(struct xfs_inode *); uint xfs_dic2xflags(struct xfs_dinode *); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 6509c0743957..5861b4980740 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -53,7 +53,6 @@ typedef struct xfs_trans_reservations { #include "xfs_sync.h" -struct cred; struct log; struct xfs_mount_args; struct xfs_inode; diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 4c2ba6f6adc8..8b32d1a4c5a1 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -56,7 +56,6 @@ xfs_dir_ialloc( mode_t mode, xfs_nlink_t nlink, xfs_dev_t rdev, - cred_t *credp, prid_t prid, /* project id */ int okalloc, /* ok to allocate new space */ xfs_inode_t **ipp, /* pointer to inode; it will be @@ -93,7 +92,7 @@ xfs_dir_ialloc( * transaction commit so that no other process can steal * the inode(s) that we've just allocated. */ - code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc, + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, &ialloc_context, &call_again, &ip); /* @@ -197,7 +196,7 @@ xfs_dir_ialloc( * other allocations in this allocation group, * this call should always succeed. */ - code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, &ialloc_context, &call_again, &ip); /* diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h index f55b9678264f..456fca314933 100644 --- a/fs/xfs/xfs_utils.h +++ b/fs/xfs/xfs_utils.h @@ -19,8 +19,7 @@ #define __XFS_UTILS_H__ extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, - xfs_dev_t, cred_t *, prid_t, int, - xfs_inode_t **, int *); + xfs_dev_t, prid_t, int, xfs_inode_t **, int *); extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *); extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *); extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index b7bdc43308e4..f82c8032db52 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1256,8 +1256,7 @@ xfs_create( struct xfs_name *name, mode_t mode, xfs_dev_t rdev, - xfs_inode_t **ipp, - cred_t *credp) + xfs_inode_t **ipp) { int is_dir = S_ISDIR(mode); struct xfs_mount *mp = dp->i_mount; @@ -1363,7 +1362,7 @@ xfs_create( * entry pointing to them, but a directory also the "." entry * pointing to itself. */ - error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp, + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, resblks > 0, &ip, &committed); if (error) { if (error == ENOSPC) @@ -1936,8 +1935,7 @@ xfs_symlink( struct xfs_name *link_name, const char *target_path, mode_t mode, - xfs_inode_t **ipp, - cred_t *credp) + xfs_inode_t **ipp) { xfs_mount_t *mp = dp->i_mount; xfs_trans_t *tp; @@ -2049,8 +2047,8 @@ xfs_symlink( /* * Allocate an inode for the symlink. */ - error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), - 1, 0, credp, prid, resblks > 0, &ip, NULL); + error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, + prid, resblks > 0, &ip, NULL); if (error) { if (error == ENOSPC) goto error_return; diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index d8dfa8d0dadd..f6702927eee4 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -2,7 +2,6 @@ #define _XFS_VNODEOPS_H 1 struct attrlist_cursor_kern; -struct cred; struct file; struct iattr; struct inode; @@ -26,7 +25,7 @@ int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, - xfs_dev_t rdev, struct xfs_inode **ipp, cred_t *credp); + xfs_dev_t rdev, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, @@ -34,8 +33,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, xfs_off_t *offset, filldir_t filldir); int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, - const char *target_path, mode_t mode, struct xfs_inode **ipp, - cred_t *credp); + const char *target_path, mode_t mode, struct xfs_inode **ipp); int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); int xfs_change_file_space(struct xfs_inode *ip, int cmd, xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); From 1a1a3e97bad42e92cd2f32e81c396c8ee0bddb28 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Oct 2010 18:41:18 +0000 Subject: [PATCH 34/36] xfs: remove xfs_buf wrappers Stop having two different names for many buffer functions and use the more descriptive xfs_buf_* names directly. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 19 +++++++++---------- fs/xfs/linux-2.6/xfs_buf.h | 20 +++----------------- fs/xfs/linux-2.6/xfs_super.c | 2 +- fs/xfs/quota/xfs_qm.c | 2 +- fs/xfs/xfs_attr.c | 6 +++--- fs/xfs/xfs_btree.c | 4 ++-- fs/xfs/xfs_buf_item.c | 4 ++-- fs/xfs/xfs_da_btree.c | 2 +- fs/xfs/xfs_dir2_leaf.c | 2 +- fs/xfs/xfs_ialloc.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log_recover.c | 6 +++--- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_trans_buf.c | 2 +- fs/xfs/xfs_vnodeops.c | 4 ++-- 16 files changed, 33 insertions(+), 48 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 749d7d39d657..47ef97f46fe9 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -652,8 +652,7 @@ void xfs_buf_readahead( xfs_buftarg_t *target, xfs_off_t ioff, - size_t isize, - xfs_buf_flags_t flags) + size_t isize) { struct backing_dev_info *bdi; @@ -661,8 +660,8 @@ xfs_buf_readahead( if (bdi_read_congested(bdi)) return; - flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); - xfs_buf_read(target, ioff, isize, flags); + xfs_buf_read(target, ioff, isize, + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK); } /* @@ -691,7 +690,7 @@ xfs_buf_read_uncached( XFS_BUF_BUSY(bp); xfsbdstrat(mp, bp); - error = xfs_iowait(bp); + error = xfs_buf_iowait(bp); if (error || bp->b_error) { xfs_buf_relse(bp); return NULL; @@ -1073,7 +1072,7 @@ xfs_bdwrite( /* * Called when we want to stop a buffer from getting written or read. - * We attach the EIO error, muck with its flags, and call biodone + * We attach the EIO error, muck with its flags, and call xfs_buf_ioend * so that the proper iodone callbacks get called. */ STATIC int @@ -1090,21 +1089,21 @@ xfs_bioerror( XFS_BUF_ERROR(bp, EIO); /* - * We're calling biodone, so delete XBF_DONE flag. + * We're calling xfs_buf_ioend, so delete XBF_DONE flag. */ XFS_BUF_UNREAD(bp); XFS_BUF_UNDELAYWRITE(bp); XFS_BUF_UNDONE(bp); XFS_BUF_STALE(bp); - xfs_biodone(bp); + xfs_buf_ioend(bp, 0); return EIO; } /* * Same as xfs_bioerror, except that we are releasing the buffer - * here ourselves, and avoiding the biodone call. + * here ourselves, and avoiding the xfs_buf_ioend call. * This is meant for userdata errors; metadata bufs come with * iodone functions attached, so that we can track down errors. */ @@ -1938,7 +1937,7 @@ xfs_flush_buftarg( bp = list_first_entry(&wait_list, struct xfs_buf, b_list); list_del_init(&bp->b_list); - xfs_iowait(bp); + xfs_buf_iowait(bp); xfs_buf_relse(bp); } } diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 161333785f69..131c0ebf2c0d 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -218,8 +218,7 @@ extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); extern void xfs_buf_hold(xfs_buf_t *); -extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t, - xfs_buf_flags_t); +extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t); struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp, struct xfs_buftarg *target, xfs_daddr_t daddr, size_t length, int flags); @@ -247,6 +246,8 @@ extern int xfs_buf_iorequest(xfs_buf_t *); extern int xfs_buf_iowait(xfs_buf_t *); extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_rw_t); +#define xfs_buf_zero(bp, off, len) \ + xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) static inline int xfs_buf_geterror(xfs_buf_t *bp) { @@ -359,21 +360,6 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) xfs_buf_rele(bp); } -#define xfs_biodone(bp) xfs_buf_ioend(bp, 0) - -#define xfs_biomove(bp, off, len, data, rw) \ - xfs_buf_iomove((bp), (off), (len), (data), \ - ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ) - -#define xfs_biozero(bp, off, len) \ - xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) - -#define xfs_iowait(bp) xfs_buf_iowait(bp) - -#define xfs_baread(target, rablkno, ralen) \ - xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK) - - /* * Handling of buftargs. */ diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 48d5f8206549..fa1e40ac4b35 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -644,7 +644,7 @@ xfs_barrier_test( XFS_BUF_ORDERED(sbp); xfsbdstrat(mp, sbp); - error = xfs_iowait(sbp); + error = xfs_buf_iowait(sbp); /* * Clear all the flags we set and possible error state in the diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index a3f8f95d33ea..d109cc557bed 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -1433,7 +1433,7 @@ xfs_qm_dqiterate( rablkcnt = map[i+1].br_blockcount; rablkno = map[i+1].br_startblock; while (rablkcnt--) { - xfs_baread(mp->m_ddev_targp, + xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, rablkno), mp->m_quotainfo->qi_dqchunklen); rablkno++; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 905d390c1e5c..c86375378810 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -1986,7 +1986,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : XFS_BUF_SIZE(bp); - xfs_biomove(bp, 0, tmp, dst, XBF_READ); + xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ); xfs_buf_relse(bp); dst += tmp; valuelen -= tmp; @@ -2116,9 +2116,9 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : XFS_BUF_SIZE(bp); - xfs_biomove(bp, 0, tmp, src, XBF_WRITE); + xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); if (tmp < XFS_BUF_SIZE(bp)) - xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); + xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ return (error); } diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 735dc2e671b1..04f9cca8da7e 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -656,7 +656,7 @@ xfs_btree_reada_bufl( ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); } /* @@ -676,7 +676,7 @@ xfs_btree_reada_bufs( ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); } STATIC int diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index ee7557611b6e..2686d0d54c5b 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -973,7 +973,7 @@ xfs_buf_iodone_callbacks( xfs_buf_do_callbacks(bp, lip); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_biodone(bp); + xfs_buf_ioend(bp, 0); return; } @@ -1032,7 +1032,7 @@ xfs_buf_iodone_callbacks( xfs_buf_do_callbacks(bp, lip); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_biodone(bp); + xfs_buf_ioend(bp, 0); } /* diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 30fa0e206fba..1c00bedb3175 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2042,7 +2042,7 @@ xfs_da_do_buf( mappedbno, nmapped, 0, &bp); break; case 3: - xfs_baread(mp->m_ddev_targp, mappedbno, nmapped); + xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped); error = 0; bp = NULL; break; diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 504be8640e91..ae891223be90 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -961,7 +961,7 @@ xfs_dir2_leaf_getdents( if (i > ra_current && map[ra_index].br_blockcount >= mp->m_dirblkfsbs) { - xfs_baread(mp->m_ddev_targp, + xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map[ra_index].br_startblock + ra_offset), diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 5371d2dc360e..0626a32c3447 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -212,7 +212,7 @@ xfs_ialloc_inode_init( * to log a whole cluster of inodes instead of all the * individual transactions causing a lot of log traffic. */ - xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); + xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); for (i = 0; i < ninodes; i++) { int ioffset = i << mp->m_sb.sb_inodelog; uint isize = sizeof(struct xfs_dinode); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4fc72c9e3729..493d6b0cbef2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2724,7 +2724,7 @@ cluster_corrupt_out: XFS_BUF_UNDONE(bp); XFS_BUF_STALE(bp); XFS_BUF_ERROR(bp,EIO); - xfs_biodone(bp); + xfs_buf_ioend(bp, 0); } else { XFS_BUF_STALE(bp); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c8a309424307..f4fd49c9b987 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1310,7 +1310,7 @@ xlog_bdstrat( if (iclog->ic_state & XLOG_STATE_IOERROR) { XFS_BUF_ERROR(bp, EIO); XFS_BUF_STALE(bp); - xfs_biodone(bp); + xfs_buf_ioend(bp, 0); /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 351d71117f16..966d3f97458c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -168,7 +168,7 @@ xlog_bread_noalign( XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); xfsbdstrat(log->l_mp, bp); - error = xfs_iowait(bp); + error = xfs_buf_iowait(bp); if (error) xfs_ioerror_alert("xlog_bread", log->l_mp, bp, XFS_BUF_ADDR(bp)); @@ -328,7 +328,7 @@ xlog_recover_iodone( SHUTDOWN_META_IO_ERROR); } XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_biodone(bp); + xfs_buf_ioend(bp, 0); } /* @@ -3816,7 +3816,7 @@ xlog_do_recover( XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); xfsbdstrat(log->l_mp, bp); - error = xfs_iowait(bp); + error = xfs_buf_iowait(bp); if (error) { xfs_ioerror_alert("xlog_do_recover", log->l_mp, bp, XFS_BUF_ADDR(bp)); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 00538b7e694a..b1498ab5a399 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1607,7 +1607,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) XFS_BUF_UNASYNC(sbp); ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); xfsbdstrat(mp, sbp); - error = xfs_iowait(sbp); + error = xfs_buf_iowait(sbp); if (error) xfs_ioerror_alert("xfs_unmountfs_writesb", mp, sbp, XFS_BUF_ADDR(sbp)); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 90af025e6839..c47918c302a5 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -336,7 +336,7 @@ xfs_trans_read_buf( ASSERT(!XFS_BUF_ISASYNC(bp)); XFS_BUF_READ(bp); xfsbdstrat(tp->t_mountp, bp); - error = xfs_iowait(bp); + error = xfs_buf_iowait(bp); if (error) { xfs_ioerror_alert("xfs_trans_read_buf", mp, bp, blkno); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index f82c8032db52..f4195f6b8bc4 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2460,7 +2460,7 @@ xfs_zero_remaining_bytes( XFS_BUF_READ(bp); XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); xfsbdstrat(mp, bp); - error = xfs_iowait(bp); + error = xfs_buf_iowait(bp); if (error) { xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", mp, bp, XFS_BUF_ADDR(bp)); @@ -2473,7 +2473,7 @@ xfs_zero_remaining_bytes( XFS_BUF_UNREAD(bp); XFS_BUF_WRITE(bp); xfsbdstrat(mp, bp); - error = xfs_iowait(bp); + error = xfs_buf_iowait(bp); if (error) { xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", mp, bp, XFS_BUF_ADDR(bp)); From 6743099ce57a40509a86849a22317ed4b7516911 Mon Sep 17 00:00:00 2001 From: Arkadiusz Mi?kiewicz Date: Sun, 26 Sep 2010 06:10:18 +0000 Subject: [PATCH 35/36] xfs: Extend project quotas to support 32bit project ids MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds support for 32bit project quota identifiers. On disk format is backward compatible with 16bit projid numbers. projid on disk is now kept in two 16bit values - di_projid_lo (which holds the same position as old 16bit projid value) and new di_projid_hi (takes existing padding) and converts from/to 32bit value on the fly. xfs_admin (for existing fs), mkfs.xfs (for new fs) needs to be used to enable PROJID32BIT support. Signed-off-by: Arkadiusz Miƛkiewicz Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_ioctl.c | 14 +++++++------- fs/xfs/linux-2.6/xfs_ioctl32.c | 4 +++- fs/xfs/linux-2.6/xfs_ioctl32.h | 6 ++++-- fs/xfs/linux-2.6/xfs_linux.h | 2 +- fs/xfs/quota/xfs_qm.c | 10 +++++----- fs/xfs/quota/xfs_qm_bhv.c | 2 +- fs/xfs/quota/xfs_qm_syscalls.c | 2 +- fs/xfs/xfs_dinode.h | 5 +++-- fs/xfs/xfs_fs.h | 6 ++++-- fs/xfs/xfs_inode.c | 14 ++++++++------ fs/xfs/xfs_inode.h | 26 +++++++++++++++++++++++--- fs/xfs/xfs_itable.c | 3 ++- fs/xfs/xfs_rename.c | 2 +- fs/xfs/xfs_sb.h | 10 +++++++++- fs/xfs/xfs_types.h | 2 -- fs/xfs/xfs_vnodeops.c | 16 ++++++++-------- 16 files changed, 80 insertions(+), 44 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 10206be7a077..2ea238f6d38e 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -790,7 +790,7 @@ xfs_ioc_fsgetxattr( xfs_ilock(ip, XFS_ILOCK_SHARED); fa.fsx_xflags = xfs_ip2xflags(ip); fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; - fa.fsx_projid = ip->i_d.di_projid; + fa.fsx_projid = xfs_get_projid(ip); if (attr) { if (ip->i_afp) { @@ -909,10 +909,10 @@ xfs_ioctl_setattr( return XFS_ERROR(EIO); /* - * Disallow 32bit project ids because on-disk structure - * is 16bit only. + * Disallow 32bit project ids when projid32bit feature is not enabled. */ - if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1)) + if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) && + !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) return XFS_ERROR(EINVAL); /* @@ -961,7 +961,7 @@ xfs_ioctl_setattr( if (mask & FSX_PROJID) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && - ip->i_d.di_projid != fa->fsx_projid) { + xfs_get_projid(ip) != fa->fsx_projid) { ASSERT(tp); code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, capable(CAP_FOWNER) ? @@ -1063,12 +1063,12 @@ xfs_ioctl_setattr( * Change the ownerships and register quota modifications * in the transaction. */ - if (ip->i_d.di_projid != fa->fsx_projid) { + if (xfs_get_projid(ip) != fa->fsx_projid) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { olddquot = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } - ip->i_d.di_projid = fa->fsx_projid; + xfs_set_projid(ip, fa->fsx_projid); /* * We may have to rev the inode as well as diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 464bcc76e980..b3486dfa5520 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -164,7 +164,8 @@ xfs_ioctl32_bstat_copyin( get_user(bstat->bs_extsize, &bstat32->bs_extsize) || get_user(bstat->bs_extents, &bstat32->bs_extents) || get_user(bstat->bs_gen, &bstat32->bs_gen) || - get_user(bstat->bs_projid, &bstat32->bs_projid) || + get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) || + get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) || get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || get_user(bstat->bs_aextents, &bstat32->bs_aextents)) @@ -218,6 +219,7 @@ xfs_bulkstat_one_fmt_compat( put_user(buffer->bs_extents, &p32->bs_extents) || put_user(buffer->bs_gen, &p32->bs_gen) || put_user(buffer->bs_projid, &p32->bs_projid) || + put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) || put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || put_user(buffer->bs_dmstate, &p32->bs_dmstate) || put_user(buffer->bs_aextents, &p32->bs_aextents)) diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h index 1024c4f8ba0d..08b605792a99 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.h +++ b/fs/xfs/linux-2.6/xfs_ioctl32.h @@ -65,8 +65,10 @@ typedef struct compat_xfs_bstat { __s32 bs_extsize; /* extent size */ __s32 bs_extents; /* number of extents */ __u32 bs_gen; /* generation count */ - __u16 bs_projid; /* project id */ - unsigned char bs_pad[14]; /* pad space, unused */ + __u16 bs_projid_lo; /* lower part of project id */ +#define bs_projid bs_projid_lo /* (previously just bs_projid) */ + __u16 bs_projid_hi; /* high part of project id */ + unsigned char bs_pad[12]; /* pad space, unused */ __u32 bs_dmevmask; /* DMIG event mask */ __u16 bs_dmstate; /* DMIG state info */ __u16 bs_aextents; /* attribute number of extents */ diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 76ebc582dba0..214ddd71ff79 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -143,7 +143,7 @@ #define SYNCHRONIZE() barrier() #define __return_address __builtin_return_address(0) -#define dfltprid 0 +#define XFS_PROJID_DEFAULT 0 #define MAXPATHLEN 1024 #define MIN(a,b) (min(a,b)) diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index d109cc557bed..f8e854b4fde8 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -835,7 +835,7 @@ xfs_qm_dqattach_locked( xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, flags & XFS_QMOPT_DQALLOC, ip->i_udquot, &ip->i_gdquot) : - xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, + xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, flags & XFS_QMOPT_DQALLOC, ip->i_udquot, &ip->i_gdquot); /* @@ -1630,7 +1630,7 @@ xfs_qm_dqusage_adjust( } if (XFS_IS_PQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_projid, + error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip), XFS_DQ_PROJ, nblks, rtblks); if (error) goto error0; @@ -2249,7 +2249,7 @@ xfs_qm_vop_dqalloc( xfs_dqunlock(gq); } } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { - if (ip->i_d.di_projid != prid) { + if (xfs_get_projid(ip) != prid) { xfs_iunlock(ip, lockflags); if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, XFS_DQ_PROJ, @@ -2371,7 +2371,7 @@ xfs_qm_vop_chown_reserve( } if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { if (XFS_IS_PQUOTA_ON(ip->i_mount) && - ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id)) + xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id)) prjflags = XFS_QMOPT_ENOSPC; if (prjflags || @@ -2475,7 +2475,7 @@ xfs_qm_vop_create_dqattach( ip->i_gdquot = gdqp; ASSERT(XFS_IS_OQUOTA_ON(mp)); ASSERT((XFS_IS_GQUOTA_ON(mp) ? - ip->i_d.di_gid : ip->i_d.di_projid) == + ip->i_d.di_gid : xfs_get_projid(ip)) == be32_to_cpu(gdqp->q_core.d_id)); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index bea02d786c5d..45b5cb1788ab 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -81,7 +81,7 @@ xfs_qm_statvfs( xfs_mount_t *mp = ip->i_mount; xfs_dquot_t *dqp; - if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) { + if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) { xfs_fill_statvfs_from_dquot(statp, &dqp->q_core); xfs_qm_dqput(dqp); } diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 57847434fa51..bdebc183223e 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -1165,7 +1165,7 @@ xfs_qm_internalqcheck_adjust( } xfs_qm_internalqcheck_get_dquots(mp, (xfs_dqid_t) ip->i_d.di_uid, - (xfs_dqid_t) ip->i_d.di_projid, + (xfs_dqid_t) xfs_get_projid(ip), (xfs_dqid_t) ip->i_d.di_gid, &ud, &gd); if (XFS_IS_UQUOTA_ON(mp)) { diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index e5b153b2e6a3..dffba9ba0db6 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -49,8 +49,9 @@ typedef struct xfs_dinode { __be32 di_uid; /* owner's user id */ __be32 di_gid; /* owner's group id */ __be32 di_nlink; /* number of links to file */ - __be16 di_projid; /* owner's project id */ - __u8 di_pad[8]; /* unused, zeroed space */ + __be16 di_projid_lo; /* lower part of owner's project id */ + __be16 di_projid_hi; /* higher part owner's project id */ + __u8 di_pad[6]; /* unused, zeroed space */ __be16 di_flushiter; /* incremented on flush */ xfs_timestamp_t di_atime; /* time last accessed */ xfs_timestamp_t di_mtime; /* time last modified */ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 10f6093671b0..8f6fc1a96386 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -293,9 +293,11 @@ typedef struct xfs_bstat { __s32 bs_extsize; /* extent size */ __s32 bs_extents; /* number of extents */ __u32 bs_gen; /* generation count */ - __u16 bs_projid; /* project id */ + __u16 bs_projid_lo; /* lower part of project id */ +#define bs_projid bs_projid_lo /* (previously just bs_projid) */ __u16 bs_forkoff; /* inode fork offset in bytes */ - unsigned char bs_pad[12]; /* pad space, unused */ + __u16 bs_projid_hi; /* higher part of project id */ + unsigned char bs_pad[10]; /* pad space, unused */ __u32 bs_dmevmask; /* DMIG event mask */ __u16 bs_dmstate; /* DMIG state info */ __u16 bs_aextents; /* attribute number of extents */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 493d6b0cbef2..108c7a085f94 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -660,7 +660,8 @@ xfs_dinode_from_disk( to->di_uid = be32_to_cpu(from->di_uid); to->di_gid = be32_to_cpu(from->di_gid); to->di_nlink = be32_to_cpu(from->di_nlink); - to->di_projid = be16_to_cpu(from->di_projid); + to->di_projid_lo = be16_to_cpu(from->di_projid_lo); + to->di_projid_hi = be16_to_cpu(from->di_projid_hi); memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); to->di_flushiter = be16_to_cpu(from->di_flushiter); to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); @@ -695,7 +696,8 @@ xfs_dinode_to_disk( to->di_uid = cpu_to_be32(from->di_uid); to->di_gid = cpu_to_be32(from->di_gid); to->di_nlink = cpu_to_be32(from->di_nlink); - to->di_projid = cpu_to_be16(from->di_projid); + to->di_projid_lo = cpu_to_be16(from->di_projid_lo); + to->di_projid_hi = cpu_to_be16(from->di_projid_hi); memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); to->di_flushiter = cpu_to_be16(from->di_flushiter); to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); @@ -874,7 +876,7 @@ xfs_iread( if (ip->i_d.di_version == 1) { ip->i_d.di_nlink = ip->i_d.di_onlink; ip->i_d.di_onlink = 0; - ip->i_d.di_projid = 0; + xfs_set_projid(ip, 0); } ip->i_delayed_blks = 0; @@ -982,7 +984,7 @@ xfs_ialloc( mode_t mode, xfs_nlink_t nlink, xfs_dev_t rdev, - xfs_prid_t prid, + prid_t prid, int okalloc, xfs_buf_t **ialloc_context, boolean_t *call_again, @@ -1026,7 +1028,7 @@ xfs_ialloc( ASSERT(ip->i_d.di_nlink == nlink); ip->i_d.di_uid = current_fsuid(); ip->i_d.di_gid = current_fsgid(); - ip->i_d.di_projid = prid; + xfs_set_projid(ip, prid); memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); /* @@ -3007,7 +3009,7 @@ xfs_iflush_int( memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); memset(&(dip->di_pad[0]), 0, sizeof(dip->di_pad)); - ASSERT(ip->i_d.di_projid == 0); + ASSERT(xfs_get_projid(ip) == 0); } } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index ac82327ec9db..fac52290de90 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -134,8 +134,9 @@ typedef struct xfs_icdinode { __uint32_t di_uid; /* owner's user id */ __uint32_t di_gid; /* owner's group id */ __uint32_t di_nlink; /* number of links to file */ - __uint16_t di_projid; /* owner's project id */ - __uint8_t di_pad[8]; /* unused, zeroed space */ + __uint16_t di_projid_lo; /* lower part of owner's project id */ + __uint16_t di_projid_hi; /* higher part of owner's project id */ + __uint8_t di_pad[6]; /* unused, zeroed space */ __uint16_t di_flushiter; /* incremented on flush */ xfs_ictimestamp_t di_atime; /* time last accessed */ xfs_ictimestamp_t di_mtime; /* time last modified */ @@ -333,6 +334,25 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) return ret; } +/* + * Project quota id helpers (previously projid was 16bit only + * and using two 16bit values to hold new 32bit projid was choosen + * to retain compatibility with "old" filesystems). + */ +static inline prid_t +xfs_get_projid(struct xfs_inode *ip) +{ + return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo; +} + +static inline void +xfs_set_projid(struct xfs_inode *ip, + prid_t projid) +{ + ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16); + ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff); +} + /* * Manage the i_flush queue embedded in the inode. This completion * queue synchronizes processes attempting to flush the in-core @@ -455,7 +475,7 @@ void xfs_inode_free(struct xfs_inode *ip); * xfs_inode.c prototypes. */ int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, - xfs_nlink_t, xfs_dev_t, xfs_prid_t, int, + xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_buf **, boolean_t *, xfs_inode_t **); uint xfs_ip2xflags(struct xfs_inode *); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 7e3626e5925c..dc1882adaf54 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -92,7 +92,8 @@ xfs_bulkstat_one_int( * further change. */ buf->bs_nlink = dic->di_nlink; - buf->bs_projid = dic->di_projid; + buf->bs_projid_lo = dic->di_projid_lo; + buf->bs_projid_hi = dic->di_projid_hi; buf->bs_ino = ino; buf->bs_mode = dic->di_mode; buf->bs_uid = dic->di_uid; diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 9028733f7ed8..d2af0a8381a6 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -183,7 +183,7 @@ xfs_rename( * tree quota mechanism would be circumvented. */ if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { + (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { error = XFS_ERROR(EXDEV); goto error_return; } diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index 1b017c657494..1eb2ba586814 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -80,10 +80,12 @@ struct xfs_mount; #define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 #define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ #define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ +#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ #define XFS_SB_VERSION2_OKREALFBITS \ (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ - XFS_SB_VERSION2_ATTR2BIT) + XFS_SB_VERSION2_ATTR2BIT | \ + XFS_SB_VERSION2_PROJID32BIT) #define XFS_SB_VERSION2_OKSASHFBITS \ (0) #define XFS_SB_VERSION2_OKREALBITS \ @@ -495,6 +497,12 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; } +static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) +{ + return xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); +} + /* * end of superblock version macros */ diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 320775295e32..26d1867d8156 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -73,8 +73,6 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */ typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ -typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */ - typedef __uint32_t xlog_tid_t; /* transaction ID type */ /* diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index f4195f6b8bc4..8e4a63c4151a 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -114,7 +114,7 @@ xfs_setattr( */ ASSERT(udqp == NULL); ASSERT(gdqp == NULL); - code = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid, + code = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip), qflags, &udqp, &gdqp); if (code) return code; @@ -1268,7 +1268,7 @@ xfs_create( boolean_t unlock_dp_on_error = B_FALSE; uint cancel_flags; int committed; - xfs_prid_t prid; + prid_t prid; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; uint resblks; @@ -1281,9 +1281,9 @@ xfs_create( return XFS_ERROR(EIO); if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - prid = dp->i_d.di_projid; + prid = xfs_get_projid(dp); else - prid = dfltprid; + prid = XFS_PROJID_DEFAULT; /* * Make sure that we have allocated dquot(s) on disk. @@ -1882,7 +1882,7 @@ xfs_link( * the tree quota mechanism could be circumvented. */ if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (tdp->i_d.di_projid != sip->i_d.di_projid))) { + (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { error = XFS_ERROR(EXDEV); goto error_return; } @@ -1956,7 +1956,7 @@ xfs_symlink( int byte_cnt; int n; xfs_buf_t *bp; - xfs_prid_t prid; + prid_t prid; struct xfs_dquot *udqp, *gdqp; uint resblks; @@ -1979,9 +1979,9 @@ xfs_symlink( udqp = gdqp = NULL; if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - prid = dp->i_d.di_projid; + prid = xfs_get_projid(dp); else - prid = (xfs_prid_t)dfltprid; + prid = XFS_PROJID_DEFAULT; /* * Make sure that we have allocated dquot(s) on disk. From a731cd116c9334e01bcf3e676c0c621fe7de6ce4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Sep 2010 14:33:15 +0000 Subject: [PATCH 36/36] xfs: semaphore cleanup Get rid of init_MUTEX[_LOCKED]() and use sema_init() instead. (Ported to current XFS code by .) Signed-off-by: Thomas Gleixner Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 47ef97f46fe9..52785189212f 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -189,7 +189,7 @@ _xfs_buf_initialize( init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_list); RB_CLEAR_NODE(&bp->b_rbnode); - init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ + sema_init(&bp->b_sema, 0); /* held, no waiters */ XB_SET_OWNER(bp); bp->b_target = target; bp->b_file_offset = range_base;