[XFS] Remove the xfs_icluster structure

Remove the xfs_icluster structure and replace with a radix tree lookup.

We don't need to keep a list of inodes in each cluster around anymore as
we can look them up quickly when we need to. The only time we need to do
this now is during inode writeback.

Factor the inode cluster writeback code out of xfs_iflush and convert it
to use radix_tree_gang_lookup() instead of walking a list of inodes built
when we first read in the inodes.

This remove 3 pointers from each xfs_inode structure and the xfs_icluster
structure per inode cluster. Hence we reduce the cache footprint of the
xfs_inodes by between 5-10% depending on cluster sparseness.

To be truly efficient we need a radix_tree_gang_lookup_range() call to
stop searching once we are past the end of the cluster instead of trying
to find a full cluster's worth of inodes.

Before (ia64):

$ cat /sys/slab/xfs_inode/object_size 536

After:

$ cat /sys/slab/xfs_inode/object_size 512

SGI-PV: 977460
SGI-Modid: xfs-linux-melb:xfs-kern:30502a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
This commit is contained in:
David Chinner 2008-03-06 13:43:49 +11:00 committed by Lachlan McIlroy
parent a3f74ffb6d
commit bad5584332
4 changed files with 156 additions and 182 deletions

View File

@ -78,7 +78,6 @@ xfs_iget_core(
xfs_inode_t *ip; xfs_inode_t *ip;
xfs_inode_t *iq; xfs_inode_t *iq;
int error; int error;
xfs_icluster_t *icl, *new_icl = NULL;
unsigned long first_index, mask; unsigned long first_index, mask;
xfs_perag_t *pag; xfs_perag_t *pag;
xfs_agino_t agino; xfs_agino_t agino;
@ -229,11 +228,9 @@ finish_inode:
} }
/* /*
* This is a bit messy - we preallocate everything we _might_ * Preload the radix tree so we can insert safely under the
* need before we pick up the ici lock. That way we don't have to * write spinlock.
* juggle locks and go all the way back to the start.
*/ */
new_icl = kmem_zone_alloc(xfs_icluster_zone, KM_SLEEP);
if (radix_tree_preload(GFP_KERNEL)) { if (radix_tree_preload(GFP_KERNEL)) {
xfs_idestroy(ip); xfs_idestroy(ip);
delay(1); delay(1);
@ -242,17 +239,6 @@ finish_inode:
mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
first_index = agino & mask; first_index = agino & mask;
write_lock(&pag->pag_ici_lock); write_lock(&pag->pag_ici_lock);
/*
* Find the cluster if it exists
*/
icl = NULL;
if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq,
first_index, 1)) {
if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) == first_index)
icl = iq->i_cluster;
}
/* /*
* insert the new inode * insert the new inode
*/ */
@ -267,30 +253,13 @@ finish_inode:
} }
/* /*
* These values _must_ be set before releasing ihlock! * These values _must_ be set before releasing the radix tree lock!
*/ */
ip->i_udquot = ip->i_gdquot = NULL; ip->i_udquot = ip->i_gdquot = NULL;
xfs_iflags_set(ip, XFS_INEW); xfs_iflags_set(ip, XFS_INEW);
ASSERT(ip->i_cluster == NULL);
if (!icl) {
spin_lock_init(&new_icl->icl_lock);
INIT_HLIST_HEAD(&new_icl->icl_inodes);
icl = new_icl;
new_icl = NULL;
} else {
ASSERT(!hlist_empty(&icl->icl_inodes));
}
spin_lock(&icl->icl_lock);
hlist_add_head(&ip->i_cnode, &icl->icl_inodes);
ip->i_cluster = icl;
spin_unlock(&icl->icl_lock);
write_unlock(&pag->pag_ici_lock); write_unlock(&pag->pag_ici_lock);
radix_tree_preload_end(); radix_tree_preload_end();
if (new_icl)
kmem_zone_free(xfs_icluster_zone, new_icl);
/* /*
* Link ip to its mount and thread it on the mount's inode list. * Link ip to its mount and thread it on the mount's inode list.
@ -528,18 +497,6 @@ xfs_iextract(
write_unlock(&pag->pag_ici_lock); write_unlock(&pag->pag_ici_lock);
xfs_put_perag(mp, pag); xfs_put_perag(mp, pag);
/*
* Remove from cluster list
*/
mp = ip->i_mount;
spin_lock(&ip->i_cluster->icl_lock);
hlist_del(&ip->i_cnode);
spin_unlock(&ip->i_cluster->icl_lock);
/* was last inode in cluster? */
if (hlist_empty(&ip->i_cluster->icl_inodes))
kmem_zone_free(xfs_icluster_zone, ip->i_cluster);
/* /*
* Remove from mount's inode list. * Remove from mount's inode list.
*/ */

View File

@ -55,7 +55,6 @@
kmem_zone_t *xfs_ifork_zone; kmem_zone_t *xfs_ifork_zone;
kmem_zone_t *xfs_inode_zone; kmem_zone_t *xfs_inode_zone;
kmem_zone_t *xfs_icluster_zone;
/* /*
* Used in xfs_itruncate(). This is the maximum number of extents * Used in xfs_itruncate(). This is the maximum number of extents
@ -2994,6 +2993,153 @@ xfs_iflush_fork(
return 0; return 0;
} }
STATIC int
xfs_iflush_cluster(
xfs_inode_t *ip,
xfs_buf_t *bp)
{
xfs_mount_t *mp = ip->i_mount;
xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
unsigned long first_index, mask;
int ilist_size;
xfs_inode_t **ilist;
xfs_inode_t *iq;
xfs_inode_log_item_t *iip;
int nr_found;
int clcount = 0;
int bufwasdelwri;
int i;
ASSERT(pag->pagi_inodeok);
ASSERT(pag->pag_ici_init);
ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *);
ilist = kmem_alloc(ilist_size, KM_MAYFAIL);
if (!ilist)
return 0;
mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
read_lock(&pag->pag_ici_lock);
/* really need a gang lookup range call here */
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
first_index,
XFS_INODE_CLUSTER_SIZE(mp));
if (nr_found == 0)
goto out_free;
for (i = 0; i < nr_found; i++) {
iq = ilist[i];
if (iq == ip)
continue;
/* if the inode lies outside this cluster, we're done. */
if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
break;
/*
* Do an un-protected check to see if the inode is dirty and
* is a candidate for flushing. These checks will be repeated
* later after the appropriate locks are acquired.
*/
iip = iq->i_itemp;
if ((iq->i_update_core == 0) &&
((iip == NULL) ||
!(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
xfs_ipincount(iq) == 0) {
continue;
}
/*
* Try to get locks. If any are unavailable or it is pinned,
* then this inode cannot be flushed and is skipped.
*/
if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
continue;
if (!xfs_iflock_nowait(iq)) {
xfs_iunlock(iq, XFS_ILOCK_SHARED);
continue;
}
if (xfs_ipincount(iq)) {
xfs_ifunlock(iq);
xfs_iunlock(iq, XFS_ILOCK_SHARED);
continue;
}
/*
* arriving here means that this inode can be flushed. First
* re-check that it's dirty before flushing.
*/
iip = iq->i_itemp;
if ((iq->i_update_core != 0) || ((iip != NULL) &&
(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
int error;
error = xfs_iflush_int(iq, bp);
if (error) {
xfs_iunlock(iq, XFS_ILOCK_SHARED);
goto cluster_corrupt_out;
}
clcount++;
} else {
xfs_ifunlock(iq);
}
xfs_iunlock(iq, XFS_ILOCK_SHARED);
}
if (clcount) {
XFS_STATS_INC(xs_icluster_flushcnt);
XFS_STATS_ADD(xs_icluster_flushinode, clcount);
}
out_free:
read_unlock(&pag->pag_ici_lock);
kmem_free(ilist, ilist_size);
return 0;
cluster_corrupt_out:
/*
* Corruption detected in the clustering loop. Invalidate the
* inode buffer and shut down the filesystem.
*/
read_unlock(&pag->pag_ici_lock);
/*
* Clean up the buffer. If it was B_DELWRI, just release it --
* brelse can handle it with no problems. If not, shut down the
* filesystem before releasing the buffer.
*/
bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp);
if (bufwasdelwri)
xfs_buf_relse(bp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
if (!bufwasdelwri) {
/*
* Just like incore_relse: if we have b_iodone functions,
* mark the buffer as an error and call them. Otherwise
* mark it as stale and brelse.
*/
if (XFS_BUF_IODONE_FUNC(bp)) {
XFS_BUF_CLR_BDSTRAT_FUNC(bp);
XFS_BUF_UNDONE(bp);
XFS_BUF_STALE(bp);
XFS_BUF_SHUT(bp);
XFS_BUF_ERROR(bp,EIO);
xfs_biodone(bp);
} else {
XFS_BUF_STALE(bp);
xfs_buf_relse(bp);
}
}
/*
* Unlocks the flush lock
*/
xfs_iflush_abort(iq);
kmem_free(ilist, ilist_size);
return XFS_ERROR(EFSCORRUPTED);
}
/* /*
* xfs_iflush() will write a modified inode's changes out to the * xfs_iflush() will write a modified inode's changes out to the
* inode's on disk home. The caller must have the inode lock held * inode's on disk home. The caller must have the inode lock held
@ -3013,13 +3159,8 @@ xfs_iflush(
xfs_dinode_t *dip; xfs_dinode_t *dip;
xfs_mount_t *mp; xfs_mount_t *mp;
int error; int error;
/* REFERENCED */
xfs_inode_t *iq;
int clcount; /* count of inodes clustered */
int bufwasdelwri;
struct hlist_node *entry;
enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK); int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
XFS_STATS_INC(xs_iflush_count); XFS_STATS_INC(xs_iflush_count);
@ -3138,9 +3279,8 @@ xfs_iflush(
* First flush out the inode that xfs_iflush was called with. * First flush out the inode that xfs_iflush was called with.
*/ */
error = xfs_iflush_int(ip, bp); error = xfs_iflush_int(ip, bp);
if (error) { if (error)
goto corrupt_out; goto corrupt_out;
}
/* /*
* If the buffer is pinned then push on the log now so we won't * If the buffer is pinned then push on the log now so we won't
@ -3153,70 +3293,9 @@ xfs_iflush(
* inode clustering: * inode clustering:
* see if other inodes can be gathered into this write * see if other inodes can be gathered into this write
*/ */
spin_lock(&ip->i_cluster->icl_lock); error = xfs_iflush_cluster(ip, bp);
ip->i_cluster->icl_buf = bp; if (error)
goto cluster_corrupt_out;
clcount = 0;
hlist_for_each_entry(iq, entry, &ip->i_cluster->icl_inodes, i_cnode) {
if (iq == ip)
continue;
/*
* Do an un-protected check to see if the inode is dirty and
* is a candidate for flushing. These checks will be repeated
* later after the appropriate locks are acquired.
*/
iip = iq->i_itemp;
if ((iq->i_update_core == 0) &&
((iip == NULL) ||
!(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
xfs_ipincount(iq) == 0) {
continue;
}
/*
* Try to get locks. If any are unavailable,
* then this inode cannot be flushed and is skipped.
*/
/* get inode locks (just i_lock) */
if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) {
/* get inode flush lock */
if (xfs_iflock_nowait(iq)) {
/* check if pinned */
if (xfs_ipincount(iq) == 0) {
/* arriving here means that
* this inode can be flushed.
* first re-check that it's
* dirty
*/
iip = iq->i_itemp;
if ((iq->i_update_core != 0)||
((iip != NULL) &&
(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
clcount++;
error = xfs_iflush_int(iq, bp);
if (error) {
xfs_iunlock(iq,
XFS_ILOCK_SHARED);
goto cluster_corrupt_out;
}
} else {
xfs_ifunlock(iq);
}
} else {
xfs_ifunlock(iq);
}
}
xfs_iunlock(iq, XFS_ILOCK_SHARED);
}
}
spin_unlock(&ip->i_cluster->icl_lock);
if (clcount) {
XFS_STATS_INC(xs_icluster_flushcnt);
XFS_STATS_ADD(xs_icluster_flushinode, clcount);
}
if (flags & INT_DELWRI) { if (flags & INT_DELWRI) {
xfs_bdwrite(mp, bp); xfs_bdwrite(mp, bp);
@ -3230,52 +3309,11 @@ xfs_iflush(
corrupt_out: corrupt_out:
xfs_buf_relse(bp); xfs_buf_relse(bp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
xfs_iflush_abort(ip);
/*
* Unlocks the flush lock
*/
return XFS_ERROR(EFSCORRUPTED);
cluster_corrupt_out: cluster_corrupt_out:
/* Corruption detected in the clustering loop. Invalidate the
* inode buffer and shut down the filesystem.
*/
spin_unlock(&ip->i_cluster->icl_lock);
/*
* Clean up the buffer. If it was B_DELWRI, just release it --
* brelse can handle it with no problems. If not, shut down the
* filesystem before releasing the buffer.
*/
if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) {
xfs_buf_relse(bp);
}
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
if(!bufwasdelwri) {
/*
* Just like incore_relse: if we have b_iodone functions,
* mark the buffer as an error and call them. Otherwise
* mark it as stale and brelse.
*/
if (XFS_BUF_IODONE_FUNC(bp)) {
XFS_BUF_CLR_BDSTRAT_FUNC(bp);
XFS_BUF_UNDONE(bp);
XFS_BUF_STALE(bp);
XFS_BUF_SHUT(bp);
XFS_BUF_ERROR(bp,EIO);
xfs_biodone(bp);
} else {
XFS_BUF_STALE(bp);
xfs_buf_relse(bp);
}
}
xfs_iflush_abort(iq);
/* /*
* Unlocks the flush lock * Unlocks the flush lock
*/ */
xfs_iflush_abort(ip);
return XFS_ERROR(EFSCORRUPTED); return XFS_ERROR(EFSCORRUPTED);
} }

View File

@ -132,19 +132,6 @@ typedef struct dm_attrs_s {
__uint16_t da_pad; /* DMIG extra padding */ __uint16_t da_pad; /* DMIG extra padding */
} dm_attrs_t; } dm_attrs_t;
/*
* This is the xfs inode cluster structure. This structure is used by
* xfs_iflush to find inodes that share a cluster and can be flushed to disk at
* the same time.
*/
typedef struct xfs_icluster {
struct hlist_head icl_inodes; /* list of inodes on cluster */
xfs_daddr_t icl_blkno; /* starting block number of
* the cluster */
struct xfs_buf *icl_buf; /* the inode buffer */
spinlock_t icl_lock; /* inode list lock */
} xfs_icluster_t;
/* /*
* This is the xfs in-core inode structure. * This is the xfs in-core inode structure.
* Most of the on-disk inode is embedded in the i_d field. * Most of the on-disk inode is embedded in the i_d field.
@ -248,8 +235,6 @@ typedef struct xfs_inode {
unsigned int i_delayed_blks; /* count of delay alloc blks */ unsigned int i_delayed_blks; /* count of delay alloc blks */
xfs_icdinode_t i_d; /* most of ondisk inode */ xfs_icdinode_t i_d; /* most of ondisk inode */
xfs_icluster_t *i_cluster; /* cluster list header */
struct hlist_node i_cnode; /* cluster link node */
xfs_fsize_t i_size; /* in-memory size */ xfs_fsize_t i_size; /* in-memory size */
xfs_fsize_t i_new_size; /* size when write completes */ xfs_fsize_t i_new_size; /* size when write completes */
@ -594,7 +579,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
#define xfs_inobp_check(mp, bp) #define xfs_inobp_check(mp, bp)
#endif /* DEBUG */ #endif /* DEBUG */
extern struct kmem_zone *xfs_icluster_zone;
extern struct kmem_zone *xfs_ifork_zone; extern struct kmem_zone *xfs_ifork_zone;
extern struct kmem_zone *xfs_inode_zone; extern struct kmem_zone *xfs_inode_zone;
extern struct kmem_zone *xfs_ili_zone; extern struct kmem_zone *xfs_ili_zone;

View File

@ -112,9 +112,6 @@ xfs_init(void)
xfs_ili_zone = xfs_ili_zone =
kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
KM_ZONE_SPREAD, NULL); KM_ZONE_SPREAD, NULL);
xfs_icluster_zone =
kmem_zone_init_flags(sizeof(xfs_icluster_t), "xfs_icluster",
KM_ZONE_SPREAD, NULL);
/* /*
* Allocate global trace buffers. * Allocate global trace buffers.
@ -152,7 +149,6 @@ xfs_cleanup(void)
extern kmem_zone_t *xfs_inode_zone; extern kmem_zone_t *xfs_inode_zone;
extern kmem_zone_t *xfs_efd_zone; extern kmem_zone_t *xfs_efd_zone;
extern kmem_zone_t *xfs_efi_zone; extern kmem_zone_t *xfs_efi_zone;
extern kmem_zone_t *xfs_icluster_zone;
xfs_cleanup_procfs(); xfs_cleanup_procfs();
xfs_sysctl_unregister(); xfs_sysctl_unregister();
@ -187,7 +183,6 @@ xfs_cleanup(void)
kmem_zone_destroy(xfs_efi_zone); kmem_zone_destroy(xfs_efi_zone);
kmem_zone_destroy(xfs_ifork_zone); kmem_zone_destroy(xfs_ifork_zone);
kmem_zone_destroy(xfs_ili_zone); kmem_zone_destroy(xfs_ili_zone);
kmem_zone_destroy(xfs_icluster_zone);
} }
/* /*