From d757762bf2f6aea954745c76b4d767067b85be9d Mon Sep 17 00:00:00 2001 From: Donald Douwsma Date: Fri, 23 Nov 2007 16:27:42 +1100 Subject: [PATCH 1/8] [XFS] Fix dbflush panic in xfs_qm_sync. The recent behaviour layer removal dropped the check for quotas that have been requested at mount time but have subsequently been turned off. This results in a panic when accessing m_quotainfo which has been freed. This patch adds the check originally made by xfs_qm_syncall() to xfs_qm_sync(). SGI-PV: 969769 SGI-Modid: xfs-linux-melb:xfs-kern:29908a Signed-off-by: Donald Douwsma Signed-off-by: David Chinner Signed-off-by: Lachlan McIlroy --- fs/xfs/quota/xfs_qm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index b5f91281b707..d488645f833d 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -1008,6 +1008,9 @@ xfs_qm_sync( boolean_t nowait; int error; + if (! XFS_IS_QUOTA_ON(mp)) + return 0; + restarts = 0; /* * We won't block unless we are asked to. From cd57e594adc624dd9ee4c0ded3949da21ec24b2f Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Fri, 23 Nov 2007 16:30:32 +1100 Subject: [PATCH 2/8] [XFS] 971064 Various fixups for xfs_bulkstat(). - sanity check for NULL user buffer in xfs_ioc_bulkstat[_compat]() - remove the special case for XFS_IOC_FSBULKSTAT with count == 1. This special case causes bulkstat to fail because the special case uses xfs_bulkstat_single() instead of xfs_bulkstat() and the two functions have different semantics. xfs_bulkstat() will return the next inode after the one supplied while skipping internal inodes (ie quota inodes). xfs_bulkstate_single() will only lookup the inode supplied and return an error if it is an internal inode. - in xfs_bulkstat(), need to initialise 'lastino' to the inode supplied so in cases were we return without examining any inodes the scan wont restart back at zero. - sanity check for valid *ubcountp values. Cannot sanity check for valid ubuffer here because some users of xfs_bulkstat() don't supply a buffer. - checks against 'ubleft' (the space left in the user's buffer) should be against 'statstruct_size' which is the supplied minimum object size. The mixture of checks against statstruct_size and 0 was one of the reasons we were skipping inodes. - if the formatter function returns BULKSTAT_RV_NOTHING and an error and the error is not ENOENT or EINVAL then we need to abort the scan. ENOENT is for inodes that are no longer valid and we just skip them. EINVAL is returned if we try to lookup an internal inode so we skip them too. For a DMF scan if the inode and DMF attribute cannot fit into the space left in the user's buffer it would return ERANGE. We didn't handle this error and skipped the inode. We would continue to skip inodes until one fitted into the user's buffer or we completed the scan. - put back the recalculation of agino (that got removed with the last fix) at the end of the while loop. This is because the code at the start of the loop expects agino to be the last inode examined if it is non-zero. - if we found some inodes but then encountered an error, return success this time and the error next time. If the formatter aborted with ENOMEM we will now return this error but only if we couldn't read any inodes. Previously if we encountered ENOMEM without reading any inodes we returned a zero count and no error which falsely indicated the scan was complete. SGI-PV: 973431 SGI-Modid: xfs-linux-melb:xfs-kern:30089a Signed-off-by: Lachlan McIlroy Signed-off-by: David Chinner --- fs/xfs/linux-2.6/xfs_ioctl.c | 20 +++++++--------- fs/xfs/linux-2.6/xfs_ioctl32.c | 3 +++ fs/xfs/xfs_itable.c | 43 +++++++++++++++++++++++----------- 3 files changed, 40 insertions(+), 26 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 2b34bad48b07..98a56568bb24 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -1047,24 +1047,20 @@ xfs_ioc_bulkstat( if ((count = bulkreq.icount) <= 0) return -XFS_ERROR(EINVAL); + if (bulkreq.ubuffer == NULL) + return -XFS_ERROR(EINVAL); + if (cmd == XFS_IOC_FSINUMBERS) error = xfs_inumbers(mp, &inlast, &count, bulkreq.ubuffer, xfs_inumbers_fmt); else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) error = xfs_bulkstat_single(mp, &inlast, bulkreq.ubuffer, &done); - else { /* XFS_IOC_FSBULKSTAT */ - if (count == 1 && inlast != 0) { - inlast++; - error = xfs_bulkstat_single(mp, &inlast, - bulkreq.ubuffer, &done); - } else { - error = xfs_bulkstat(mp, &inlast, &count, - (bulkstat_one_pf)xfs_bulkstat_one, NULL, - sizeof(xfs_bstat_t), bulkreq.ubuffer, - BULKSTAT_FG_QUICK, &done); - } - } + else /* XFS_IOC_FSBULKSTAT */ + error = xfs_bulkstat(mp, &inlast, &count, + (bulkstat_one_pf)xfs_bulkstat_one, NULL, + sizeof(xfs_bstat_t), bulkreq.ubuffer, + BULKSTAT_FG_QUICK, &done); if (error) return -error; diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 0046bdd5b7f1..bf2a956b63c2 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -291,6 +291,9 @@ xfs_ioc_bulkstat_compat( if ((count = bulkreq.icount) <= 0) return -XFS_ERROR(EINVAL); + if (bulkreq.ubuffer == NULL) + return -XFS_ERROR(EINVAL); + if (cmd == XFS_IOC_FSINUMBERS) error = xfs_inumbers(mp, &inlast, &count, bulkreq.ubuffer, xfs_inumbers_fmt_compat); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 9972992fd3c3..9fc4c2886529 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -316,6 +316,8 @@ xfs_bulkstat_use_dinode( return 1; } +#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) + /* * Return stat information in bulk (by-inode) for the filesystem. */ @@ -353,7 +355,7 @@ xfs_bulkstat( xfs_inobt_rec_incore_t *irbp; /* current irec buffer pointer */ xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ - xfs_ino_t lastino=0; /* last inode number returned */ + xfs_ino_t lastino; /* last inode number returned */ int nbcluster; /* # of blocks in a cluster */ int nicluster; /* # of inodes in a cluster */ int nimask; /* mask for inode clusters */ @@ -373,6 +375,7 @@ xfs_bulkstat( * Get the last inode value, see if there's nothing to do. */ ino = (xfs_ino_t)*lastinop; + lastino = ino; dip = NULL; agno = XFS_INO_TO_AGNO(mp, ino); agino = XFS_INO_TO_AGINO(mp, ino); @@ -382,6 +385,9 @@ xfs_bulkstat( *ubcountp = 0; return 0; } + if (!ubcountp || *ubcountp <= 0) { + return EINVAL; + } ubcount = *ubcountp; /* statstruct's */ ubleft = ubcount * statstruct_size; /* bytes */ *ubcountp = ubelem = 0; @@ -402,7 +408,8 @@ xfs_bulkstat( * inode returned; 0 means start of the allocation group. */ rval = 0; - while (ubleft >= statstruct_size && agno < mp->m_sb.sb_agcount) { + while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { + cond_resched(); bp = NULL; down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); @@ -499,6 +506,7 @@ xfs_bulkstat( break; error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp); + cond_resched(); } /* * If ran off the end of the ag either with an error, @@ -542,6 +550,7 @@ xfs_bulkstat( */ agino = gino + XFS_INODES_PER_CHUNK; error = xfs_inobt_increment(cur, 0, &tmp); + cond_resched(); } /* * Drop the btree buffers and the agi buffer. @@ -555,12 +564,12 @@ xfs_bulkstat( */ irbufend = irbp; for (irbp = irbuf; - irbp < irbufend && ubleft >= statstruct_size; irbp++) { + irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) { /* * Now process this chunk of inodes. */ for (agino = irbp->ir_startino, chunkidx = clustidx = 0; - ubleft > 0 && + XFS_BULKSTAT_UBLEFT(ubleft) && irbp->ir_freecount < XFS_INODES_PER_CHUNK; chunkidx++, clustidx++, agino++) { ASSERT(chunkidx < XFS_INODES_PER_CHUNK); @@ -663,15 +672,13 @@ xfs_bulkstat( ubleft, private_data, bno, &ubused, dip, &fmterror); if (fmterror == BULKSTAT_RV_NOTHING) { - if (error == EFAULT) { - ubleft = 0; - rval = error; - break; - } - else if (error == ENOMEM) + if (error && error != ENOENT && + error != EINVAL) { ubleft = 0; - else - lastino = ino; + rval = error; + break; + } + lastino = ino; continue; } if (fmterror == BULKSTAT_RV_GIVEUP) { @@ -686,6 +693,8 @@ xfs_bulkstat( ubelem++; lastino = ino; } + + cond_resched(); } if (bp) @@ -694,11 +703,12 @@ xfs_bulkstat( /* * Set up for the next loop iteration. */ - if (ubleft > 0) { + if (XFS_BULKSTAT_UBLEFT(ubleft)) { if (end_of_ag) { agno++; agino = 0; - } + } else + agino = XFS_INO_TO_AGINO(mp, lastino); } else break; } @@ -707,6 +717,11 @@ xfs_bulkstat( */ kmem_free(irbuf, irbsize); *ubcountp = ubelem; + /* + * Found some inodes, return them now and return the error next time. + */ + if (ubelem) + rval = 0; if (agno >= mp->m_sb.sb_agcount) { /* * If we ran out of filesystem, mark lastino as off From d1afb678ce77b930334a8a640a05b8e68178a377 Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Tue, 27 Nov 2007 17:01:24 +1100 Subject: [PATCH 3/8] [XFS] Fixed a few bugs in xfs_buf_associate_memory() - calculation of 'page_count' was incorrect as it did not consider the offset of 'mem' into the first page. The logic to bump 'page_count' didn't work if 'len' was <= PAGE_CACHE_SIZE (ie offset = 3k, len = 2k). - setting b_buffer_length to 'len' is incorrect if 'offset' is > 0. Set it to the total length of the buffer. - I suspect that passing a non-aligned address into mem_to_page() for the first page may have been causing issues - don't know but just tidy up that code anyway. SGI-PV: 971596 SGI-Modid: xfs-linux-melb:xfs-kern:30143a Signed-off-by: Lachlan McIlroy Signed-off-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_buf.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index b9c8589e05c2..48bf477cbca5 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -725,15 +725,15 @@ xfs_buf_associate_memory( { int rval; int i = 0; - size_t ptr; - size_t end, end_cur; - off_t offset; + unsigned long pageaddr; + unsigned long offset; + size_t buflen; int page_count; - page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; - offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK); - if (offset && (len > PAGE_CACHE_SIZE)) - page_count++; + pageaddr = (unsigned long)mem & PAGE_CACHE_MASK; + offset = (unsigned long)mem - pageaddr; + buflen = PAGE_CACHE_ALIGN(len + offset); + page_count = buflen >> PAGE_CACHE_SHIFT; /* Free any previous set of page pointers */ if (bp->b_pages) @@ -747,22 +747,15 @@ xfs_buf_associate_memory( return rval; bp->b_offset = offset; - ptr = (size_t) mem & PAGE_CACHE_MASK; - end = PAGE_CACHE_ALIGN((size_t) mem + len); - end_cur = end; - /* set up first page */ - bp->b_pages[0] = mem_to_page(mem); - ptr += PAGE_CACHE_SIZE; - bp->b_page_count = ++i; - while (ptr < end) { - bp->b_pages[i] = mem_to_page((void *)ptr); - bp->b_page_count = ++i; - ptr += PAGE_CACHE_SIZE; + for (i = 0; i < bp->b_page_count; i++) { + bp->b_pages[i] = mem_to_page((void *)pageaddr); + pageaddr += PAGE_CACHE_SIZE; } bp->b_locked = 0; - bp->b_count_desired = bp->b_buffer_length = len; + bp->b_count_desired = len; + bp->b_buffer_length = buflen; bp->b_flags |= XBF_MAPPED; return 0; From 77be55a5a13d9c7ddf780a93861f2fba33f8be1a Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Fri, 23 Nov 2007 16:31:00 +1100 Subject: [PATCH 4/8] [XFS] Clear XBF_READ_AHEAD flag on I/O completion. SGI-PV: 972554 SGI-Modid: xfs-linux-melb:xfs-kern:30128a Signed-off-by: Lachlan McIlroy Signed-off-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 48bf477cbca5..43d6c7a290e2 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1025,7 +1025,7 @@ xfs_buf_ioend( xfs_buf_t *bp, int schedule) { - bp->b_flags &= ~(XBF_READ | XBF_WRITE); + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); if (bp->b_error == 0) bp->b_flags |= XBF_DONE; From a7430847fcb19297d6db833f35b9c9645c4a6395 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Fri, 23 Nov 2007 16:30:23 +1100 Subject: [PATCH 5/8] [XFS] Fix broken inode cluster setup. The radix tree based inode caches did away with the inode cluster hashes, replacing them with a bunch of masking and gang lookups on the radix tree. This masking got broken when moving the code to per-ag radix trees and indexing by agino # rather than straight inode number. The result is clustered inode writeback does not cluster and things can go extremely slowly when there are lots of inodes to write. Fix it up by comparing the agino # of the inode we just looked up to the index of the cluster we are looking for. Tested-by: Torsten Kaiser SGI-PV: 972915 SGI-Modid: xfs-linux-melb:xfs-kern:30033a Signed-off-by: David Chinner Signed-off-by: Lachlan McIlroy --- fs/xfs/xfs_iget.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 488836e204a3..fb69ef180b27 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -267,7 +267,7 @@ finish_inode: icl = NULL; if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq, first_index, 1)) { - if ((iq->i_ino & mask) == first_index) + if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) == first_index) icl = iq->i_cluster; } From e89bc612d61edbcefaeb6f2244f86c0f3ec89d23 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Dec 2007 14:07:53 +1100 Subject: [PATCH 6/8] [XFS] revert to double-buffering readdir The current readdir implementation deadlocks on a btree buffers locks because nfsd calls back into ->lookup from the filldir callback. The only short-term fix for this is to revert to the old inefficient double-buffering scheme. SGI-PV: 973377 SGI-Modid: xfs-linux-melb:xfs-kern:30201a Signed-off-by: Christoph Hellwig Signed-off-by: David Chinner Signed-off-by: Lachlan McIlroy --- fs/xfs/linux-2.6/xfs_file.c | 124 ++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index fb8dd34041eb..54c564693d93 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -218,6 +218,15 @@ xfs_vm_fault( } #endif /* CONFIG_XFS_DMAPI */ +/* + * Unfortunately we can't just use the clean and simple readdir implementation + * below, because nfs might call back into ->lookup from the filldir callback + * and that will deadlock the low-level btree code. + * + * Hopefully we'll find a better workaround that allows to use the optimal + * version at least for local readdirs for 2.6.25. + */ +#if 0 STATIC int xfs_file_readdir( struct file *filp, @@ -249,6 +258,121 @@ xfs_file_readdir( return -error; return 0; } +#else + +struct hack_dirent { + int namlen; + loff_t offset; + u64 ino; + unsigned int d_type; + char name[]; +}; + +struct hack_callback { + char *dirent; + size_t len; + size_t used; +}; + +STATIC int +xfs_hack_filldir( + void *__buf, + const char *name, + int namlen, + loff_t offset, + u64 ino, + unsigned int d_type) +{ + struct hack_callback *buf = __buf; + struct hack_dirent *de = (struct hack_dirent *)(buf->dirent + buf->used); + + if (buf->used + sizeof(struct hack_dirent) + namlen > buf->len) + return -EINVAL; + + de->namlen = namlen; + de->offset = offset; + de->ino = ino; + de->d_type = d_type; + memcpy(de->name, name, namlen); + buf->used += sizeof(struct hack_dirent) + namlen; + return 0; +} + +STATIC int +xfs_file_readdir( + struct file *filp, + void *dirent, + filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + xfs_inode_t *ip = XFS_I(inode); + struct hack_callback buf; + struct hack_dirent *de; + int error; + loff_t size; + int eof = 0; + xfs_off_t start_offset, curr_offset, offset; + + /* + * Try fairly hard to get memory + */ + buf.len = PAGE_CACHE_SIZE; + do { + buf.dirent = kmalloc(buf.len, GFP_KERNEL); + if (buf.dirent) + break; + buf.len >>= 1; + } while (buf.len >= 1024); + + if (!buf.dirent) + return -ENOMEM; + + curr_offset = filp->f_pos; + if (curr_offset == 0x7fffffff) + offset = 0xffffffff; + else + offset = filp->f_pos; + + while (!eof) { + int reclen; + start_offset = offset; + + buf.used = 0; + error = -xfs_readdir(ip, &buf, buf.len, &offset, + xfs_hack_filldir); + if (error || offset == start_offset) { + size = 0; + break; + } + + size = buf.used; + de = (struct hack_dirent *)buf.dirent; + while (size > 0) { + if (filldir(dirent, de->name, de->namlen, + curr_offset & 0x7fffffff, + de->ino, de->d_type)) { + goto done; + } + + reclen = sizeof(struct hack_dirent) + de->namlen; + size -= reclen; + curr_offset = de->offset /* & 0x7fffffff */; + de = (struct hack_dirent *)((char *)de + reclen); + } + } + + done: + if (!error) { + if (size == 0) + filp->f_pos = offset & 0x7fffffff; + else if (de) + filp->f_pos = curr_offset; + } + + kfree(buf.dirent); + return error; +} +#endif STATIC int xfs_file_mmap( From 978c7b2ff49597ab76ff7529a933bd366941ac25 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 7 Dec 2007 14:09:02 +1100 Subject: [PATCH 7/8] [XFS] Make xfsbufd threads freezable Fix breakage caused by commit 831441862956fffa17b9801db37e6ea1650b0f69 that did not introduce the necessary call to set_freezable() in xfs/linux-2.6/xfs_buf.c . SGI-PV: 974224 SGI-Modid: xfs-linux-melb:xfs-kern:30203a Signed-off-by: Rafael J. Wysocki Signed-off-by: David Chinner Signed-off-by: Lachlan McIlroy --- fs/xfs/linux-2.6/xfs_buf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 43d6c7a290e2..a49dd8d4b069 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1743,6 +1743,8 @@ xfsbufd( current->flags |= PF_MEMALLOC; + set_freezable(); + do { if (unlikely(freezing(current))) { set_bit(XBT_FORCE_SLEEP, &target->bt_flags); From cf10e82bdc0d38d09dfaf46d0daf56136138ef3f Mon Sep 17 00:00:00 2001 From: David Chinner Date: Fri, 7 Dec 2007 14:09:11 +1100 Subject: [PATCH 8/8] [XFS] Fix xfs_ichgtime()s broken usage of I_SYNC The recent I_LOCK->I_SYNC changes mistakenly changed xfs_ichgtime to look at I_SYNC instead of I_LOCK. This was incorrect and prevents newly created inodes from moving to the dirty list. Change this to the correct check which is for I_NEW, not I_LOCK or I_SYNC so that behaviour is correct. SGI-PV: 974225 SGI-Modid: xfs-linux-melb:xfs-kern:30204a Signed-off-by: David Chinner Signed-off-by: Lachlan McIlroy --- fs/xfs/linux-2.6/xfs_iops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index ac50f8a37582..37e116779eb1 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -117,7 +117,7 @@ xfs_ichgtime( */ SYNCHRONIZE(); ip->i_update_core = 1; - if (!(inode->i_state & I_SYNC)) + if (!(inode->i_state & I_NEW)) mark_inode_dirty_sync(inode); } @@ -169,7 +169,7 @@ xfs_ichgtime_fast( */ SYNCHRONIZE(); ip->i_update_core = 1; - if (!(inode->i_state & I_SYNC)) + if (!(inode->i_state & I_NEW)) mark_inode_dirty_sync(inode); }