xfs: fix rmap btree key flag handling [v24.5]

This series fixes numerous flag handling bugs in the rmapbt key code.
 The most serious transgression is that key comparisons completely strip
 out all flag bits from rm_offset, including the ones that participate in
 record lookups.  The second problem is that for years we've been letting
 the unwritten flag (which is an attribute of a specific record and not
 part of the record key) escape from leaf records into key records.
 
 The solution to the second problem is to filter attribute flags when
 creating keys from records, and the solution to the first problem is to
 preserve *only* the flags used for key lookups.  The ATTR and BMBT flags
 are a part of the lookup key, and the UNWRITTEN flag is a record
 attribute.
 
 This has worked for years without generating user complaints because
 ATTR and BMBT extents cannot be shared, so key comparisons succeed
 solely on rm_startblock.  Only file data fork extents can be shared, and
 those records never set any of the three flag bits, so comparisons that
 dig into rm_owner and rm_offset work just fine.
 
 A filesystem written with an unpatched kernel and mounted on a patched
 kernel will work correctly because the ATTR/BMBT flags have been
 conveyed into keys correctly all along, and we still ignore the
 UNWRITTEN flag in any key record.  This was what doomed my previous
 attempt to correct this problem in 2019.
 
 A filesystem written with a patched kernel and mounted on an unpatched
 kernel will also work correctly because unpatched kernels ignore all
 flags.
 
 With this patchset applied, the scrub code gains the ability to detect
 rmap btrees with incorrectly set attr and bmbt flags in the key records.
 After three years of testing, I haven't encountered any problems.
 
 Signed-off-by: Darrick J. Wong <djwong@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQQ2qTKExjcn+O1o2YRKO3ySh0YRpgUCZDYdoQAKCRBKO3ySh0YR
 prUmAP9WiaLPxeMAnQiQcaZyqyAhaiqbwNoLkDMx0+1+SKDPCwD7BU6tPQpT039i
 mrDag3g2x4N7g/e89N29SQp8EDGuQQQ=
 =Chkt
 -----END PGP SIGNATURE-----

Merge tag 'rmap-btree-fix-key-handling-6.4_2023-04-11' of git://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into guilt/xfs-for-next

xfs: fix rmap btree key flag handling [v24.5]

This series fixes numerous flag handling bugs in the rmapbt key code.
The most serious transgression is that key comparisons completely strip
out all flag bits from rm_offset, including the ones that participate in
record lookups.  The second problem is that for years we've been letting
the unwritten flag (which is an attribute of a specific record and not
part of the record key) escape from leaf records into key records.

The solution to the second problem is to filter attribute flags when
creating keys from records, and the solution to the first problem is to
preserve *only* the flags used for key lookups.  The ATTR and BMBT flags
are a part of the lookup key, and the UNWRITTEN flag is a record
attribute.

This has worked for years without generating user complaints because
ATTR and BMBT extents cannot be shared, so key comparisons succeed
solely on rm_startblock.  Only file data fork extents can be shared, and
those records never set any of the three flag bits, so comparisons that
dig into rm_owner and rm_offset work just fine.

A filesystem written with an unpatched kernel and mounted on a patched
kernel will work correctly because the ATTR/BMBT flags have been
conveyed into keys correctly all along, and we still ignore the
UNWRITTEN flag in any key record.  This was what doomed my previous
attempt to correct this problem in 2019.

A filesystem written with a patched kernel and mounted on an unpatched
kernel will also work correctly because unpatched kernels ignore all
flags.

With this patchset applied, the scrub code gains the ability to detect
rmap btrees with incorrectly set attr and bmbt flags in the key records.
After three years of testing, I haven't encountered any problems.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
This commit is contained in:
Dave Chinner 2023-04-14 07:09:36 +10:00 committed by Dave Chinner
commit 1ee7550532
4 changed files with 95 additions and 10 deletions

View File

@ -156,6 +156,16 @@ xfs_rmapbt_get_maxrecs(
return cur->bc_mp->m_rmap_mxr[level != 0];
}
/*
* Convert the ondisk record's offset field into the ondisk key's offset field.
* Fork and bmbt are significant parts of the rmap record key, but written
* status is merely a record attribute.
*/
static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
{
return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
}
STATIC void
xfs_rmapbt_init_key_from_rec(
union xfs_btree_key *key,
@ -163,7 +173,7 @@ xfs_rmapbt_init_key_from_rec(
{
key->rmap.rm_startblock = rec->rmap.rm_startblock;
key->rmap.rm_owner = rec->rmap.rm_owner;
key->rmap.rm_offset = rec->rmap.rm_offset;
key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
}
/*
@ -186,7 +196,7 @@ xfs_rmapbt_init_high_key_from_rec(
key->rmap.rm_startblock = rec->rmap.rm_startblock;
be32_add_cpu(&key->rmap.rm_startblock, adj);
key->rmap.rm_owner = rec->rmap.rm_owner;
key->rmap.rm_offset = rec->rmap.rm_offset;
key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
return;
@ -219,6 +229,16 @@ xfs_rmapbt_init_ptr_from_cur(
ptr->s = agf->agf_roots[cur->bc_btnum];
}
/*
* Mask the appropriate parts of the ondisk key field for a key comparison.
* Fork and bmbt are significant parts of the rmap record key, but written
* status is merely a record attribute.
*/
static inline uint64_t offset_keymask(uint64_t offset)
{
return offset & ~XFS_RMAP_OFF_UNWRITTEN;
}
STATIC int64_t
xfs_rmapbt_key_diff(
struct xfs_btree_cur *cur,
@ -240,8 +260,8 @@ xfs_rmapbt_key_diff(
else if (y > x)
return -1;
x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
y = rec->rm_offset;
x = offset_keymask(be64_to_cpu(kp->rm_offset));
y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
if (x > y)
return 1;
else if (y > x)
@ -272,8 +292,8 @@ xfs_rmapbt_diff_two_keys(
else if (y > x)
return -1;
x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset));
y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset));
x = offset_keymask(be64_to_cpu(kp1->rm_offset));
y = offset_keymask(be64_to_cpu(kp2->rm_offset));
if (x > y)
return 1;
else if (y > x)
@ -387,8 +407,8 @@ xfs_rmapbt_keys_inorder(
return 1;
else if (a > b)
return 0;
a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
if (a <= b)
return 1;
return 0;
@ -417,8 +437,8 @@ xfs_rmapbt_recs_inorder(
return 1;
else if (a > b)
return 0;
a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
if (a <= b)
return 1;
return 0;

View File

@ -119,6 +119,16 @@ xchk_btree_xref_set_corrupt(
__return_address);
}
void
xchk_btree_set_preen(
struct xfs_scrub *sc,
struct xfs_btree_cur *cur,
int level)
{
__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_PREEN,
__return_address);
}
/*
* Make sure this record is in order and doesn't stray outside of the parent
* keys.

View File

@ -19,6 +19,8 @@ bool xchk_btree_xref_process_error(struct xfs_scrub *sc,
/* Check for btree corruption. */
void xchk_btree_set_corrupt(struct xfs_scrub *sc,
struct xfs_btree_cur *cur, int level);
void xchk_btree_set_preen(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
int level);
/* Check for btree xref discrepancies. */
void xchk_btree_xref_set_corrupt(struct xfs_scrub *sc,

View File

@ -87,6 +87,58 @@ xchk_rmapbt_xref(
xchk_rmapbt_xref_refc(sc, irec);
}
/*
* Check for bogus UNWRITTEN flags in the rmapbt node block keys.
*
* In reverse mapping records, the file mapping extent state
* (XFS_RMAP_OFF_UNWRITTEN) is a record attribute, not a key field. It is not
* involved in lookups in any way. In older kernels, the functions that
* convert rmapbt records to keys forgot to filter out the extent state bit,
* even though the key comparison functions have filtered the flag correctly.
* If we spot an rmap key with the unwritten bit set in rm_offset, we should
* mark the btree as needing optimization to rebuild the btree without those
* flags.
*/
STATIC void
xchk_rmapbt_check_unwritten_in_keyflags(
struct xchk_btree *bs)
{
struct xfs_scrub *sc = bs->sc;
struct xfs_btree_cur *cur = bs->cur;
struct xfs_btree_block *keyblock;
union xfs_btree_key *lkey, *hkey;
__be64 badflag = cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
unsigned int level;
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_PREEN)
return;
for (level = 1; level < cur->bc_nlevels; level++) {
struct xfs_buf *bp;
unsigned int ptr;
/* Only check the first time we've seen this node block. */
if (cur->bc_levels[level].ptr > 1)
continue;
keyblock = xfs_btree_get_block(cur, level, &bp);
for (ptr = 1; ptr <= be16_to_cpu(keyblock->bb_numrecs); ptr++) {
lkey = xfs_btree_key_addr(cur, ptr, keyblock);
if (lkey->rmap.rm_offset & badflag) {
xchk_btree_set_preen(sc, cur, level);
break;
}
hkey = xfs_btree_high_key_addr(cur, ptr, keyblock);
if (hkey->rmap.rm_offset & badflag) {
xchk_btree_set_preen(sc, cur, level);
break;
}
}
}
}
/* Scrub an rmapbt record. */
STATIC int
xchk_rmapbt_rec(
@ -101,6 +153,7 @@ xchk_rmapbt_rec(
return 0;
}
xchk_rmapbt_check_unwritten_in_keyflags(bs);
xchk_rmapbt_xref(bs->sc, &irec);
return 0;
}